  # Challenge

  ## Identifying Outliers using Standard Deviation

In [2]:
# initial imports
import os
import random

import numpy as np
import pandas as pd
from sqlalchemy import create_engine



In [3]:
# Create a connection to the database.
# The connection URL is read from the DATABASE_URL environment variable so that
# real credentials never have to be committed with the notebook. The fallback is
# the local development database this notebook was originally written against.
DEFAULT_DB_URL = "postgresql://postgres:postgres@localhost:5432/credit_transactions_db"
engine = create_engine(os.environ.get("DATABASE_URL", DEFAULT_DB_URL))

# Get transaction info, together with the card and the customer it belongs to.
# Table names are double-quoted because they are mixed-case identifiers.
qry = """
    SELECT t.transaction_id, t.transaction_date, t.amount, ca.card_id, cu.customer_id, cu.full_name, t.merchant_id
    FROM "Transactions" as t
    JOIN "Cards" as ca ON ca.card_id = t.card_id
    JOIN "Customers" as cu ON ca.customer_id = cu.customer_id;

"""

# One row per transaction; every later cell works from this frame.
df = pd.read_sql(qry, engine)

In [4]:
# code a function to identify outliers based on standard deviation


# #To get the mean
def get_mean(df):
    """Return the mean of the ``amount`` column of ``df``."""
    amounts = df["amount"]
    return amounts.mean()

 #To get the st dev
def get_std(df):
    """Return the standard deviation of the ``amount`` column of ``df``.

    Uses the pandas default for ``Series.std`` (sample standard deviation).
    """
    amounts = df["amount"]
    return amounts.std()

# Get outliers that lie more than num_std standard deviations above the mean (num_std defaults to 1)
# Note - only the upper tail is checked: amounts must be positive and std > mean, so the lower cutoff (mean - std) is already negative

def get_outliers(df, num_std = 1):
    """Return the rows of ``df`` whose ``amount`` exceeds mean + num_std * std.

    Parameters
    ----------
    df : DataFrame with an ``amount`` column.
    num_std : how many standard deviations above the mean the cutoff sits
        (default 1); larger values flag fewer rows.

    Returns
    -------
    The subset of ``df`` with ``amount`` strictly greater than the cutoff.
    """
    cutoff = get_mean(df) + (get_std(df) * num_std)
    above_cutoff = df["amount"] > cutoff
    return df[above_cutoff]

# #Test
# #m = get_mean(df) -> 40.79
# #s = get_std(df)# -> 202.04

                                              
# Flag every transaction more than 2 standard deviations above the mean amount,
# then preview the first ten flagged rows.
df_ol = get_outliers(df, num_std=2)
df_ol.head(n=10)


Unnamed: 0,transaction_id,transaction_date,amount,card_id,customer_id,full_name,merchant_id
15,99,2018-01-02 23:27:46,1031.0,501879657465,12,Megan Price,95
27,2650,2018-01-04 03:05:18,1685.0,3516952396080247,7,Sean Taylor,80
62,1291,2018-01-08 02:34:32,1029.0,3581345943543942,6,Beth Hernandez,145
212,1442,2018-01-22 08:07:03,1131.0,5570600642865857,16,Crystal Clark,144
219,2667,2018-01-23 06:29:37,1678.0,501879657465,12,Megan Price,92
235,2913,2018-01-24 13:17:19,1691.0,4761049645711555811,1,Robert Johnson,14
296,1415,2018-01-30 18:31:00,1177.0,4319653513507,25,Nancy Contreras,64
379,2409,2018-02-09 11:38:37,445.0,3516952396080247,7,Sean Taylor,112
457,2699,2018-02-17 01:27:19,1430.0,5570600642865857,16,Crystal Clark,71
484,1480,2018-02-19 16:00:43,1072.0,3516952396080247,7,Sean Taylor,49


In [29]:
# find anomalous transactions for 3 random card holders

# Draw 3 distinct customer ids from the ids actually present in the data.
# replace=False guarantees there are no duplicates, so no retry logic is needed,
# and sampling from the real ids means every existing customer can be picked.
# (np.random.choice raises ValueError if the data holds fewer than 3 customers.)
customer_ids = df["customer_id"].unique()
rand_id1, rand_id2, rand_id3 = np.random.choice(customer_ids, size=3, replace=False)

print(rand_id1,rand_id2,rand_id3)

# All transactions that belong to the 3 selected customers.
df_trans = df[df["customer_id"].isin([rand_id1,rand_id2,rand_id3])]

# Inner-join against the outliers generated above: a transaction survives only
# if it belongs to one of the 3 customers AND was flagged as an outlier.
# Joining on transaction_id alone makes pandas suffix the remaining shared
# columns with _x (from df_trans) and _y (from df_ol).
df_anom = df_trans.merge(df_ol,how = "inner", on = "transaction_id" )
df_anom

8 16 3


Unnamed: 0,transaction_id,transaction_date_x,amount_x,card_id_x,customer_id_x,full_name_x,merchant_id_x,transaction_date_y,amount_y,card_id_y,customer_id_y,full_name_y,merchant_id_y
0,1442,2018-01-22 08:07:03,1131.0,5570600642865857,16,Crystal Clark,144,2018-01-22 08:07:03,1131.0,5570600642865857,16,Crystal Clark,144
1,2699,2018-02-17 01:27:19,1430.0,5570600642865857,16,Crystal Clark,71,2018-02-17 01:27:19,1430.0,5570600642865857,16,Crystal Clark,71
2,1005,2018-03-01 21:29:05,1119.0,30078299053512,3,Elizabeth Sawyer,19,2018-03-01 21:29:05,1119.0,30078299053512,3,Elizabeth Sawyer,19
3,2451,2018-03-05 08:26:08,1617.0,5570600642865857,16,Crystal Clark,4,2018-03-05 08:26:08,1617.0,5570600642865857,16,Crystal Clark,4
4,1757,2018-05-29 02:55:08,1203.0,5570600642865857,16,Crystal Clark,62,2018-05-29 02:55:08,1203.0,5570600642865857,16,Crystal Clark,62
5,1191,2018-06-17 15:59:45,1103.0,5570600642865857,16,Crystal Clark,23,2018-06-17 15:59:45,1103.0,5570600642865857,16,Crystal Clark,23
6,1334,2018-07-11 16:55:22,1159.0,30078299053512,3,Elizabeth Sawyer,107,2018-07-11 16:55:22,1159.0,30078299053512,3,Elizabeth Sawyer,107
7,1349,2018-07-14 06:09:18,1160.0,30078299053512,3,Elizabeth Sawyer,136,2018-07-14 06:09:18,1160.0,30078299053512,3,Elizabeth Sawyer,136
8,2508,2018-07-26 23:02:51,1803.0,5570600642865857,16,Crystal Clark,68,2018-07-26 23:02:51,1803.0,5570600642865857,16,Crystal Clark,68
9,16,2018-09-09 03:39:06,626.0,30078299053512,3,Elizabeth Sawyer,135,2018-09-09 03:39:06,626.0,30078299053512,3,Elizabeth Sawyer,135


  ## Identifying Outliers Using Interquartile Range

In [32]:
# code a function to identify outliers based on interquartile range

# Interquartile-range fences: a value counts as an outlier when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
q1 = df["amount"].quantile(0.25)
q3 = df["amount"].quantile(0.75)

iqr = q3 - q1
iqr_lb = q1 - (1.5 * iqr) #lower bound, measured from the first quartile
iqr_ub = q3 + (1.5 * iqr) #upper bound, measured from the third quartile

# Transaction amounts are very low, so the lower bound can come out negative;
# clamp it at zero.
if iqr_lb < 0:
    iqr_lb = 0

# df_iqr keeps the transactions INSIDE the fences (the non-outliers).
# It is built from the full transaction frame df so the cell runs on a fresh kernel.
df_iqr = df.query('@iqr_lb <= amount <= @iqr_ub')
df_iqr.head()


10.912500000000001 27.281250000000004 0


Unnamed: 0,transaction_id,transaction_date,amount,card_id,customer_id,full_name,merchant_id
0,222,2018-01-01 21:35:10,6.22,3561954487988605,13,John Martin,69
1,2045,2018-01-01 21:43:12,3.83,5135837688671496,13,John Martin,85
2,395,2018-01-01 22:41:21,9.61,213193946980303,10,Matthew Gutierrez,82
3,3309,2018-01-01 23:13:30,19.03,4263694062533017,4,Danielle Green,5
4,567,2018-01-01 23:15:10,2.95,4498002758300,18,Malik Carlson,64


In [36]:
# find anomalous transactions for 3 random card holders

# Draw 3 distinct customer ids from the ids actually present in the data.
# replace=False rules out duplicates; sampling from the real ids avoids drawing
# an id that does not exist. (Raises ValueError with fewer than 3 customers.)
customer_ids = df["customer_id"].unique()
rand_id1, rand_id2, rand_id3 = np.random.choice(customer_ids, size=3, replace=False)

# All transactions that belong to the 3 customers selected in THIS cell.
df_cust_iqr = df[df["customer_id"].isin([rand_id1,rand_id2,rand_id3])]

# Left-join against df_iqr (the transactions inside the IQR fences).
# indicator=True adds a _merge column that records, per row, whether a match was
# found in df_iqr; "left_only" therefore marks the transactions outside the fences.
# This is more reliable than looking for NaNs, which would also pick up rows
# whose source data genuinely contains a null.
merged_iqr = df_cust_iqr.merge(df_iqr, how = "left", on = "transaction_id", indicator = True)
outside_fences = merged_iqr["_merge"] == "left_only"

# Drop the helper column so the result keeps the plain _x/_y merged layout.
df_trans_iqr = merged_iqr.drop(columns = "_merge")

# Transactions of the 3 customers that fall outside the IQR fences (the outliers).
df_iqr_filtered = df_trans_iqr[outside_fences]
df_iqr_filtered

Unnamed: 0,transaction_id,transaction_date_x,amount_x,card_id_x,customer_id_x,full_name_x,merchant_id_x,transaction_date_y,amount_y,card_id_y,customer_id_y,full_name_y,merchant_id_y
386,2650,2018-01-04 03:05:18,1685.0,3516952396080247,7,Sean Taylor,80,NaT,,,,,
397,2409,2018-02-09 11:38:37,445.0,3516952396080247,7,Sean Taylor,112,NaT,,,,,
404,1480,2018-02-19 16:00:43,1072.0,3516952396080247,7,Sean Taylor,49,NaT,,,,,
415,408,2018-04-10 06:08:01,543.0,3516952396080247,7,Sean Taylor,63,NaT,,,,,
417,1827,2018-04-18 23:23:29,1086.0,3516952396080247,7,Sean Taylor,26,NaT,,,,,
435,3377,2018-05-19 19:33:38,160.0,3516952396080247,7,Sean Taylor,24,NaT,,,,,
462,721,2018-07-17 10:11:12,233.0,3516952396080247,7,Sean Taylor,73,NaT,,,,,
473,1424,2018-08-07 11:07:32,1449.0,3516952396080247,7,Sean Taylor,61,NaT,,,,,
513,2945,2018-12-13 15:51:59,2249.0,3516952396080247,7,Sean Taylor,83,NaT,,,,,
518,1318,2018-12-18 17:20:33,1296.0,3516952396080247,7,Sean Taylor,87,NaT,,,,,
