In [1]:
### Fraud Detection Model

### this system will be built based on:

#### 1. Transaction-level features → amt, category, trans_hour

#### 2. Geo features → distance_from_home, distance_from_last_transaction

#### 3. Behavioral features → transaction_velocity, amt_deviation, time_since_last_transaction

#### 4. Time flags → is_night, is_weekend

In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from geopy.distance import geodesic

In [3]:
#import the dataset again

merchant_df = pd.read_csv('fraudTest.csv')

In [4]:
merchant_df.head(2)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0


In [5]:
merchant_df.drop("Unnamed: 0", inplace=True, axis=1)

In [6]:
merchant_df.head(2)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0


In [7]:
#convert trans_date_trans_time to datetime format from object data types

merchant_df['trans_date_trans_time'] = pd.to_datetime(merchant_df['trans_date_trans_time'])

In [8]:
# Extracting year, month, and day from trans_date_trans_time

merchant_df['trans_year'] = merchant_df['trans_date_trans_time'].dt.year
merchant_df['trans_month'] = merchant_df['trans_date_trans_time'].dt.month
merchant_df['trans_day'] = merchant_df['trans_date_trans_time'].dt.weekday
merchant_df['trans_hour'] = merchant_df['trans_date_trans_time'].dt.hour

In [9]:
merchant_df['trans_hour'].unique()

array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,
        5,  6,  7,  8,  9, 10, 11])

In [10]:
merchant_df['trans_day'].unique()

array([6, 0, 1, 2, 3, 4, 5])

In [11]:
# Time-based features
#saturday and sunday = 0 and other day = 1
# between 12 mid-night to 5am = 1 and other hours = 0

merchant_df['is_night'] = merchant_df['trans_hour'].apply(lambda x: 1 if (x >= 0 and x <= 5) else 0)
merchant_df['is_weekend'] = merchant_df['trans_day'].apply(lambda x: 1 if x >= 5 else 0)

In [24]:
merchant_df.head(10)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,is_fraud,trans_year,trans_month,trans_day,trans_hour,is_night,is_weekend,distance_km,prev_trans_time,time_diff_sec
157,2020-06-21 13:05:42,60416207185,fraud_Kutch-Ferry,home,124.66,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,6,13,0,1,30.533617,NaT,
741,2020-06-21 16:25:36,60416207185,fraud_Halvorson Group,misc_pos,78.52,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,6,16,0,1,91.864216,2020-06-21 13:05:42,11994.0
3047,2020-06-22 07:58:33,60416207185,fraud_Conroy-Cruickshank,gas_transport,65.25,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,0,7,0,0,121.877934,2020-06-21 16:25:36,55977.0
4351,2020-06-22 15:32:31,60416207185,fraud_Larkin Ltd,kids_pets,87.74,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,0,15,0,0,65.393092,2020-06-22 07:58:33,27238.0
7695,2020-06-23 12:28:54,60416207185,fraud_Leffler-Goldner,personal_care,148.02,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,1,12,0,0,38.39478,2020-06-22 15:32:31,75383.0
8059,2020-06-23 14:24:48,60416207185,"fraud_Kihn, Abernathy and Douglas",shopping_net,2.83,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,1,14,0,0,62.145982,2020-06-23 12:28:54,6954.0
8518,2020-06-23 16:39:40,60416207185,"fraud_Altenwerth, Cartwright and Koss",shopping_net,7.26,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,1,16,0,0,103.541507,2020-06-23 14:24:48,8092.0
9059,2020-06-23 19:07:05,60416207185,fraud_Cartwright PLC,kids_pets,6.17,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,1,19,0,0,81.54499,2020-06-23 16:39:40,8845.0
9771,2020-06-23 22:45:57,60416207185,"fraud_Ritchie, Oberbrunner and Cremin",travel,5.39,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,1,22,0,0,84.558364,2020-06-23 19:07:05,13132.0
10266,2020-06-24 04:22:17,60416207185,fraud_Koss and Sons,gas_transport,46.95,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,2020,6,2,4,1,0,116.453523,2020-06-23 22:45:57,20180.0


In [13]:
#Transaction-level features → amt, category, trans_hour

merchant_df['category'].unique()

array(['personal_care', 'health_fitness', 'misc_pos', 'travel',
       'kids_pets', 'shopping_pos', 'food_dining', 'home',
       'entertainment', 'shopping_net', 'misc_net', 'grocery_pos',
       'gas_transport', 'grocery_net'], dtype=object)

In [27]:
# First, ensure the datetime column is in datetime format
merchant_df['trans_date_trans_time'] = pd.to_datetime(merchant_df['trans_date_trans_time'])

# Sort values so rolling works correctly
merchant_df = merchant_df.sort_values(['cc_num', 'trans_date_trans_time'])

# Calculate rolling counts per card
merchant_df['velocity_1hr'] = (
    merchant_df
    .groupby('cc_num')
    .rolling('1H', on='trans_date_trans_time')
    .trans_date_trans_time.count()
    .reset_index(drop=True)
)


In [28]:
merchant_df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,trans_year,trans_month,trans_day,trans_hour,is_night,is_weekend,distance_km,prev_trans_time,time_diff_sec,velocity_1hr
157,2020-06-21 13:05:42,60416207185,fraud_Kutch-Ferry,home,124.66,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,2020,6,6,13,0,1,30.533617,NaT,,1.0
741,2020-06-21 16:25:36,60416207185,fraud_Halvorson Group,misc_pos,78.52,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,2020,6,6,16,0,1,91.864216,2020-06-21 13:05:42,11994.0,1.0
3047,2020-06-22 07:58:33,60416207185,fraud_Conroy-Cruickshank,gas_transport,65.25,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,2020,6,0,7,0,0,121.877934,2020-06-21 16:25:36,55977.0,1.0
4351,2020-06-22 15:32:31,60416207185,fraud_Larkin Ltd,kids_pets,87.74,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,2020,6,0,15,0,0,65.393092,2020-06-22 07:58:33,27238.0,1.0
7695,2020-06-23 12:28:54,60416207185,fraud_Leffler-Goldner,personal_care,148.02,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,2020,6,1,12,0,0,38.39478,2020-06-22 15:32:31,75383.0,1.0


In [14]:
# Geo-distance between customer and merchant

merchant_df['distance_km'] = merchant_df.apply(lambda row: geodesic(
    (row['lat'], row['long']),
    (row['merch_lat'], row['merch_long'])
).km, axis=1)

In [17]:
merchant_df.head(2)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,merch_lat,merch_long,is_fraud,trans_year,trans_month,trans_day,trans_hour,is_night,is_weekend,distance_km
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.986391,-81.200714,0,2020,6,6,12,0,1,24.613746
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,39.450498,-109.960431,0,2020,6,6,12,0,1,104.834043


In [29]:
merchant_df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'trans_year', 'trans_month', 'trans_day',
       'trans_hour', 'is_night', 'is_weekend', 'distance_km',
       'prev_trans_time', 'time_diff_sec', 'velocity_1hr'],
      dtype='object')

In [30]:
# Distance from last transaction

merchant_df = merchant_df.sort_values(['cc_num', 'trans_date_trans_time'])

merchant_df['prev_merch_lat'] = merchant_df.groupby('cc_num')['merch_lat'].shift()
merchant_df['prev_merch_long'] = merchant_df.groupby('cc_num')['merch_long'].shift()

merchant_df['distance_from_last_transaction'] = merchant_df.apply(
    lambda row: geodesic(
        (row['prev_merch_lat'], row['prev_merch_long']),
        (row['merch_lat'], row['merch_long'])
    ).km if pd.notnull(row['prev_merch_lat']) else 0,  # 0 for first transaction
    axis=1
)

In [35]:
merchant_df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'trans_year', 'trans_month', 'trans_day',
       'trans_hour', 'is_night', 'is_weekend', 'distance_km',
       'prev_trans_time', 'time_diff_sec', 'velocity_1hr', 'prev_merch_lat',
       'prev_merch_long', 'distance_from_last_transaction', 'amt_deviation',
       'time_since_last_transaction'],
      dtype='object')

In [39]:
#Based on the criteria for this fraud detection system, now extracting the column to use

fraud_detection_df = merchant_df[['category', 'amt', 'trans_hour', 'distance_km', 'distance_from_last_transaction', 'amt_deviation',
       'time_since_last_transaction', 'velocity_1hr', 'is_night', 'is_weekend', 'is_fraud']]

In [32]:
merchant_df['amt_deviation'] = merchant_df.groupby('cc_num')['amt'].transform(
    lambda x: abs(x - x.mean())
)


In [34]:
merchant_df['time_since_last_transaction'] = merchant_df['time_diff_sec'] / 60  # minutes


In [36]:
merchant_df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,is_weekend,distance_km,prev_trans_time,time_diff_sec,velocity_1hr,prev_merch_lat,prev_merch_long,distance_from_last_transaction,amt_deviation,time_since_last_transaction
157,2020-06-21 13:05:42,60416207185,fraud_Kutch-Ferry,home,124.66,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,1,30.533617,NaT,,1.0,,,0.0,58.160516,
741,2020-06-21 16:25:36,60416207185,fraud_Halvorson Group,misc_pos,78.52,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,1,91.864216,2020-06-21 13:05:42,11994.0,1.0,42.945526,-108.530901,84.495812,12.020516,199.9
3047,2020-06-22 07:58:33,60416207185,fraud_Conroy-Cruickshank,gas_transport,65.25,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,121.877934,2020-06-21 16:25:36,55977.0,1.0,42.19313,-108.682054,210.284759,1.249484,932.95
4351,2020-06-22 15:32:31,60416207185,fraud_Larkin Ltd,kids_pets,87.74,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,65.393092,2020-06-22 07:58:33,27238.0,1.0,43.932724,-109.699794,58.169994,21.240516,453.966667
7695,2020-06-23 12:28:54,60416207185,fraud_Leffler-Goldner,personal_care,148.02,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,38.39478,2020-06-22 15:32:31,75383.0,1.0,43.546064,-109.212939,75.021641,81.520516,1256.383333


In [40]:
fraud_detection_df

Unnamed: 0,category,amt,trans_hour,distance_km,distance_from_last_transaction,amt_deviation,time_since_last_transaction,velocity_1hr,is_night,is_weekend,is_fraud
157,home,124.66,13,30.533617,0.000000,58.160516,,1.0,0,1,0
741,misc_pos,78.52,16,91.864216,84.495812,12.020516,199.900000,1.0,0,1,0
3047,gas_transport,65.25,7,121.877934,210.284759,1.249484,932.950000,1.0,0,0,0
4351,kids_pets,87.74,15,65.393092,58.169994,21.240516,453.966667,1.0,0,0,0
7695,personal_care,148.02,12,38.394780,75.021641,81.520516,1256.383333,1.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
552584,gas_transport,66.11,2,44.930513,110.438905,2.965041,134.616667,1.0,1,0,0
552892,misc_net,4.58,5,81.527378,95.794229,58.564959,183.416667,1.0,1,0,0
553559,gas_transport,95.96,11,36.017775,73.386959,32.815041,362.150000,1.0,0,0,0
553560,grocery_pos,149.48,11,81.940542,109.396278,86.335041,1.683333,1.0,0,0,0


In [42]:
fraud_detection_df.to_csv('fraud_dataset.csv', index=False)