In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
# Load the dataset
data = pd.read_csv('data/final_merged_data.csv')

In [17]:
# Explore dataset
# Set the option to display all columns
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,User,Card,Month,Day,Time,Use Chip,Merchant City,MCC,Errors?,Is Fraud?,City,Zipcode,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Age Group,CARD INDEX,Card Brand,Card Type,Has Chip,Cards Issued,Credit Limit,Year PIN last Changed,Card on Dark Web
0,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,La Verne,91750,$29278,$59696,$127613,787,5,46-60,0,Visa,Debit,YES,2,$24295,2008,No
1,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,La Verne,91750,$29278,$59696,$127613,787,5,46-60,1,Visa,Debit,YES,2,$21968,2014,No
2,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,La Verne,91750,$29278,$59696,$127613,787,5,46-60,2,Visa,Debit,YES,2,$46414,2004,No
3,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,La Verne,91750,$29278,$59696,$127613,787,5,46-60,3,Visa,Credit,NO,1,$12400,2012,No
4,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,La Verne,91750,$29278,$59696,$127613,787,5,46-60,4,Mastercard,Debit (Prepaid),YES,1,$28,2009,No


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7271702 entries, 0 to 7271701
Data columns (total 26 columns):
 #   Column                       Dtype 
---  ------                       ----- 
 0   User                         int64 
 1   Card                         int64 
 2   Month                        int64 
 3   Day                          int64 
 4   Time                         object
 5   Use Chip                     object
 6   Merchant City                object
 7   MCC                          int64 
 8   Errors?                      object
 9   Is Fraud?                    object
 10  City                         object
 11  Zipcode                      int64 
 12  Per Capita Income - Zipcode  object
 13  Yearly Income - Person       object
 14  Total Debt                   object
 15  FICO Score                   int64 
 16  Num Credit Cards             int64 
 17  Age Group                    object
 18  CARD INDEX                   int64 
 19  Card Brand           

In [19]:
data.describe()

Unnamed: 0,User,Card,Month,Day,MCC,Zipcode,FICO Score,Num Credit Cards,CARD INDEX,Cards Issued,Year PIN last Changed
count,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0,7271702.0
mean,1002.786,1.635586,5.748276,15.63978,5568.071,51148.25,719.6575,4.27336,1.63668,1.501186,2012.59
std,566.8522,1.522303,3.662047,8.762368,885.4286,29592.88,62.67611,1.590418,1.5105,0.5177678,3.765733
min,0.0,0.0,1.0,1.0,1711.0,1060.0,480.0,1.0,0.0,1.0,2002.0
25%,516.0,0.0,2.0,8.0,5300.0,28312.0,688.0,3.0,0.0,1.0,2010.0
50%,1006.0,1.0,5.0,16.0,5499.0,47446.0,720.0,4.0,1.0,1.0,2012.0
75%,1478.0,3.0,9.0,23.0,5813.0,77856.0,761.0,5.0,3.0,2.0,2015.0
max,1999.0,8.0,12.0,31.0,9402.0,99508.0,850.0,9.0,8.0,3.0,2020.0


In [20]:
data.isnull().sum()

User                                 0
Card                                 0
Month                                0
Day                                  0
Time                                 0
Use Chip                             0
Merchant City                        0
MCC                                  0
Errors?                        7156230
Is Fraud?                            0
City                                 0
Zipcode                              0
Per Capita Income - Zipcode          0
Yearly Income - Person               0
Total Debt                           0
FICO Score                           0
Num Credit Cards                     0
Age Group                         7304
CARD INDEX                           0
Card Brand                           0
Card Type                            0
Has Chip                             0
Cards Issued                         0
Credit Limit                         0
Year PIN last Changed                0
Card on Dark Web         

In [31]:
age_group_summary = data['Age Group'].value_counts()
age_group_summary

Over 60    2531419
46-60      2464625
36-45      1343411
26-35       767961
18-25       156982
Name: Age Group, dtype: int64

In [22]:
age_group_summary.info()

<class 'pandas.core.series.Series'>
Index: 5 entries, Over 60 to 18-25
Series name: Age Group
Non-Null Count  Dtype
--------------  -----
5 non-null      int64
dtypes: int64(1)
memory usage: 80.0+ bytes


In [23]:
age_group_summary.shape

(5,)

In [32]:
errors_summary = data['Errors?'].value_counts()
errors_summary

Insufficient Balance                     71340
Bad PIN                                  17741
Technical Glitch                         14383
Bad Card Number                           4306
Bad Expiration                            3295
Bad CVV                                   3280
Bad Zipcode                                545
Bad PIN,Insufficient Balance               218
Insufficient Balance,Technical Glitch      122
Bad PIN,Technical Glitch                    56
Bad Card Number,Insufficient Balance        44
Bad Card Number,Bad CVV                     40
Bad CVV,Insufficient Balance                24
Bad Expiration,Insufficient Balance         22
Bad Expiration,Bad CVV                      20
Bad Expiration,Technical Glitch             11
Bad Zipcode,Insufficient Balance             7
Bad Card Number,Technical Glitch             5
Bad Card Number,Bad Expiration               5
Bad Zipcode,Technical Glitch                 4
Bad CVV,Technical Glitch                     4
Name: Errors?

In [25]:
data.nunique()

User                           1948
Card                              9
Month                            12
Day                              31
Time                           1440
Use Chip                          3
Merchant City                  8581
MCC                             109
Errors?                          21
Is Fraud?                         2
City                           1258
Zipcode                        1769
Per Capita Income - Zipcode    1714
Yearly Income - Person         1899
Total Debt                     1829
FICO Score                      316
Num Credit Cards                  9
Age Group                         5
CARD INDEX                        9
Card Brand                        4
Card Type                         3
Has Chip                          2
Cards Issued                      3
Credit Limit                   3626
Year PIN last Changed            19
Card on Dark Web                  1
dtype: int64

In [30]:
data.shape

(7271702, 26)

In [26]:
# Handle missing values 
data_dropna = data.dropna()

In [27]:
data_dropna.nunique()

User                           1844
Card                              9
Month                            12
Day                              31
Time                           1439
Use Chip                          3
Merchant City                  3172
MCC                             100
Errors?                          21
Is Fraud?                         2
City                           1214
Zipcode                        1687
Per Capita Income - Zipcode    1637
Yearly Income - Person         1802
Total Debt                     1732
FICO Score                      312
Num Credit Cards                  9
Age Group                         5
CARD INDEX                        9
Card Brand                        4
Card Type                         3
Has Chip                          2
Cards Issued                      3
Credit Limit                   3559
Year PIN last Changed            19
Card on Dark Web                  1
dtype: int64

In [29]:
data_dropna.shape

(115108, 26)

In [34]:
# Define features and target variable
features = ['User', 'Card', 'Month', 'Day', 'Time', 'Use Chip', 'Merchant City', 'MCC', 
            'Errors?', 'City', 'Zipcode', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 
            'Total Debt', 'FICO Score', 'Num Credit Cards', 'Age Group', 'CARD INDEX', 'Card Brand', 
            'Card Type', 'Has Chip', 'Cards Issued', 'Credit Limit', 'Year PIN last Changed', 
            'Card on Dark Web']
target = 'Is Fraud?'

X = data[features]
y = data[target]

In [39]:
# Clean numerical columns by removing '$' and ',' from the 'Per Capita Income - Zipcode' and 'Yearly Income - Person' columns and convert them to float
X.loc[:, 'Per Capita Income - Zipcode'] = X['Per Capita Income - Zipcode'].replace('[\$,]', '', regex=True).astype(float)
X.loc[:, 'Yearly Income - Person'] = X['Yearly Income - Person'].replace('[\$,]', '', regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Per Capita Income - Zipcode'] = X['Per Capita Income - Zipcode'].replace('[\$,]', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Yearly Income - Person'] = X['Yearly Income - Person'].replace('[\$,]', '', regex=True).astype(float)


In [38]:
# Clean numerical columns by removing '$' and ',' from the 'Per Capita Income - Zipcode' and 'Yearly Income - Person' columns and convert them to float
X['Per Capita Income - Zipcode'] = X['Per Capita Income - Zipcode'].replace('[\$,]', '', regex=True).astype(float)
X['Yearly Income - Person'] = X['Yearly Income - Person'].replace('[\$,]', '', regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Per Capita Income - Zipcode'] = X['Per Capita Income - Zipcode'].replace('[\$,]', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Yearly Income - Person'] = X['Yearly Income - Person'].replace('[\$,]', '', regex=True).astype(float)


In [40]:
# Define categorical and numerical columns
categorical_cols = ['User', 'Card', 'Month', 'Day', 'Time', 'Use Chip', 'Merchant City', 
                    'Errors?', 'City', 'Age Group', 'Card Brand', 'Card Type', 'Has Chip', 
                    'Card on Dark Web']
numerical_cols = ['MCC', 'Zipcode', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 
                  'Total Debt', 'FICO Score', 'Num Credit Cards', 'CARD INDEX', 'Cards Issued', 
                  'Credit Limit', 'Year PIN last Changed']

In [41]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [42]:
# Define the model - Random Forest Classifier 
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, max_features='auto', max_depth=10, criterion='gini', random_state=42))
])

In [None]:
# Define the model - Logistic Regression 
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

In [43]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

ValueError: could not convert string to float: '$36199'