In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
fraud_df = pd.read_csv('fraud_dataset.csv')

In [4]:
fraud_df.head()

Unnamed: 0,category,amt,trans_hour,distance_km,distance_from_last_transaction,amt_deviation,time_since_last_transaction,velocity_1hr,is_night,is_weekend,is_fraud
0,home,124.66,13,30.533617,0.0,58.160516,,1.0,0,1,0
1,misc_pos,78.52,16,91.864216,84.495812,12.020516,199.9,1.0,0,1,0
2,gas_transport,65.25,7,121.877934,210.284759,1.249484,932.95,1.0,0,0,0
3,kids_pets,87.74,15,65.393092,58.169994,21.240516,453.966667,1.0,0,0,0
4,personal_care,148.02,12,38.39478,75.021641,81.520516,1256.383333,1.0,0,0,0


In [6]:
#data cleaning and Preprocessing

fraud_df.isna().sum()

category                            0
amt                                 0
trans_hour                          0
distance_km                         0
distance_from_last_transaction      0
amt_deviation                       0
time_since_last_transaction       924
velocity_1hr                        0
is_night                            0
is_weekend                          0
is_fraud                            0
dtype: int64

In [7]:
fraud_df.dropna(inplace=True)

In [9]:
fraud_df.duplicated().sum()

0

In [12]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 554795 entries, 1 to 555718
Data columns (total 11 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   category                        554795 non-null  object 
 1   amt                             554795 non-null  float64
 2   trans_hour                      554795 non-null  int64  
 3   distance_km                     554795 non-null  float64
 4   distance_from_last_transaction  554795 non-null  float64
 5   amt_deviation                   554795 non-null  float64
 6   time_since_last_transaction     554795 non-null  float64
 7   velocity_1hr                    554795 non-null  float64
 8   is_night                        554795 non-null  int64  
 9   is_weekend                      554795 non-null  int64  
 10  is_fraud                        554795 non-null  int64  
dtypes: float64(6), int64(4), object(1)
memory usage: 50.8+ MB


In [14]:
#The Process before creating the model

#category = object dtype must be converted into one hot encoding
# apply data normalization by using standardScaler
# apply selection the data into independent and dependent columns
# apply train test split
# apply 3 algorithms on them

In [28]:
fraud_df['category'].unique()

array(['misc_pos', 'gas_transport', 'kids_pets', 'personal_care',
       'shopping_net', 'travel', 'home', 'grocery_pos', 'entertainment',
       'shopping_pos', 'grocery_net', 'misc_net', 'food_dining',
       'health_fitness'], dtype=object)

In [25]:
category_columns.columns

Index(['food_dining', 'gas_transport', 'grocery_net', 'grocery_pos',
       'health_fitness', 'home', 'kids_pets', 'misc_net', 'misc_pos',
       'personal_care', 'shopping_net', 'shopping_pos', 'travel'],
      dtype='object')

In [17]:
fraud_df['category'].nunique()

14

In [20]:
category_columns = pd.get_dummies(fraud_df['category'], drop_first=True, dtype=int)

In [21]:
final_fraud_df = pd.concat([fraud_df, category_columns], axis=1)

In [22]:
final_fraud_df

Unnamed: 0,category,amt,trans_hour,distance_km,distance_from_last_transaction,amt_deviation,time_since_last_transaction,velocity_1hr,is_night,is_weekend,...,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel
1,misc_pos,78.52,16,91.864216,84.495812,12.020516,199.900000,1.0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,gas_transport,65.25,7,121.877934,210.284759,1.249484,932.950000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,kids_pets,87.74,15,65.393092,58.169994,21.240516,453.966667,1.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,personal_care,148.02,12,38.394780,75.021641,81.520516,1256.383333,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,shopping_net,2.83,14,62.145982,97.063040,63.669484,115.900000,1.0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,gas_transport,66.11,2,44.930513,110.438905,2.965041,134.616667,1.0,1,0,...,0,0,0,0,0,0,0,0,0,0
555715,misc_net,4.58,5,81.527378,95.794229,58.564959,183.416667,1.0,1,0,...,0,0,0,0,1,0,0,0,0,0
555716,gas_transport,95.96,11,36.017775,73.386959,32.815041,362.150000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
555717,grocery_pos,149.48,11,81.940542,109.396278,86.335041,1.683333,1.0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [23]:
X_fraud = final_fraud_df.drop(['category', 'is_fraud'], axis=1)
y_fraud = final_fraud_df.is_fraud

In [24]:
X_fraud

Unnamed: 0,amt,trans_hour,distance_km,distance_from_last_transaction,amt_deviation,time_since_last_transaction,velocity_1hr,is_night,is_weekend,food_dining,...,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel
1,78.52,16,91.864216,84.495812,12.020516,199.900000,1.0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,65.25,7,121.877934,210.284759,1.249484,932.950000,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,87.74,15,65.393092,58.169994,21.240516,453.966667,1.0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,148.02,12,38.394780,75.021641,81.520516,1256.383333,1.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,2.83,14,62.145982,97.063040,63.669484,115.900000,1.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,66.11,2,44.930513,110.438905,2.965041,134.616667,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
555715,4.58,5,81.527378,95.794229,58.564959,183.416667,1.0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
555716,95.96,11,36.017775,73.386959,32.815041,362.150000,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
555717,149.48,11,81.940542,109.396278,86.335041,1.683333,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [30]:
# applying data normalization

scaler = StandardScaler()

X_fraud_scaled = scaler.fit_transform(X_fraud)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_fraud_scaled, y_fraud, test_size=0.2, random_state=42)

In [32]:
#First Model

model_logistic = LogisticRegression()

In [33]:
model_logistic.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [34]:
model_logistic.score(X_train, y_train)

0.9961855279878153

In [35]:
model_logistic.score(X_test, y_test)

0.99606160834182

In [None]:
#here the training score = 99.61% while the test score is 99.60%. which is the same.
#This is as a system of the is_fraud class A vs class B
# I will check it out after the second model

In [36]:
#Second Model

model_xgb = XGBClassifier()

In [37]:
model_xgb.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [38]:
model_xgb.score(X_train, y_train)

0.9992339512793014

In [39]:
model_xgb.score(X_test, y_test)

0.9980713596914176

In [40]:
#same as the first model

In [41]:
model_random = RandomForestClassifier()

In [42]:
model_random.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
model_random.score(X_train, y_train)

0.9999977469155273

In [44]:
model_random.score(X_test, y_test)

0.9983056804765724

In [None]:
#same as model 1 and 2

In [45]:
#checking the class A vs class B in is_fraud columns 
# Fraud Detection System Part 3

In [69]:
final_fraud_df.shape

(554795, 24)

In [70]:

final_fraud_df.to_csv('fraud_dataset1.csv', index=False)