## Import Libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:

# Load the labelled transactions used for training.
# NOTE(review): absolute path (looks like a Colab location) — adjust when running elsewhere.
train_df = pd.read_csv('/content/fraudTrain.csv')

# Preview the first five rows to check column names and value formats.
print(train_df.head())


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [None]:
# Count missing values per column to see whether any imputation is needed.
train_df.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [None]:
def data_pre(X):
    """Convert raw transaction rows into the numeric feature table used for modelling.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw transactions. It must contain every column that is dropped or read
        below (identifier columns, ``trans_date_trans_time``, ``dob``,
        ``gender``, ``lat``/``long``, ``merch_lat``/``merch_long``,
        ``category``, ``city``, ``state``, ``job``); a missing column raises
        ``KeyError``. Columns not mentioned here pass through unchanged.

    Returns
    -------
    pandas.DataFrame
        The pass-through columns plus ``gender`` (1 for ``'M'``, 0 otherwise),
        ``age``, ``trans_month``, ``trans_year``, ``lat_dis``, ``long_dis`` and
        one indicator column per ``category`` value.
    """
    # Identifier-like columns that are removed before any feature is derived.
    del_col = ['merchant', 'first', 'last', 'street', 'zip', 'unix_time',
               'Unnamed: 0', 'trans_num', 'cc_num']
    # drop() without inplace returns a new frame, so the caller's data is never
    # mutated and callers no longer need to pass a defensive copy.
    X = X.drop(columns=del_col)

    X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
    # Truncate to midnight directly instead of formatting to a string and
    # parsing it back.
    X['trans_date'] = X['trans_date_trans_time'].dt.normalize()
    X['dob'] = pd.to_datetime(X['dob'])

    # Age in whole years at transaction time. Dividing by 365 ignores leap
    # days, so the value is an approximation near birthdays.
    X['age'] = (X['trans_date'] - X['dob']).dt.days // 365

    X['trans_month'] = X['trans_date'].dt.month
    X['trans_year'] = X['trans_date'].dt.year

    # Vectorised encoding: 1 for 'M', 0 for every other value.
    X['gender'] = (X['gender'] == 'M').astype(int)

    # Absolute coordinate differences between card holder and merchant.
    X['lat_dis'] = (X['lat'] - X['merch_lat']).abs()
    X['long_dis'] = (X['long'] - X['merch_long']).abs()

    X = pd.get_dummies(X, columns=['category'])

    # Remove the raw columns that were only needed to derive the features above.
    X = X.drop(columns=['city', 'trans_date_trans_time', 'state', 'job',
                        'merch_lat', 'merch_long', 'lat', 'long', 'dob',
                        'trans_date'])
    return X


In [None]:
# Build the model-ready feature table. A copy is passed in so the raw
# `train_df` stays available in its original form for later cells.
train_df_pre=data_pre(train_df.copy())
train_df_pre.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,age,trans_month,trans_year,lat_dis,long_dis,category_entertainment,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,3495,0,30,1,2019,0.067507,0.870215,0,...,0,0,0,0,1,0,0,0,0,0
1,107.23,0,149,0,40,1,2019,0.271247,0.024038,0,...,1,0,0,0,0,0,0,0,0,0
2,220.11,1,4154,0,56,1,2019,0.969904,0.107519,1,...,0,0,0,0,0,0,0,0,0,0
3,45.0,1,1939,0,52,1,2019,0.803731,0.447271,0,...,0,0,0,0,0,0,0,0,0,0
4,41.96,1,99,0,32,1,2019,0.254299,0.830441,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Split the *raw* frame into features and label.
# NOTE(review): both names are reassigned from the preprocessed frame in the
# following cell, so these raw versions are never used for modelling.
X = train_df.drop('is_fraud', axis=1)
y = train_df['is_fraud']

In [None]:
# NOTE: despite its name, `test_df_pre` holds the preprocessed *training*
# data; the evaluation rows are carved out of it later by train_test_split.
# `train_df_pre` was already produced from the same `train_df`, so it is
# reused here instead of running the preprocessing a second time. The copy
# keeps the two frames independent of each other.
test_df_pre = train_df_pre.copy()

# Separate the label from the feature columns.
X = test_df_pre.drop('is_fraud', axis=1)
y = test_df_pre['is_fraud']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# X_train / X_test were already standardised by `scaler` in the previous cell.
# Fitting a second StandardScaler on them would learn mean ~0 / std ~1 from the
# already-scaled data and rebind `scaler` to that near-identity transform,
# which would then be wrong for any raw feature matrix. The fitted scaler is
# therefore kept as is, and the scaled splits are exposed under the lowercase
# names as independent copies.
x_train = X_train.copy()
x_test = X_test.copy()

In [None]:
# Baseline linear model trained on the standardised features.
logistic_regression = LogisticRegression().fit(X_train, y_train)

# Score the held-out split.
# NOTE(review): accuracy alone is misleading on this data — the classification
# report recorded further down shows 0.00 precision/recall for class 1.
y_pred_logistic = logistic_regression.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_logistic

0.9936067248925136

In [None]:
# Single decision tree on the standardised features.
# random_state is fixed (same seed as the split and the random forest) because
# the tree builder breaks ties between equally good splits at random, so an
# unseeded fit can give different trees and scores from run to run.
DecisionTree = DecisionTreeClassifier(random_state=42)
DecisionTree.fit(X_train, y_train)

# Score the held-out split.
y_pred_dt = DecisionTree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_dt

0.9961594077158887

In [None]:
# Ensemble of 100 trees with a fixed seed so the fit is repeatable.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the held-out split.
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_rf

0.9974897333564694

In [None]:
# Per-class precision / recall / F1 for each model, printed in training order.
reports_to_print = (
    ("\nClassification Report for Logistic Regression:\n", y_pred_logistic),
    ("\nClassification Report for Decision Tree:\n", y_pred_dt),
    ("\nClassification Report for Random Forest:\n", y_pred_rf),
)
for report_header, model_predictions in reports_to_print:
    print(report_header, classification_report(y_test, model_predictions))


Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    257815
           1       0.00      0.00      0.00      1520

    accuracy                           0.99    259335
   macro avg       0.50      0.50      0.50    259335
weighted avg       0.99      0.99      0.99    259335


Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.67      0.68      0.68      1520

    accuracy                           1.00    259335
   macro avg       0.83      0.84      0.84    259335
weighted avg       1.00      1.00      1.00    259335


Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.87      0.67      0.76      1520

    accuracy                           1.00    