In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
train = pd.read_csv('/content/fraudTrain.csv')
test = pd.read_csv('/content/fraudTest.csv')

In [3]:
print(train.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [4]:
train['trans_date_trans_time'] = pd.to_datetime(train['trans_date_trans_time'])
train['dob'] = pd.to_datetime(train['dob'])
test['trans_date_trans_time'] = pd.to_datetime(test['trans_date_trans_time'])
test['dob'] = pd.to_datetime(test['dob'])

In [5]:
train['transaction_hour'] = train['trans_date_trans_time'].dt.hour
train['transaction_day'] = train['trans_date_trans_time'].dt.day
train['age'] = (train['trans_date_trans_time'] - train['dob']).dt.days // 365

In [6]:
test['transaction_hour'] = test['trans_date_trans_time'].dt.hour
test['transaction_day'] = test['trans_date_trans_time'].dt.day
test['age'] = (test['trans_date_trans_time'] - test['dob']).dt.days // 365

In [7]:
cols_to_drop = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'lat', 'long', 'dob', 'trans_num']
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [8]:
train = pd.get_dummies(train, columns=['merchant', 'category', 'gender', 'job'], drop_first=True)
test = pd.get_dummies(test, columns=['merchant', 'category', 'gender', 'job'], drop_first=True)

In [9]:
train, test = train.align(test, join='left', axis=1)

In [10]:
test = test.fillna(0)

In [11]:
X = train.drop(columns=['is_fraud'])
y = train['is_fraud']

In [12]:
print("Number of missing values in 'is_fraud':", y.isna().sum())
X = X[y.notna()]
y = y[y.notna()]


Number of missing values in 'is_fraud': 1


In [13]:
y = y.fillna(0)

In [14]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [16]:
scaler = StandardScaler()
num_cols = ['amt', 'city_pop', 'merch_lat', 'merch_long', 'age']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [17]:
logistic_model = LogisticRegression(random_state=42)
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)

In [19]:
logistic_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

In [20]:
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("Logistic Regression:")
evaluate_model(logistic_model, X_val, y_val)

print("Decision Tree:")
evaluate_model(decision_tree_model, X_val, y_val)

print("Random Forest:")
evaluate_model(random_forest_model, X_val, y_val)

Logistic Regression:
[[10477 10328]
 [10407 10382]]
              precision    recall  f1-score   support

         0.0       0.50      0.50      0.50     20805
         1.0       0.50      0.50      0.50     20789

    accuracy                           0.50     41594
   macro avg       0.50      0.50      0.50     41594
weighted avg       0.50      0.50      0.50     41594

Accuracy: 0.5015
Decision Tree:
[[20588   217]
 [   38 20751]]
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     20805
         1.0       0.99      1.00      0.99     20789

    accuracy                           0.99     41594
   macro avg       0.99      0.99      0.99     41594
weighted avg       0.99      0.99      0.99     41594

Accuracy: 0.9939
Random Forest:
[[20803     2]
 [    8 20781]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     20805
         1.0       1.00      1.00      1.00     20789

    

In [21]:
X_test = test.drop(columns=['is_fraud'])
y_test = test['is_fraud']
print("Test Set Evaluation:")

for model, name in zip([logistic_model, decision_tree_model, random_forest_model],
                       ['Logistic Regression', 'Decision Tree', 'Random Forest']):
    print(f"\n{name}:")
    evaluate_model(model, X_test, y_test)

Test Set Evaluation:

Logistic Regression:
[[54710 53701]
 [  215   210]]
              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67    108411
         1.0       0.00      0.49      0.01       425

    accuracy                           0.50    108836
   macro avg       0.50      0.50      0.34    108836
weighted avg       0.99      0.50      0.67    108836

Accuracy: 0.5046

Decision Tree:
[[105621   2790]
 [    99    326]]
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99    108411
         1.0       0.10      0.77      0.18       425

    accuracy                           0.97    108836
   macro avg       0.55      0.87      0.59    108836
weighted avg       1.00      0.97      0.98    108836

Accuracy: 0.9735

Random Forest:
[[108408      3]
 [   316    109]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    108411
         1.0       0.97    

In [28]:
print(X_train.isnull().sum())
print(X_train.dtypes)


Unnamed: 0                     0
amt                            0
city_pop                       0
unix_time                      0
merch_lat                      0
                              ..
job_Water engineer             0
job_Water quality scientist    0
job_Web designer               0
job_Wellsite geologist         0
job_Writer                     0
Length: 1193, dtype: int64
Unnamed: 0                       int64
amt                            float64
city_pop                       float64
unix_time                      float64
merch_lat                      float64
                                ...   
job_Water engineer                bool
job_Water quality scientist       bool
job_Web designer                  bool
job_Wellsite geologist            bool
job_Writer                        bool
Length: 1193, dtype: object


In [31]:
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_val.columns = X_val.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
print(X_train.columns)
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
lgb_model.fit(X_train, y_train)
print("\nEvaluation on Validation Set:")
evaluate_model(lgb_model, X_val, y_val)
print("\nEvaluation on Test Set:")
evaluate_model(lgb_model, X_test, y_test)


Index(['Unnamed_0', 'amt', 'city_pop', 'unix_time', 'merch_lat', 'merch_long',
       'transaction_hour', 'transaction_day', 'age',
       'merchant_fraud_Abbott_Steuber',
       ...
       'job_Video_editor', 'job_Visual_merchandiser',
       'job_Volunteer_coordinator', 'job_Warden_ranger',
       'job_Waste_management_officer', 'job_Water_engineer',
       'job_Water_quality_scientist', 'job_Web_designer',
       'job_Wellsite_geologist', 'job_Writer'],
      dtype='object', length=1193)
[LightGBM] [Info] Number of positive: 83195, number of negative: 83179
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.190751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4036
[LightGBM] [Info] Number of data points in the train set: 166374, number of used features: 1193
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500048 -> initscore=0.00019

In [32]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'num_leaves': randint(20, 50),
    'min_child_samples': randint(10, 50)
}
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    random_state=42,
    verbose=1
)
X_sample = X_train.sample(frac=0.2, random_state=42)
y_sample = y_train.loc[X_sample.index]
random_search.fit(X_sample, y_sample)
print(f"Best parameters: {random_search.best_params_}")
best_model = random_search.best_estimator_
print("\nBest Model Evaluation on Validation Set:")
evaluate_model(best_model, X_val, y_val)
print("\nBest Model Evaluation on Test Set:")
evaluate_model(best_model, X_test, y_test)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 16584, number of negative: 16691
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.123258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2760
[LightGBM] [Info] Number of data points in the train set: 33275, number of used features: 555
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498392 -> initscore=-0.006431
[LightGBM] [Info] Start training from score -0.006431
Best parameters: {'learning_rate': 0.18323522915498705, 'max_depth': 6, 'min_child_samples': 49, 'n_estimators': 180, 'num_leaves': 41}

Best Model Evaluation on Validation Set:
[[20754    51]
 [   43 20746]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     20805
         1.0       1.00      1.00      1.00     20789

    accuracy                           1.00     41594
   macro avg     

In [33]:
def preprocess_user_input(data, scaler, columns):
    data = pd.get_dummies(data, columns=['merchant', 'category', 'gender', 'job'], drop_first=True)
    data = data.reindex(columns=columns, fill_value=0)
    num_cols = ['amt', 'city_pop', 'merch_lat', 'merch_long', 'age']
    data[num_cols] = scaler.transform(data[num_cols])
    return data
def predict_fraud(model, scaler, columns):
    print("\nEnter transaction details:")
    merchant = input("Merchant name: ")
    category = input("Transaction category (e.g., shopping, food, etc.): ")
    amt = float(input("Transaction amount ($): "))
    gender = input("Gender (M/F): ")
    job = input("Job title: ")
    city_pop = int(input("City population: "))
    merch_lat = float(input("Merchant latitude: "))
    merch_long = float(input("Merchant longitude: "))
    age = int(input("Age of the cardholder: "))
    user_data = pd.DataFrame({
        'amt': [amt],
        'city_pop': [city_pop],
        'merch_lat': [merch_lat],
        'merch_long': [merch_long],
        'age': [age],
        'merchant': [merchant],
        'category': [category],
        'gender': [gender],
        'job': [job]
    })
    user_data = preprocess_user_input(user_data, scaler, columns)
    prediction = model.predict(user_data)
    if prediction[0] == 1:
        print("\nThe transaction is predicted to be **FRAUDULENT**.")
    else:
        print("\nThe transaction is predicted to be **LEGITIMATE**.")
print("\nUsing the best trained Random Forest model for prediction.")
predict_fraud(best_model, scaler, X.columns)



Using the best trained Random Forest model for prediction.

Enter transaction details:
Merchant name: fraud_Kirlin and Sons
Transaction category (e.g., shopping, food, etc.): personal_care
Transaction amount ($): 2.86
Gender (M/F): M
Job title: Mechanical engineer
City population: 333497
Merchant latitude: 33.986391
Merchant longitude: -81.200714
Age of the cardholder: 40

The transaction is predicted to be **LEGITIMATE**.
