In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, roc_curve, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [43]:
try:
    df = pd.read_csv('data/card_transdata.csv')
except:
    df = pd.read_csv('creditcard.csv')

In [44]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,4.567738,0.18192,0.782519,1.0,0.0,0.0,0.0,0.0
1,14.275128,2.989076,0.577302,1.0,1.0,0.0,1.0,0.0
2,4.647121,0.653272,1.961686,1.0,1.0,0.0,1.0,0.0
3,83.55236,0.045183,4.05303,1.0,1.0,0.0,1.0,1.0
4,6.770671,0.500616,2.985819,1.0,0.0,0.0,1.0,0.0


In [50]:
df.size, df.info(), df[df.columns[0]].count(), len(df.index)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   distance_from_home              700000 non-null  float64
 1   distance_from_last_transaction  700000 non-null  float64
 2   ratio_to_median_purchase_price  700000 non-null  float64
 3   repeat_retailer                 700000 non-null  float64
 4   used_chip                       700000 non-null  float64
 5   used_pin_number                 700000 non-null  float64
 6   online_order                    700000 non-null  float64
 7   fraud                           700000 non-null  float64
dtypes: float64(8)
memory usage: 42.7 MB


(5600000, None, 700000, 700000)

In [59]:
category_features = ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']
numeric_features = ['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price']
target = "fraud"

In [60]:
X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=17)

clf = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

In [64]:
#Checking if the model is overfit or underfit

# f1_score on train data
X_train_pred = clf.predict(X_train)
train_data_f1 = f1_score(X_train_pred, y_train)
print("Train f1 Score: ", train_data_f1)

# f1_score on test data
train_data_f1 = f1_score(y_pred, y_test)
print("Test f1 Score: ", train_data_f1)

Train f1 Score:  0.7104233806489705
Test f1 Score:  0.7085821447959335


In [103]:
param_grid = [    
    {'penalty' : ['l1', 'l2'],
    # 'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear'],
    'max_iter' : [100, 1000, 5000]
    }
]

clf = GridSearchCV(LogisticRegression(), param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Booklyn\miniconda3\envs\fraud\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Booklyn\miniconda3\envs\fraud\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Booklyn\miniconda3\envs\fraud\lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Booklyn\miniconda3\envs\fraud\lib\site-packages

In [104]:
best_clf.best_estimator_

In [105]:
print (f'Accuracy - : {best_clf.score(X_test, y_test):.3f}')

Accuracy - : 0.959


In [107]:
from joblib import load, dump
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(best_clf, f_out)