In [27]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

from src.data.data_load import load_csv
from src.models.data_prep import data_preparation
from src.models.train_model import train_model
from  src.models.train_model import fine_tuning_model
from src.models.test_model import model_val_accuracy
from src.models.test_model import model_accuracy
from src.models.utils_model import save_model


### Data preparation

In [4]:
# Processed fraud dataset snapshot; the path is relative to the notebook's working directory.
FRAUD_DATA_PATH = "../data/processed/fraud_data_v2025-02-18_0.23068032562127705.csv"
fraud_data = load_csv(FRAUD_DATA_PATH)

In [5]:
# Sanity-check the dimensions of the loaded table.
fraud_data.shape

(14380, 8)

In [6]:
# Preview the first rows to confirm the feature columns and the is_fraud label loaded as expected.
fraud_data.head()

Unnamed: 0,trans_time,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,0,14.37,64.7556,-165.6723,145,65.654142,-164.722603,1
1,15,966.11,64.7556,-165.6723,145,65.468863,-165.473127,1
2,22,49.61,64.7556,-165.6723,145,65.347667,-165.914542,1
3,23,295.26,64.7556,-165.6723,145,64.445035,-166.080207,1
4,23,18.17,64.7556,-165.6723,145,65.447094,-165.446843,1


In [7]:
# Partition the table into train / validation / test splits, using "is_fraud" as the target.
# Each split comes back as a (features, labels) pair.
splits = data_preparation(data=fraud_data, target="is_fraud")
(X_train, y_train), (X_val, y_val), (X_test, y_test) = splits

In [8]:
# Training split dimensions: feature matrix first, label vector second.
X_train.shape, y_train.shape

((11504, 7), (11504,))

In [9]:
# Validation split dimensions: feature matrix first, label vector second.
X_val.shape, y_val.shape

((1438, 7), (1438,))

In [10]:
# Inspect the prepared training features.
# NOTE(review): the displayed values look centred/scaled, so data_preparation
# presumably standardizes the features — confirm in src.models.data_prep.
X_train

array([[-1.52305517,  0.36684059, -1.29510039, ...,  3.91964644,
        -1.32310024, -0.53106986],
       [-1.38527176, -0.04864203,  0.19746507, ..., -0.36303183,
         0.06953376,  1.1413922 ],
       [-0.55857133, -0.29861756,  0.33652503, ..., -0.27165445,
         0.36500835,  0.20081971],
       ...,
       [ 1.37039633, -0.33436604,  0.30212529, ..., -0.24609338,
         0.15858207,  0.37660197],
       [-1.66083857,  0.64979441,  1.48583307, ...,  2.52562844,
         1.58296988, -0.85281528],
       [-0.83413814, -0.20126561, -1.4341224 , ..., -0.36162252,
        -1.3621358 ,  0.45095517]], shape=(11504, 7))

### Model training and validation

In [11]:
# Baseline k-nearest-neighbours classifier, constructed with default hyperparameters.
# NOTE(review): train_model is a project helper; presumably it fits the estimator
# on (X_train, y_train) and returns it — confirm in src.models.train_model.
KNN_model = train_model(KNeighborsClassifier(), X_train, y_train)

In [12]:
# Baseline support-vector classifier, constructed with default hyperparameters
# and passed to the project's train_model helper with the training split.
svm_model = train_model(SVC(), X_train, y_train)

In [13]:
# Baseline logistic-regression classifier, constructed with default hyperparameters
# and passed to the project's train_model helper with the training split.
reg_log_model = train_model(LogisticRegression(), X_train, y_train)

In [14]:
# Score each baseline model on the training and validation splits.
# Downstream cells read index 0 as the train accuracy and index 1 as the validation accuracy.
knn_scores = model_val_accuracy(KNN_model, y_train, y_val, X_train, X_val)
svm_scores = model_val_accuracy(svm_model, y_train, y_val, X_train, X_val)
reg_log_scores = model_val_accuracy(reg_log_model, y_train, y_val, X_train, X_val)

# Flattened as [knn_train, knn_val, svm_train, svm_val, logreg_train, logreg_val].
val_scores = [
    scores[split_idx]
    for scores in (knn_scores, svm_scores, reg_log_scores)
    for split_idx in (0, 1)
]


In [15]:
# Pair each display label with its (train, validation) score container so the
# table columns are derived from one ordered list.
labelled_scores = [
    ("KNN", knn_scores),
    ("SVM", svm_scores),
    ("Logistic Regression", reg_log_scores),
]

results = {
    "Model": [label for label, _ in labelled_scores],
    "train_accuracy": [scores[0] for _, scores in labelled_scores],
    "val_accuracy": [scores[1] for _, scores in labelled_scores],
}

# One row per model, ready for rich display in the next cell.
accuracy_scores = pd.DataFrame(results)

In [16]:
# Baseline train vs. validation accuracy for each model, side by side.
accuracy_scores

Unnamed: 0,Model,train_accuracy,val_accuracy
0,KNN,0.960362,0.944367
1,SVM,0.946019,0.941586
2,Logistic Regression,0.926026,0.911683


### Model fine-tuning

In [17]:
# Search space for k-NN: neighbour counts 1 through 10 under two distance metrics.
params = {
    'n_neighbors': list(range(1, 11)),
    'metric': ['euclidean', 'manhattan'],
}

# NOTE(review): the third positional argument (5) is presumably the number of
# cross-validation folds — confirm in src.models.train_model.fine_tuning_model.
KNN_estimator = fine_tuning_model(KNeighborsClassifier(), params, 5, X_train, y_train)

# Keep the search's best_estimator_ as the tuned k-NN model.
KNN_final_model = KNN_estimator.best_estimator_

In [18]:
# Search space for the support-vector classifier.
# NOTE(review): per the scikit-learn SVC documentation, gamma only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' candidates presumably differ
# only in C — consider splitting the grid per kernel if fine_tuning_model allows it.
params = dict(
    C=[0.1, 1],
    kernel=['linear', 'rbf'],
    gamma=[0.1, 1],
)

# Same search settings as the other models (third positional argument is 5).
SVM_estimator = fine_tuning_model(SVC(), params, 5, X_train, y_train)

# Keep the search's best_estimator_ as the tuned SVM.
SVM_final_model = SVM_estimator.best_estimator_

In [19]:
# Candidate values of C, one per decade: 1, 10, 100, 1000.
params = {'C': [10 ** exponent for exponent in range(4)]}

# Same search settings as the other models (third positional argument is 5).
log_reg_estimator = fine_tuning_model(LogisticRegression(), params, 5, X_train, y_train)

# Keep the search's best_estimator_ as the tuned logistic-regression model.
log_reg_final_model = log_reg_estimator.best_estimator_

In [20]:
# Re-score the tuned models on the training and validation splits.
# This rebinds knn_scores / svm_scores / reg_log_scores, replacing the baseline values.
knn_scores = model_val_accuracy(KNN_final_model, y_train, y_val, X_train, X_val)
svm_scores = model_val_accuracy(SVM_final_model, y_train, y_val, X_train, X_val)
reg_log_scores = model_val_accuracy(log_reg_final_model, y_train, y_val, X_train, X_val)

# Flattened as [knn_train, knn_val, svm_train, svm_val, logreg_train, logreg_val].
new_val_scores = [
    scores[split_idx]
    for scores in (knn_scores, svm_scores, reg_log_scores)
    for split_idx in (0, 1)
]


In [21]:
# Pair each display label with its tuned (train, validation) score container so
# the table columns are derived from one ordered list.
labelled_scores = [
    ("KNN", knn_scores),
    ("SVM", svm_scores),
    ("Logistic Regression", reg_log_scores),
]

results = {
    "Model": [label for label, _ in labelled_scores],
    "train_accuracy": [scores[0] for _, scores in labelled_scores],
    "val_accuracy": [scores[1] for _, scores in labelled_scores],
}

# Rebinds accuracy_scores with the post-tuning numbers.
accuracy_scores = pd.DataFrame(results)

In [22]:
# Train vs. validation accuracy after hyperparameter tuning.
accuracy_scores

Unnamed: 0,Model,train_accuracy,val_accuracy
0,KNN,0.956798,0.943672
1,SVM,0.959145,0.947844
2,Logistic Regression,0.926113,0.911683


### Model Assessment

In [23]:
# Score the tuned SVM on the test split, which was not used for fitting or tuning in this notebook.
# Note the helper's argument order: model, labels, then features.
accuracy = model_accuracy(SVM_final_model, y_test, X_test)
print(f"SVM model accuracy: {accuracy}")

SVM model accuracy: 0.9596662030598053


In [24]:
# Score the tuned k-NN model on the test split (argument order: model, labels, features).
# `accuracy` is rebound here, so the previous model's value is overwritten.
accuracy = model_accuracy(KNN_final_model, y_test, X_test)
print(f"KNN model accuracy: {accuracy}")

KNN model accuracy: 0.958970792767733


In [25]:
# Score the tuned logistic-regression model on the test split (argument order: model, labels, features).
# `accuracy` is rebound here, so the previous model's value is overwritten.
accuracy = model_accuracy(log_reg_final_model, y_test, X_test)
print(f"Logistic Regression model accuracy: {accuracy}")

Logistic Regression model accuracy: 0.9304589707927677


### Save the model

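The best model after assessment is the SVM model: after tuning it has the highest validation accuracy (0.948) and the highest test accuracy (0.960) of the three candidates.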

In [28]:
# Persist the tuned SVM, passing "../models" as the destination.
# NOTE(review): the output file name and serialization format are decided inside
# save_model and are not visible here — confirm in src.models.utils_model.
save_model(SVM_final_model, "../models")