In [None]:
# Load the dataset and get the overall information:
import pandas as pd
df = pd.read_csv('/Users/trangnguyen/Documents/GitHub/PersonalPrj/train.csv')
df.head()

In [None]:
import pandas as pd
import numpy as np
import copy
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib 
import matplotlib.patches as mpatches

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings("ignore")

In [None]:
df.shape

## I. Data Cleaning:

In [None]:
# Check is is there any null value 
df.isnull().sum()

In [None]:
# Show unique Age values
df['Age'].unique()

In [None]:
# Function to shorten the age, if they are 5 digits remove to 2 digits only
def shorten_age(age):
    if abs(age) >= 1000:
        return int(str(age)[:2])  
    else:
        return age  

def positive_age(age):
    if age < 0: 
        return -age
    else:
        return age

df['Age'] = df['Age'].apply(shorten_age).apply(positive_age)
print(df['Age'].unique())


In [None]:
# Show unique DeviceType values
df['DeviceType'].unique()

In [None]:
# Merge them into 3 groups: Desktop, Mobile, Table
df['DeviceType'] = df['DeviceType'].replace({'mob':'Mobile', 'iphone 15' : 'Mobile', 'android': 'Mobile', 'smartphone': 'Mobile', 'galaxys7': 'Mobile'}) 
df['DeviceType'].unique()


In [None]:
df['Gender'] = df['Gender'].replace({'he':'Male', 'man' : 'Male', 'isnotfemale': 'Male'})
df['Gender'] = df['Gender'].replace({'fem':'Female', 'isnotmale' : 'Female', 'woman': 'Female', 'she': 'Female'})
df['Gender'] = df['Gender'].replace('Male', '0')
df['Gender'] = df['Gender'].replace('Female', '1')
df['Gender'].unique()

In [None]:
# Replace inconsistent values
df['Income'] = df['Income'].str.replace('$','').str.replace( 'AU$' , '').str.replace('AUD' ,'').str.replace('AU', '').astype(float)
print(df['Income'].describe())

In [None]:
# Convert Expenditure:
df['Expenditure'] = df['Expenditure'].replace({'AU$36604.93': '36604.93'}, regex=True)
# Clean the 'Expenditure' column by removing currency symbols and unwanted characters
df['Expenditure'] = df['Expenditure'].replace({'AU\$': '', 'AUD': '', 'AED': '', ' ': ''}, regex=True)

# Convert the cleaned column to numeric values
df['Expenditure'] = pd.to_numeric(df['Expenditure'], errors='coerce')

In [None]:
def change_currency(n):
    n = str(n)
    
    if n.endswith('AUD') or n.startswith('AU$'):
        n = n.replace('AU$', '').replace('AUD', '').strip()
        try:
            n = float(n)
            return n / 1.96  
        except ValueError:
            return None  
    else:
        n = n.replace('GBP', '').replace('£', '').replace('¬', '').strip()
        try:
            n = float(n)
            return n
        except ValueError:
            return None

In [None]:
df['GiftsTransaction'] = df['GiftsTransaction'].apply(change_currency)

In [None]:
df['TransactionAmount'] = df['TransactionAmount'].replace({'AU\$': '', 'AUD': '', 'AED': '', ' ': ''}, regex=True)

In [None]:
df = df.drop('TransactionNumber', axis = 1)
df = df.drop('UserID', axis = 1)

In [None]:
df['TransactionAmount'] = pd.to_numeric(df['TransactionAmount'], errors='coerce').astype('float64')

In [None]:
df.info()

In [None]:
df = df.drop('EmailDomain', axis = 1)
df = df.drop('Latitude', axis = 1)
df = df.drop('Longitude', axis = 1)

In [None]:
df.nunique()

In [None]:
df.info()

In [None]:
# Get all the categorical columns:
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns.drop('TransactionDate')
categorical_columns = categorical_columns.drop('TransactionTime')
categorical_columns = categorical_columns.drop('TransactionLocation')
categorical_columns = categorical_columns.drop('MerchantID')


In [None]:
encode1 = LabelEncoder()
df['TransactionLocation'] = encode1.fit_transform(df['TransactionLocation'])
df['Terrorism'] = encode1.fit_transform(df[['Terrorism']])
df['TransactionDate'] = encode1.fit_transform(df['TransactionDate'])
df['TransactionTime'] = encode1.fit_transform(df['TransactionTime'])
df['MerchantID'] = encode1.fit_transform(df['MerchantID'])


In [None]:
from sklearn.preprocessing import OneHotEncoder
df = pd.get_dummies(df, columns=categorical_columns, dtype = int)

In [None]:
df.corr()

In [None]:
df.info()

In [None]:
from sklearn. linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn. model_selection import train_test_split 
from sklearn.metrics import accuracy_score

X = df.drop(columns=['IsFraud']) 
y = df['IsFraud']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic regression

In [None]:
# Create an instance of the StandardScaler for normalization
scaler = StandardScaler ()

# Normalize the features in X
X_train_normalized = scaler.fit_transform(X_train)

# Create an instance of Logistic Regression
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(X_train_normalized, y_train)

# Get the best fitted line:
print("y = x *", logreg.coef_, "+", logreg.intercept_)

# Normalize the features in X
X_test_normalized = scaler.fit_transform(X_test)

# Testing the model:
y_pred = logreg.predict(X_test_normalized)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Evaluate the model:
# Doing evaluation
print("The accuracy score is: ", accuracy_score(y_test, y_pred))

In [None]:
# Creating RFE object
lr_model = LogisticRegression()
rfe = RFE(estimator=lr_model, n_features_to_select=3, step=1)
rfe.fit(X_train_normalized, y_train)

In [None]:
# Doing evaluation
y_test_hat = rfe.predict(X_test_normalized)
print("The accuracy score is: ", accuracy_score(y_test, y_test_hat))

In [None]:
# Summarize all features
for i in range(X_train.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

In [None]:
# To increment number of features, one at each time
acc_scores = []
for i in range(1,33):
    clf = LogisticRegression()
    rfe = RFE(estimator=clf, n_features_to_select=i)
    # Training model
    rfe.fit(X_train_normalized, y_train)
    # Predicting on test set
    y_pred = rfe.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, y_pred)
    # Print this
    print("Accuracy on test set using", i, "features: ", acc_score)
    # Append to the list
    acc_scores.append(acc_score)

# Estimating accuracy score on test set using RFE by using different number of features
estimator = LogisticRegression()
acc_scores = []
for i in range(1, 33):
    selector = RFE(estimator,n_features_to_select=i)
    selector = selector.fit(X_train_normalized, y_train)
    supp = selector.get_support()

    predicted = selector.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, predicted)
    acc_scores.append(acc_score)

best = 1
for item1 in acc_scores:
    if item1 > acc_scores[best - 1]:
        best = acc_scores.index(item1) + 1
plt.grid()
plt.xlabel('# No. of features')
plt.ylabel('Accuracy score on test set')
plt.plot(range(1, 33), acc_scores, marker = 'o', color = 'lightblue', markeredgewidth = 1 ,markeredgecolor = 'lightblue', markerfacecolor = 'None')
plt.plot(best, acc_scores[best-1], marker = 'o', markerfacecolor = 'red')

In [None]:
# Training Logistic Regression model
model = LogisticRegression()
model.fit(X_train_normalized, y_train)
# Doing predictions on train and test set
y_hat_train = model.predict(X_train_normalized)
y_hat_test = model.predict(X_test_normalized)

In [None]:
# Evaluate the performance of the trained model
print("Accuracy score on training set: ", accuracy_score(y_train, y_hat_train))
print("Accuracy score on testing set: ", accuracy_score(y_test, y_hat_test))
# Checking confusion matrix
print("Confusion matrix on test set: ")
print(confusion_matrix(y_test, y_hat_test))
print("Confusion matrix on train set: ")
print(confusion_matrix(y_train, y_hat_train))

### Decision Tree

In [None]:
# Create and train a Decision Tree classifier model
dtree = DecisionTreeClassifier()
dtree.fit(X_train_normalized, y_train)

# Evaluate the model
dtree_y_pred = dtree.predict(X_test_normalized)
dtree_accuracy = accuracy_score(y_test, dtree_y_pred)
dtree_f1 = f1_score(y_test,dtree_y_pred)
print('The accuracy score is:', dtree_accuracy)
print('The f1 score is:', dtree_f1)

In [None]:
# To increment number of features, one at each time
acc_scores = []
for i in range(1,33):
    dt = DecisionTreeClassifier()
    rfe_new = RFE(estimator=dt, n_features_to_select=i)
    # Training model
    rfe_new.fit(X_train_normalized, y_train)
    # Predicting on test set
    y_pred = rfe_new.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, y_pred)
    # Print this
    print("Accuracy on test set using", i, "features: ", acc_score)
    # Append to the list
    acc_scores.append(acc_score)

# Estimating accuracy score on test set using RFE by using different number of features
estimator = DecisionTreeClassifier()
acc_scores = []
f1_scores = []
for i in range(1, 33):
    selector = RFE(estimator,n_features_to_select=i)
    selector = selector.fit(X_train_normalized, y_train)
    supp = selector.get_support()

    predicted = selector.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, predicted)
    acc_scores.append(acc_score)
    

best = 1
for item1 in acc_scores:
    if item1 > acc_scores[best - 1]:
        best = acc_scores.index(item1) + 1
plt.grid()
plt.xlabel('# No. of features')
plt.ylabel('Accuracy score on test set')
plt.plot(range(1, 33), acc_scores, marker = 'o', color = 'lightblue', markeredgewidth = 1 ,markeredgecolor = 'lightblue', markerfacecolor = 'None')
plt.plot(best, acc_scores[best-1], marker = 'o', markerfacecolor = 'red')

In [None]:
# Get the number of features that has the maximisation performance
# print("The number of features for best accuracy score:", best)

In [None]:
# Training Logistic Regression model
model = DecisionTreeClassifier()
model.fit(X_train_normalized, y_train)
# Doing predictions on train and test set
y_hat_train = model.predict(X_train_normalized)
y_hat_test = model.predict(X_test_normalized)
# Evaluate the performance of the trained model
print("Accuracy score on training set: ", accuracy_score(y_train, y_hat_train))
print("Accuracy score on testing set: ", accuracy_score(y_test, y_hat_test))
# Checking confusion matrix
print("Confusion matrix on test set: ")
print(confusion_matrix(y_test, y_hat_test))
print("Confusion matrix on train set: ")
print(confusion_matrix(y_train, y_hat_train))

### KNN Classifier Model:

In [85]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [86]:
# Import the KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# Build a KNN classifier model
clf_knn = KNeighborsClassifier(n_neighbors=1)

# Train the model with the training data with new features selected
clf_knn.fit(X_train_normalized, y_train)

In [87]:
y_pred_new = clf_knn.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_pred_new)
print("The accuracy score for testing is:", accuracy)
print("The f1 score for testing is:", f1_score(y_test, y_pred_new))

The accuracy score for testing is: 0.7889699179580675
The f1 score for testing is: 0.6995457495133031


In [88]:
y_pred_train_new = clf_knn.predict(X_train_normalized)
print("The accuracy score for training is:", accuracy_score(y_train, y_pred_train_new))
print("The f1 score for training is:", f1_score(y_train, y_pred_train_new))

The accuracy score for training is: 1.0
The f1 score for training is: 1.0


In [89]:
parameter_grid = {'n_neighbors': range(1,33)}
knn_clf = KNeighborsClassifier()
gs_knn = GridSearchCV(knn_clf, parameter_grid, cv=5, scoring='accuracy')
gs_knn.fit(X_train_normalized, y_train)

print('Best K value: ', gs_knn.best_params_['n_neighbors'])
print('The accuracy: %.4f\n' % gs_knn.best_score_)

Best K value:  15
The accuracy: 0.8129



# Visualise the performance change with respect to K using a line chart
import numpy as np
k_values = gs_knn.cv_results_['param_n_neighbors'].data
mean_test_scores = gs_knn.cv_results_['mean_test_score']

plt.figure(figsize=(10, 6))
plt.plot(k_values, mean_test_scores, marker='o')
plt.title('KNN Hyperparameter Tuning with Grid Search')
plt.xlabel('Number of K')
plt.ylabel('Mean Accuracy')
plt.xticks(np.arange(1, 33, step=1))
plt.grid()
plt.show()

In [91]:
#Get the metrics for the best case:
best_clf = KNeighborsClassifier(n_neighbors = 15)
best_clf.fit(X_train_normalized, y_train)
y_for_pred = best_clf.predict(X_test_normalized)
print("The accuracy score is:", accuracy_score(y_test, y_for_pred))
print("The f1 score is:", f1_score(y_test, y_for_pred))

The accuracy score is: 0.8067456700091158
The f1 score is: 0.6849925705794947


In [92]:
# Run the model with l1 metrics:
knn_l1 = KNeighborsClassifier(n_neighbors=15, metric='l1')
knn_l1.fit(X_train_normalized, y_train)
y_l1 = knn_l1.predict(X_test_normalized)
print("The accuracy score is:", accuracy_score(y_test, y_l1))
print("The f1 score is:", f1_score(y_test, y_l1))

The accuracy score is: 0.8263445761166819
The f1 score is: 0.7233115468409587


In [93]:
# Run the model with cosine metrics:
knn_cosine = KNeighborsClassifier(n_neighbors=15, metric='cosine')
knn_cosine.fit(X_train_normalized, y_train)
y_cosine = knn_cosine.predict(X_test_normalized)
print("The accuracy score is:", accuracy_score(y_test, y_cosine))
print("The f1 score is:", f1_score(y_test, y_cosine))

The accuracy score is: 0.8176845943482224
The f1 score is: 0.7101449275362319


### Random Forest:

In [95]:
# Create and train a Random Forest classifier model
rf = RandomForestClassifier()
rf.fit(X_train_normalized, y_train)

# Evaluate the model
rf_y_pred = rf.predict(X_test_normalized)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)
print('The accuracy score is:', rf_accuracy)
print('The f1 score is:', rf_f1)

The accuracy score is: 0.9694621695533272
The f1 score is: 0.9598562013181547


In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

acc_scores = []
rf = RandomForestClassifier()

for i in range(1, 33):
    rfe_new = RFE(estimator=rf, n_features_to_select=i)
    rfe_new.fit(X_train_normalized, y_train)
    y_pred = rfe_new.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, y_pred)
    print(f"Accuracy on test set using {i} features: {acc_score}")
    acc_scores.append(acc_score)


Accuracy on test set using 1 features: 0.7219690063810392
Accuracy on test set using 2 features: 0.8842297174111212
Accuracy on test set using 3 features: 0.9443938012762079
Accuracy on test set using 4 features: 0.96718322698268
Accuracy on test set using 5 features: 0.9685505925250684
Accuracy on test set using 6 features: 0.9726526891522334
Accuracy on test set using 7 features: 0.9717411121239745
Accuracy on test set using 8 features: 0.9726526891522334
Accuracy on test set using 9 features: 0.9731084776663628
Accuracy on test set using 10 features: 0.9726526891522334
Accuracy on test set using 11 features: 0.9721969006381039
Accuracy on test set using 12 features: 0.9721969006381039
Accuracy on test set using 13 features: 0.9721969006381039
Accuracy on test set using 14 features: 0.9717411121239745
Accuracy on test set using 15 features: 0.9731084776663628
Accuracy on test set using 16 features: 0.9726526891522334
Accuracy on test set using 17 features: 0.9721969006381039
Accuracy

# Estimating accuracy score on test set using RFE by using different number of features
estimator = RandomForestClassifier()
acc_scores = []
for i in range(1, 33):
    selector = RFE(estimator,n_features_to_select=i)
    selector = selector.fit(X_train_normalized, y_train)
    supp = selector.get_support()

    predicted = selector.predict(X_test_normalized)
    acc_score = accuracy_score(y_test, predicted)
    acc_scores.append(acc_score)
    

best = 1
for item1 in acc_scores:
    if item1 > acc_scores[best - 1]:
        best = acc_scores.index(item1) + 1
plt.grid()
plt.xlabel('# No. of features')
plt.ylabel('Accuracy score on test set')
plt.plot(range(1, 33), acc_scores, marker = 'o', color = 'lightblue', markeredgewidth = 1 ,markeredgecolor = 'lightblue', markerfacecolor = 'None')
plt.plot(best, acc_scores[best-1], marker = 'o', markerfacecolor = 'red')
print("The best number of features for accuracy score is:", best)

In [98]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train_normalized, y_train)
xgb_y_pred = xgb_clf.predict(X_test_normalized)

# Evaluate the model: 
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred, average='weighted')

print('The Accuracy:', xgb_accuracy)
print('The F1 Score:', xgb_f1)

The Accuracy: 0.9594348222424794
The F1 Score: 0.9593800823475592


In [99]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid Search to tune the parameter 
grid_search = GridSearchCV(xgb.XGBClassifier(random_state=42), param_grid, 
                           scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train_normalized, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_normalized)
print("Best Model Accuracy:", accuracy_score(y_test, y_pred))


Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 1.0}
Best Model Accuracy: 0.9721969006381039


In [100]:
import streamlit as st
import shap 

In [101]:
st.write("""
# Fraud Transaction Detection App

This app helps to detect illegal transaction
""")
st.write('---')


2025-03-18 11:18:20.866 
  command:

    streamlit run /Users/trangnguyen/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [102]:
# Sidebar
# Header of Specify Input Parameters
st.sidebar.header('Specify Input Parameters')



DeltaGenerator(_root_container=1, _parent=DeltaGenerator())

In [103]:
import streamlit as st
import pandas as pd

def user_input_features():
    st.sidebar.header("User Input Features")

    # Numerical Inputs
    Age = st.sidebar.slider('Age', min_value=0, max_value=100, value=10)
    NumDependents = st.sidebar.slider('Number of Dependents', min_value=0, max_value=10, value=1)
    UserTenure = st.sidebar.slider('User Tenure (Months)', min_value=0, max_value=240, value=12)
    Income = st.sidebar.slider('User Income', min_value=0, max_value=1000000, value=20000)
    Expenditure = st.sidebar.slider('Expenditure', min_value=0, max_value=1000000, value=20000)

    # Categorical Inputs (Using selectbox)
    Gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
    Occupation = st.sidebar.selectbox('Occupation', ['Student', 'Professional', 'Self-Employed', 'Retired'])
    EducationLevel = st.sidebar.selectbox('Education Level', ['High School', 'Bachelors', 'Masters', 'PhD'])
    MaritalStatus = st.sidebar.selectbox('Marital Status', ['Single', 'Married', 'Divorced', 'Widowed'])
    TransactionType = st.sidebar.selectbox('Transaction Type', ['Online', 'In-store'])
    DeviceType = st.sidebar.selectbox('Device Type', ['Mobile', 'Desktop', 'Tablet'])
    # Transaction Amount 
    TransactionAmount = st.sidebar.number_input('Transaction Amount ($)', min_value=0.0, max_value=10000.0, value=50.0)

    # Transaction Date 
    TransactionDate = st.sidebar.date_input('Transaction Date')

    # Terrorism Flag 
    Terrorism = st.sidebar.checkbox('Terrorism Or Not')

    # Latitude & Longitude (Handling missing values)
    Latitude = st.sidebar.number_input('Latitude', value=0.0, format="%.6f")
    Longitude = st.sidebar.number_input('Longitude', value=0.0, format="%.6f")

    # Create DataFrame
    data = {
        'Age': Age,
        'NumDependents': NumDependents,
        'UserTenure': UserTenure,
        'Gender': Gender,
        'Occupation': Occupation,
        'EducationLevel': EducationLevel,
        'MaritalStatus': MaritalStatus,
        'Income': Income,
        'Expenditure': Expenditure,
        'TransactionType': TransactionType,
        'DeviceType': DeviceType,
        'TransactionAmount': TransactionAmount,
        'TransactionDate': str(TransactionDate),
        'Terrorism': int(Terrorism),
        'Latitude': Latitude,
        'Longitude': Longitude
    }

    # Convert to DataFrame
    return pd.DataFrame(data, index=[0])


# Run function and display user inputs
input_features = user_input_features()
st.write("User Input Features:", input_features)

2025-03-18 11:18:20.881 Session state does not function when running a script without `streamlit run`


In [None]:
import streamlit as st
!streamlit run project.py --server.enableCORS false --server.enableXsrfProtection false

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://10.126.226.36:8501[0m
[0m
2025-03-18 11:18:23.240 Uncaught app execution
Traceback (most recent call last):
  File "/Users/trangnguyen/anaconda3/lib/python3.11/site-packages/streamlit/runtime/scriptrunner/exec_code.py", line 121, in exec_func_with_error_handling
    result = func()
             ^^^^^^
  File "/Users/trangnguyen/anaconda3/lib/python3.11/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 593, in code_to_exec
    exec(code, module.__dict__)
  File "/Users/trangnguyen/Documents/GitHub/PersonalPrj/project.py", line 21, in <module>
    get_ipython().run_line_magic('matplotlib', 'inline')
    ^^^^^^^^^^^
NameError: name 'get_ipython' is not defined
