In [None]:
# Install every third-party package the notebook imports into the kernel's own environment.
# xgboost is added because it is imported below but was missing from this list.
%pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd

# Location of the training CSV (presumably a mounted Google Drive folder - confirm before running).
DATA_PATH = '/content/drive/MyDrive/FraudTrain.csv'

# Read the whole file into memory as the working frame used by every later cell.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Boolean mask with one missing-value flag per cell of the frame.
data.isnull()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Default summary statistics of the frame.
data.describe()

In [None]:
# Frequency of each distinct value in the 'Index' column.
data['Index'].value_counts()

In [None]:
# Discard the left-most column, then preview what remains.
# NOTE(review): re-running this cell removes one more column each time.
data = data.drop(columns=data.columns[:1])
data.head()

In [None]:
# Parse the transaction timestamp column into pandas datetimes and confirm the resulting dtype.
TIMESTAMP_COL = 'trans_date_trans_time'
data[TIMESTAMP_COL] = pd.to_datetime(data[TIMESTAMP_COL])
print(data[TIMESTAMP_COL].dtype)
data.head()

In [None]:
# Data type of every column (checks that the timestamp conversion took effect).
data.dtypes

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Derive calendar features from the parsed transaction timestamp.
timestamps = data['trans_date_trans_time'].dt
data = data.assign(
    # hour of the transaction
    trans_hour=timestamps.hour,
    # pandas numbers Monday as 0, so +1 shifts the week to 1..7
    trans_day_of_week=(timestamps.dayofweek + 1).astype(int),
    # monthly period of the transaction
    trans_year_month=timestamps.to_period('M'),
)

data.head(10)

In [None]:
# Customer age in whole years at the time of each transaction, derived from the date of birth.
data['dob'] = pd.to_datetime(data['dob'])
elapsed_days = (data['trans_date_trans_time'] - data['dob']).dt.days
# 365.25 days per year; the integer cast discards the fractional part.
data['age'] = (elapsed_days / 365.25).astype(int)

data['age'].head()

In [None]:
# Preview the frame with the newly derived columns.
data.head()

In [None]:
# Column dtypes, non-null counts and memory footprint.
data.info()

In [None]:
# Cast `category`, `gender` and the `is_fraud` label to pandas' categorical dtype.
categorical_columns = ('category', 'gender', 'is_fraud')
data = data.astype({column: 'category' for column in categorical_columns})

data.info()

In [None]:
# (rows, columns) of the working frame.
data.shape

In [None]:
# Default summary statistics, now including the derived columns.
data.describe()

In [None]:
# Share of rows in each `is_fraud` class, expressed as a percentage of all rows.
class_counts = data.groupby('is_fraud').size()
class_counts.mul(100).div(len(data))

In [None]:
# Side-by-side summary statistics of `amt`: all rows, non-fraud rows, fraud rows.
AMT_PERCENTILES = [0.5, 0.95, 0.999]

# The overall column keeps the statistic names as a 'Row Type' label column.
overall_stats = (
    data['amt']
    .describe(percentiles=AMT_PERCENTILES)
    .reset_index()
    .rename(columns={'index': 'Row Type', 'amt': 'Total Amount Distribution'})
)

# The per-class columns drop their index so they line up positionally with the labels above.
per_class_stats = [
    data.loc[data['is_fraud'] == flag, ['amt']]
    .describe(percentiles=AMT_PERCENTILES)
    .reset_index(drop=True)
    .rename(columns={'amt': title})
    for flag, title in ((0, 'Non-Fraud Amount Distribution'), (1, 'Fraud Amount Distribution'))
]

pd.concat([overall_stats, *per_class_stats], axis=1)

In [None]:
# Histograms of transaction amounts: overall, non-fraud and fraud.
import matplotlib.pyplot as plt
import seaborn as sns

# Amounts above this cap are left out of the plots so the long tail does not flatten the bars.
AMT_PLOT_CAP = 1500
capped = data[data['amt'] <= AMT_PLOT_CAP]

# One row of three panels. plt.subplots returns (figure, axes); unpacking it avoids the
# stray full-figure axes and the empty top row that plt.subplot(234..236) produced before.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

panels = [
    ('Overall Amount Distribution', capped['amt']),
    ('Non Fraud Amount Distribution', capped.loc[capped['is_fraud'] == 0, 'amt']),
    ('Fraud Amount Distribution', capped.loc[capped['is_fraud'] == 1, 'amt']),
]

plots = []
for panel_ax, (title, amounts) in zip(axes, panels):
    plots.append(sns.histplot(amounts, bins=50, ax=panel_ax))
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Transaction Amount')

# Only the left-most panel carries the custom y label, as before.
plots[0].set_ylabel('Number of transactions')

plt.tight_layout()
plt.show()

In [None]:
# Distinct `trans_num` and `cc_num` values per calendar month, over all transactions.
monthly_counts = data.groupby('trans_year_month')[['trans_num', 'cc_num']].nunique()
df_timeline01 = monthly_counts.reset_index().rename(
    columns={
        'trans_year_month': 'year_month',
        'trans_num': 'num_of_transactions',
        'cc_num': 'customers',
    }
)
df_timeline01

In [None]:
# Same monthly breakdown, restricted to rows flagged as fraud.
data_transactions = data[data['is_fraud'] == 1]

monthly_fraud_counts = data_transactions.groupby('trans_year_month')[['trans_num', 'cc_num']].nunique()
df_timeline02 = monthly_fraud_counts.reset_index().rename(
    columns={
        'trans_year_month': 'year_month',
        'trans_num': 'num_of_fraud_transactions',
        'cc_num': 'fraud_customers',
    }
)
df_timeline02

In [None]:
# Number of rows per gender.
gender_dist = data['gender'].value_counts()
print(gender_dist)

In [None]:
# Within each gender, the share of rows in each `is_fraud` class
# (normalize=True normalises inside each gender group; unstack puts the classes in columns).
gender_fraud_dist = data.groupby('gender')['is_fraud'].value_counts(normalize=True).unstack()
print(gender_fraud_dist)


In [None]:
# Two panels: raw gender counts (left) and the fraud share within each gender (right).
fig, (count_ax, share_ax) = plt.subplots(ncols=2, figsize=(12, 5))

# Left panel: number of rows per gender.
sns.countplot(data=data, x='gender', ax=count_ax)
count_ax.set(title='Gender Distribution', xlabel='Gender', ylabel='Count')

# Right panel: stacked proportions taken from the table computed in the previous cell.
gender_fraud_dist.plot.bar(stacked=True, ax=share_ax)
share_ax.set(title='Gender-Fraud Distribution', xlabel='Gender', ylabel='Proportion')
share_ax.legend(title='Is Fraud')

fig.tight_layout()
plt.show()


In [None]:
# Age bands, one label per interval between consecutive bin edges.
custom_labels = ['Teenagers', 'Young Adults', 'Adults', 'Middle-aged', 'Seniors', 'Retired']
bins = [12, 19, 32, 42, 50, 62, float('inf')]

# right=False makes every band closed on the left and open on the right, e.g. [12, 19).
# Ages outside all bands become NaN.
data = data.assign(age_category=pd.cut(data['age'], bins, right=False, labels=custom_labels))

# Spot-check the mapping on the last rows.
print(data[['age', 'age_category']].tail())

In [None]:
# Bar chart of the number of rows in each age band.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='age_category', ax=ax)
ax.set(title='Age Category Distribution', xlabel='Age Category', ylabel='Count')

fig.tight_layout()
plt.show()



In [None]:
# Row counts per (age band, fraud flag), with the fraud flag spread across columns.
grouped_data = data.groupby(['age_category', 'is_fraud'])['age'].count().unstack()

# Grouped bars, one cluster per age band; rot=0 keeps the tick labels horizontal.
ax = grouped_data.plot.bar(figsize=(10, 6), rot=0)
ax.set_xlabel('Age Category')
ax.set_ylabel('Count')
ax.set_title('Distribution of Age Categories by Fraud Status')
ax.legend(title='Is Fraud')

plt.tight_layout()
plt.show()

In [None]:
# Count of rows per age band, split by fraud flag, annotated with each band's fraud ratio.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x='age_category', hue='is_fraud', order=custom_labels,
              palette='viridis', ax=ax)
ax.set(
    xlabel='Age Category',
    ylabel='Count',
    title='Distribution of Age Categories for Fraudulent and Non-Fraudulent Transactions',
)

# Per-band totals and fraud counts, computed once instead of re-filtering the frame per band.
rows_per_category = data['age_category'].value_counts()
fraud_per_category = data.loc[data['is_fraud'] == 1, 'age_category'].value_counts()

# Write the ratio above each band, at the height of the band's total row count.
for position, band in enumerate(custom_labels):
    total_count = int(rows_per_category.get(band, 0))
    fraud_count = int(fraud_per_category.get(band, 0))
    ratio = fraud_count / total_count if total_count > 0 else 0
    ax.text(position, total_count, f'Fraud Ratio: {ratio:.2%}',
            ha='center', va='bottom', fontsize=10, color='red')

plt.show()


In [None]:
# Column names currently in the frame, listed before encoding and pruning.
data.columns

In [None]:
# One-hot encode the categorical inputs `category` and `gender`.
# dtype=int keeps the indicator columns numeric on every pandas version: pandas >= 2.0
# otherwise emits bool columns, which the later `select_dtypes(exclude=['number'])`
# step treats as non-numeric and drops, so the encoded features would never reach the models.
data = pd.get_dummies(data, columns=['category', 'gender'], dtype=int)

In [None]:
# Columns that are excluded from modelling.
UNUSED_COLUMNS = [
    'first', 'last', 'street', 'city', 'state', 'zip',
    'lat', 'long', 'job', 'unix_time', 'merch_lat', 'merch_long',
]
data = data.drop(columns=UNUSED_COLUMNS)

In [None]:
# Target: the fraud label as plain integers. Features: every remaining column.
TARGET = 'is_fraud'
y = data[TARGET].astype('int')
X = data.drop(columns=[TARGET])

In [None]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the class proportions the same in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Prepare the feature matrices for modelling: numeric timestamp, numeric-only columns, scaling.
# NOTE: X_train / X_test leave this cell as NumPy arrays, so the cell is meant to run once
# per train/test split.

# Work on private copies: the frames returned by train_test_split are slices of X, and
# adding columns to them can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace the transaction datetime with an integer timestamp (nanoseconds // 10**9 = seconds).
if 'trans_date_trans_time' in X_train.columns:
    X_train['trans_date_trans_time_numeric'] = X_train['trans_date_trans_time'].astype('int64') // 10**9
    X_test['trans_date_trans_time_numeric'] = X_test['trans_date_trans_time'].astype('int64') // 10**9

    # The original datetime column cannot be scaled, so it is removed.
    X_train = X_train.drop(columns=['trans_date_trans_time'])
    X_test = X_test.drop(columns=['trans_date_trans_time'])
else:
    print("Column 'trans_date_trans_time' not found. Skipping this step.")

# Boolean columns (e.g. one-hot indicators produced by pandas >= 2.0) are not matched by the
# 'number' selector below; cast them to int first so they are kept as features.
bool_columns = X_train.select_dtypes(include=['bool']).columns
if len(bool_columns) > 0:
    bool_casts = {column: int for column in bool_columns}
    X_train = X_train.astype(bool_casts)
    X_test = X_test.astype(bool_casts)

# Everything still non-numeric cannot be scaled; report what is dropped so the loss of
# features is visible.
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns
print(f"Dropping non-numeric columns: {list(non_numeric_columns)}")
X_train = X_train.drop(columns=non_numeric_columns)
X_test = X_test.drop(columns=non_numeric_columns)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE to balance the classes in the training set
# Only the training split is resampled; X_test / y_test are not passed to the sampler.
# random_state=42 fixes the sampler's seed.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
# Logistic regression fitted on the SMOTE-resampled training set, scored on the test set.
log_reg = LogisticRegression(class_weight='balanced').fit(X_train_resampled, y_train_resampled)
y_pred_log_reg = log_reg.predict(X_test)

print("Logistic Regression")
# Same four metrics, printed in a fixed order.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_log_reg)}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test set again and tabulate actual (rows) against predicted (columns) classes.
y_pred_log_reg = log_reg.predict(X_test)
cm = confusion_matrix(y_test, y_pred_log_reg)

# Annotated heatmap of the 2x2 matrix.
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix for Logistic Regression')
plt.show()


In [None]:
# Decision Tree Classifier fitted on the SMOTE-resampled training set, scored on the test set.
from sklearn.tree import DecisionTreeClassifier

# random_state=42 makes the fitted tree reproducible between runs, matching the seed
# already used for the split, SMOTE and the SGD model.
dt_clf = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_clf.fit(X_train_resampled, y_train_resampled)
y_pred_dt_clf = dt_clf.predict(X_test)

print("Decision Tree Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt_clf)}")
print(f"Precision: {precision_score(y_test, y_pred_dt_clf)}")
print(f"Recall: {recall_score(y_test, y_pred_dt_clf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_dt_clf)}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test set again and tabulate actual (rows) against predicted (columns) classes.
y_pred_dt_clf = dt_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt_clf)

# Annotated heatmap of the 2x2 matrix.
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix for Decision Tree Classifier')
plt.show()


In [None]:
# Random Forest Classifier fitted on the SMOTE-resampled training set, scored on the test set.
# random_state=42 makes the forest reproducible between runs, matching the seeded
# random forest used in the stacking cell.
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_clf.fit(X_train_resampled, y_train_resampled)
y_pred_rf_clf = rf_clf.predict(X_test)

print("Random Forest Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_clf)}")
print(f"Precision: {precision_score(y_test, y_pred_rf_clf)}")
print(f"Recall: {recall_score(y_test, y_pred_rf_clf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf_clf)}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test set again and tabulate actual (rows) against predicted (columns) classes.
y_pred_rf_clf = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rf_clf)

# Annotated heatmap of the 2x2 matrix.
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix for Random Forest Classifier')
plt.show()


In [None]:
# XGBoost Classifier fitted on the SMOTE-resampled training set, scored on the test set.

# scale_pos_weight must describe the labels the model is actually fitted on. Deriving it
# from the original, imbalanced y_train while training on SMOTE-balanced data corrects
# the imbalance twice and heavily over-weights the fraud class.
n_pos = int((y_train_resampled == 1).sum())
n_neg = int(len(y_train_resampled) - n_pos)

xgb_clf = XGBClassifier(
    scale_pos_weight=n_neg / n_pos,
    eval_metric='logloss',   # binary log loss; 'mlogloss' is the multiclass metric
    random_state=42,         # same seed as the XGBoost model in the stacking cell
)
# `use_label_encoder` is no longer passed: recent XGBoost releases deprecate and ignore it.
xgb_clf.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_clf.predict(X_test)

print("XGBoost Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb)}")
print(f"Precision: {precision_score(y_test, y_pred_xgb)}")
print(f"Recall: {recall_score(y_test, y_pred_xgb)}")
print(f"F1 Score: {f1_score(y_test, y_pred_xgb)}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test set again and tabulate actual (rows) against predicted (columns) classes.
y_pred_xgb = xgb_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_xgb)

# Annotated heatmap of the 2x2 matrix.
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix for XGBoost Classifier')
plt.show()


In [None]:
from sklearn.linear_model import SGDClassifier

# Linear SVM trained by stochastic gradient descent (hinge loss), fitted on the
# SMOTE-resampled training set and scored on the test set.
sgd_clf = SGDClassifier(
    loss='hinge',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_resampled, y_train_resampled)
y_pred_sgd = sgd_clf.predict(X_test)

print("SGDClassifier (Linear SVM)")
# Same four metrics, printed in a fixed order.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_sgd)}")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test set with the SGD model. The predictions are stored under the same name
# the confusion matrix reads; previously they went into an unused `y_pred_sgd_clf`, so the
# matrix silently depended on a variable left behind by another cell.
y_pred_sgd = sgd_clf.predict(X_test)

# Generate the confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_sgd)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Stochastic Gradient Descent SVM Classifier')
plt.show()


In [None]:
from sklearn.linear_model import SGDClassifier


In [None]:
import time

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Class weight for XGBoost, derived from the labels the stack is actually fitted on.
# Using the original, imbalanced y_train here would correct the imbalance a second time
# on top of SMOTE and heavily over-weight the fraud class.
n_pos = int((y_train_resampled == 1).sum())
n_neg = int(len(y_train_resampled) - n_pos)

# Define a smaller set of base models for quicker testing
estimators = [
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ('xgb', XGBClassifier(scale_pos_weight=n_neg / n_pos, random_state=42)),
    ('sgd', SGDClassifier(loss='log_loss', class_weight='balanced', random_state=42)),
]

# Create the stacking classifier: a logistic regression combines the base models' outputs
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Time the fitting process with a monotonic clock, which is unaffected by system clock changes
start_time = time.perf_counter()

# Fit the stacking model
stacking_clf.fit(X_train_resampled, y_train_resampled)

end_time = time.perf_counter()
print(f"Fitting time: {end_time - start_time} seconds")

# Predict and evaluate the stacking model on the held-out test set
y_pred_stacking = stacking_clf.predict(X_test)

print("Stacking Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stacking)}")
print(f"Precision: {precision_score(y_test, y_pred_stacking)}")
print(f"Recall: {recall_score(y_test, y_pred_stacking)}")
print(f"F1 Score: {f1_score(y_test, y_pred_stacking)}")


In [None]:
# %pip installs into the environment of the running kernel; `!pip` may target another one.
%pip install shap


In [None]:
# %pip installs into the environment of the running kernel; `!pip` may target another one.
%pip install lime


In [None]:
# %pip installs into the environment of the running kernel; `!pip` may target another one.
%pip install optuna

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import shap
from lime.lime_tabular import LimeTabularExplainer
import optuna

# Tunables for this cell.
SEED = 42                # shared seed so every stochastic step is reproducible
N_TOP_FEATURES = 10      # number of features kept after importance ranking
N_SHAP_BACKGROUND = 50   # background rows used by SHAP to mask features
N_SHAP_EXPLAIN = 50      # test rows explained by SHAP (explaining all rows is far too slow)


def print_classification_metrics(title, y_true, y_pred):
    """Print accuracy, precision, recall, F1 and the classification report.

    Args:
        title: Heading printed before the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(title)
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {recall_score(y_true, y_pred)}")
    print(f"F1 Score: {f1_score(y_true, y_pred)}")
    print("Classification Report")
    print(classification_report(y_true, y_pred))


# Sample feature engineering
# NOTE(review): X_train / X_test were built before this column exists, so it does not reach
# any model in this cell; it only becomes a feature if the split cells are re-run.
data['amt_age_interaction'] = data['amt'] * data['age']

# Apply SMOTE for balancing the classes (training rows only)
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature Importance-based Feature Selection
# A dedicated, seeded forest ranks the features; the earlier `rf_clf` model is left intact.
rf_selector = RandomForestClassifier(random_state=SEED)
rf_selector.fit(X_train_resampled, y_train_resampled)
importances = rf_selector.feature_importances_
indices = np.argsort(importances)[::-1]
top_idx = indices[:N_TOP_FEATURES]

# Column names must come from the matrix that was actually scaled. `X.columns` still
# contains the dropped non-numeric columns and lacks the derived timestamp column, so
# indexing it with positions of the scaled matrix yields the wrong names.
scaled_feature_names = getattr(scaler, 'feature_names_in_', None)
if scaled_feature_names is None or len(scaled_feature_names) != X_train.shape[1]:
    # Fallback when the scaler was not fitted on a DataFrame with string column names.
    scaled_feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
top_features = pd.Index(np.asarray(scaled_feature_names)[top_idx])

# Reduced matrices get their own names: overwriting X_test / X_train_resampled would make
# this cell fail on a second run and would break every model fitted on the full feature set.
X_train_top = X_train_resampled[:, top_idx]
X_test_top = X_test[:, top_idx]

# Define base models
# A kernel SVC scales at least quadratically with the number of rows and is impractical on
# the SMOTE-sized training set, so the linear SGD model used by the earlier stack replaces it.
estimators = [
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('xgb', XGBClassifier(random_state=SEED)),
    ('sgd', SGDClassifier(loss='log_loss', random_state=SEED)),
]

# Create the stacking classifier
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit the stacking model
stacking_clf.fit(X_train_top, y_train_resampled)
y_pred_stack = stacking_clf.predict(X_test_top)

# Evaluate the stacking model
print_classification_metrics("Stacking Classifier", y_test, y_pred_stack)

# Model Interpretability with SHAP
# shap.Explainer needs a callable for models it has no dedicated explainer for, so the
# stack's predict_proba is passed instead of the estimator object. Small random samples
# keep the model-agnostic explanation tractable.
rng = np.random.default_rng(SEED)
background = X_train_top[rng.choice(len(X_train_top), size=min(N_SHAP_BACKGROUND, len(X_train_top)), replace=False)]
X_explain = X_test_top[rng.choice(len(X_test_top), size=min(N_SHAP_EXPLAIN, len(X_test_top)), replace=False)]

shap_explainer = shap.Explainer(stacking_clf.predict_proba, background)
shap_values = shap_explainer(X_explain)
# predict_proba yields one output per class; index 1 selects the fraud class.
shap.summary_plot(shap_values.values[..., 1], X_explain, feature_names=list(top_features))

# Model Interpretability with LIME
lime_explainer = LimeTabularExplainer(
    X_train_top,
    feature_names=list(top_features),
    class_names=['Not Fraud', 'Fraud'],
    mode='classification',
    random_state=SEED,
)
idx = 0  # Example index to explain
exp = lime_explainer.explain_instance(X_test_top[idx], stacking_clf.predict_proba)
exp.show_in_notebook()

# Hyperparameter Tuning with Optuna
# Tuning is scored on a validation part carved out of the training data. Scoring trials on
# the test set would select hyperparameters that fit the test set and inflate its metrics.
# SMOTE is applied to the fitting part only, so the validation rows stay real observations.
X_tune_fit, X_tune_val, y_tune_fit, y_tune_val = train_test_split(
    X_train[:, top_idx], y_train, test_size=0.2, random_state=SEED, stratify=y_train
)
X_tune_fit, y_tune_fit = SMOTE(random_state=SEED).fit_resample(X_tune_fit, y_tune_fit)


def objective(trial):
    """Optuna objective: return 1 - F1 of an XGBoost model on the validation rows.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``learning_rate`` and
            ``n_estimators``.

    Returns:
        ``1 - f1_score`` on the validation split (lower is better).
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'objective': 'binary:logistic',
        'random_state': SEED,
    }
    model = XGBClassifier(**param)
    model.fit(X_tune_fit, y_tune_fit)
    y_pred = model.predict(X_tune_val)
    return 1 - f1_score(y_tune_val, y_pred)


# A seeded sampler makes the search reproducible.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=50)

print(f"Best parameters: {study.best_params}")

# Additional ensemble techniques: Voting Classifier (soft voting averages class probabilities)
voting_clf = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('xgb', XGBClassifier(random_state=SEED))
], voting='soft')

voting_clf.fit(X_train_top, y_train_resampled)
y_pred_voting = voting_clf.predict(X_test_top)

print_classification_metrics("Voting Classifier", y_test, y_pred_voting)


In [None]:
# from sklearn.svm import LinearSVC
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# # Linear SVM Classifier with balanced classes
# linear_svm_clf = LinearSVC(class_weight='balanced', max_iter=10000)
# linear_svm_clf.fit(X_train_resampled, y_train_resampled)
# y_pred_linear_svm = linear_svm_clf.predict(X_test)

# print("Linear SVM Classifier")
# print(f"Accuracy: {accuracy_score(y_test, y_pred_linear_svm)}")
# print(f"Precision: {precision_score(y_test, y_pred_linear_svm)}")
# print(f"Recall: {recall_score(y_test, y_pred_linear_svm)}")
# print(f"F1 Score: {f1_score(y_test, y_pred_linear_svm)}")


In [None]:
# # Support Vector Machine Classifier with balanced classes
# svm_clf = SVC(class_weight='balanced')
# svm_clf.fit(X_train_resampled, y_train_resampled)
# y_pred_svm = svm_clf.predict(X_test)

# print("SVM Classifier")
# print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
# print(f"Precision: {precision_score(y_test, y_pred_svm)}")
# print(f"Recall: {recall_score(y_test, y_pred_svm)}")
# print(f"F1 Score: {f1_score(y_test, y_pred_svm)}")