# **Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib import style
import datetime
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Use seaborn's "pastel" palette as the default for every chart in this notebook.
# (sns.color_palette only *returns* a palette; set_palette is the call that applies it.)
sns.set_palette("pastel")

# **Load dataset**

In [None]:
# Load the raw transaction records (comma-separated, which is read_csv's default delimiter).
df = pd.read_csv("/kaggle/input/credit-card-fraud-prediction/fraud test.csv")

In [None]:
# Preview five random rows; the fixed seed keeps the output identical across Restart & Run All.
df.sample(5, random_state=42)

# **Overview of dataset**

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of rows for each value of the is_fraud label.
df['is_fraud'].value_counts()

In [None]:
# Number of rows for each distinct job value.
df['job'].value_counts()

In [None]:
# Number of rows for each distinct merchant value.
df['merchant'].value_counts()

In [None]:
# Count of distinct values in every column.
df.nunique()

# **Data Cleaning**

In [None]:
# Count the rows that exactly repeat an earlier row
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# The CSV's unnamed first column is the row identifier; give it a proper name
df = df.rename(columns={"Unnamed: 0": "id"})

In [None]:
# Remove the literal "fraud_" marker from merchant names
merchant_names = df['merchant']
df['merchant'] = merchant_names.str.replace("fraud_", "", regex=False)

In [None]:
# Break the combined timestamp text into its date part and its time part (split on the space)
date_time_parts = df['trans_date_trans_time'].str.split(' ', expand=True)
df[['trans_date', 'trans_time']] = date_time_parts
df.head()

In [None]:
# Convert the text columns to datetime values and derive the time-based features
# NOTE(review): no format/dayfirst is given for the full timestamp or the date, so pandas
# infers the layout — confirm day and month are not swapped for this file.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date'] = pd.to_datetime(df['trans_date'])
# Parse the clock text as hour:minute and keep only the time-of-day component.
# NOTE(review): '%H:%M' assumes the source has no seconds — verify against the raw data.
df['trans_time'] = pd.to_datetime(df['trans_time'], format= '%H:%M').dt.time
# Hour of the transaction, taken from the full timestamp.
df['trans_time_group'] = df['trans_date_trans_time'].dt.hour
# Monthly period of the transaction, stored as a string.
df['trans_month'] = df['trans_date'].dt.to_period('M').astype("str")
# Weekday name of the transaction date.
df['trans_dayOfWeek'] = df['trans_date'].dt.day_name()


In [None]:
# Age is approximated as 2020 minus the year of birth (month and day are ignored)
birth_dates = pd.to_datetime(df['dob'])
df['dob'] = birth_dates
df['age'] = 2020 - birth_dates.dt.year

In [None]:
# Distinct age values present in the data.
df['age'].unique()

In [None]:
# Smallest and largest computed age.
print("Min age", df['age'].min())
print("Max age", df['age'].max())

In [None]:
def apply_age_group(age):
    """Map a numeric age to its age-group label.

    Upper bounds are inclusive: up to 18 -> 'Teenager', up to 25 -> 'Young Adult',
    up to 64 -> 'Adult'; anything that matches none of these bounds -> 'Elder'.
    """
    inclusive_upper_bounds = (
        (18, 'Teenager'),
        (25, "Young Adult"),
        (64, "Adult"),
    )
    # Bounds are checked in ascending order, so the first match is the narrowest group.
    for upper_bound, label in inclusive_upper_bounds:
        if age <= upper_bound:
            return label
    return "Elder"

In [None]:
# Label every row with the age group of its cardholder
df['age_group'] = df['age'].map(apply_age_group)

In [None]:
# Complement of is_fraud: 1 where is_fraud equals 0, otherwise 0
df['is_not_fraud'] = (df['is_fraud'] == 0).astype('int64')

In [None]:
# Re-inspect column dtypes and non-null counts now that the derived columns exist.
df.info()

In [None]:
# Remove the columns that the rest of the notebook does not use
unused_columns = [
    'street', 'zip', 'city_pop', 'trans_num', 'unix_time', 'merch_lat',
    'merch_long', 'first', 'last', 'dob', 'lat', 'long',
]
df = df.drop(columns=unused_columns)

In [None]:
# Preview five random cleaned rows; the fixed seed keeps the output identical across Restart & Run All.
df.sample(5, random_state=42)

# **Analysis**

In [None]:
def annotate_bar(ax, custom_y_func, font_size = 14):
    """Label every bar on `ax` with its height, rounded to one decimal place.

    ax: object exposing `patches` (the bars) and `annotate`.
    custom_y_func: callable that receives the default label height (the bar's
        vertical midpoint scaled by 0.99) and returns the height to draw at.
    font_size: text size of the labels.
    """
    label_style = dict(color="black", size=font_size, ha='center', va='center')
    for bar in ax.patches:
        bar_height = bar.get_height()
        # Default anchor: the centre of the bar, scaled by 0.99
        centre_x = (bar.get_x() + bar.get_width() / 2) * 0.99
        centre_y = (bar.get_y() + bar_height / 2) * 0.99
        ax.annotate(
            str(round(bar_height, 1)),
            (centre_x, custom_y_func(centre_y)),
            **label_style
        )

In [None]:
# Two side-by-side bar charts: number of transactions (left) and total amount (right),
# each split into fraud / not fraud.
fig, (ax_count, ax_amount) = plt.subplots(1, 2, figsize=(30, 5))

# Name both columns explicitly: what value_counts().reset_index() calls them
# differs between pandas 1.x and 2.x.
df_fraud_count = (
    df['is_fraud']
    .map(lambda x: "Fraud" if x == 1 else 'Not Fraud')
    .value_counts()
    .rename_axis('is_fraud')
    .reset_index(name='count')
)
ax = sns.barplot(data=df_fraud_count, x='is_fraud', y='count', color='#c6def8', ax=ax_count)

# The fraud bar is too short to hold its own label, so any label that would sit
# below 10000 is drawn at a fixed height of 15000 instead.
annotate_bar(ax, lambda y: 15000 if y < 10000 else y, font_size=14)
ax.set_title('Total number of transaction for fraud and not fraud transaction', fontsize=12, fontweight='bold')
ax.set_ylabel("Transaction count")
ax.tick_params(axis='x', rotation=0)

df_fraud_amount = df.groupby('is_fraud')['amt'].sum().reset_index()

ax = sns.barplot(data=df_fraud_amount, x='is_fraud', y='amt', color='#c6def8', ax=ax_amount)
# Same trick as above for the much smaller fraud total.
annotate_bar(ax, lambda y: 1900000 if y < 1200000 else y, font_size=12)

ax.set_title('Total transaction amount for fraud and not fraud transaction', fontsize=12, fontweight='bold')
ax.set_ylabel("Transaction amount")
# groupby sorts the label as 0, 1, so the bars are Not Fraud then Fraud;
# pin the tick positions before replacing their text.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Fraud', 'Fraud'], rotation=0)
plt.show()

There are 555,719 transactions in total. Of these, 553,574 (99.61%) are valid transactions; the remaining 0.39% are fraud cases.

In [None]:
# Mean transaction amount per class (not fraud vs fraud), shown as one bar chart
avg_amt = df.groupby('is_fraud')['amt'].mean().reset_index()

fig, ax = plt.subplots(figsize=(30, 5))
sns.barplot(data=avg_amt, x='is_fraud', y='amt', color='#c6def8', ax=ax)
# Labels stay at the default position (identity function)
annotate_bar(ax, lambda y: y, font_size=12)
ax.set_title('Average transaction amount for fraud and not fraud transaction', fontsize=12, fontweight='bold')
ax.set_ylabel("Transaction amount")
ax.set_xticklabels(['Not Fraud','Fraud'], rotation=0)
plt.show()

In [None]:
# Plot how transactions are distributed over month, day of week, gender, category,
# age and age group. Each figure shows non-fraud (left) next to fraud (right).
columns = ['trans_month','trans_dayOfWeek','gender','category','age','age_group']
columns_name = ['month','day of week','gender','category','age','age group']
name = ['Not Fraud','Fraud']
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# presumably gender is coded 'F'/'M' — any other code is shown unchanged
gender_labels = {'F': 'Female', 'M': 'Male'}

df['fraud'] = df['is_fraud'].apply(lambda x: "Fraud" if x == 1 else 'Not Fraud')

for col, col_label in zip(columns, columns_name):
    fig = plt.figure(figsize=(30, 5))
    plt.suptitle("Distribution of transaction by " + col_label, fontsize=20, fontweight='bold')

    for i in range(0, 2):
        plt.subplot(1, 2, 1 + i)
        df_1 = df[df['is_fraud'] == i]
        if col == 'trans_month':
            # Bars show the number of transactions, the line shows the total amount
            # (same convention as the day-of-week chart below).
            ax = df_1.groupby(col).size().plot(kind='bar', label='Count')
            ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
            ax.set_ylabel('Count')

            twin = plt.twinx()
            ax1 = df_1.groupby(col)['amt'].sum().plot(kind='line', color='orange', label='Amount', ax=twin)
            ax1.set_xticklabels(ax.get_xticklabels(), rotation=0)
            ax1.set_ylabel('Amount')

            # Merge the handles of both y-axes into a single legend.
            chart, labels = ax.get_legend_handles_labels()
            chart1, labels1 = ax1.get_legend_handles_labels()
            ax1.legend(chart + chart1, labels + labels1, loc=0)
        elif col == 'gender':
            gender_counts = df_1[col].value_counts()
            # Take slice labels from the index: value_counts orders by frequency,
            # so a fixed ['Female', 'Male'] list can label the slices the wrong way round.
            ax = plt.pie(
                gender_counts,
                labels=[gender_labels.get(code, code) for code in gender_counts.index],
                autopct='%1.1f%%',
            )
        elif col == 'age_group':
            age_group_counts = df_1[col].value_counts()
            ax = plt.pie(age_group_counts, labels=age_group_counts.index, autopct='%1.1f%%')
        elif col == 'trans_dayOfWeek':
            ax = sns.barplot(
                data=df_1.groupby(col).size().reset_index(name='count'),
                x=col, y='count', label='Count', color='#a1c9f4', order=cats,
            )
            ax.set_ylabel('Count')

            plt.twinx()

            ax1 = sns.lineplot(
                data=df_1.groupby(col)['amt'].sum().reindex(cats).reset_index(),
                x=col, y='amt', label='Amount', color='orange',
            )
            ax1.set_ylabel('Amount')

            chart, labels = ax.get_legend_handles_labels()
            chart1, labels1 = ax1.get_legend_handles_labels()
            ax1.legend(chart + chart1, labels + labels1, loc=0)
        elif col == 'category':
            # Horizontal bars: categories on the y-axis, counts on the x-axis.
            ax = sns.countplot(data=df_1, y=col, order=df_1[col].value_counts().index)
        else:
            ax = sns.histplot(data=df_1, x=col)

        plt.title(name[i])
        if col == 'category':
            # The x-axis of the horizontal chart holds counts, not category names.
            plt.xlabel('Count')
            plt.ylabel(col_label)
        else:
            plt.xlabel(col_label)
    plt.show()

**By months**
* For 'Not Fraud' cases, 06/2020 is the month with the lowest number of transactions. In the following months, the number of transactions increased and then remained stable, reaching its highest level in December.
    * For 12/2020, the difference is significant compared with the previous month. That is understandable because December has an important holiday, Christmas. Moreover, consumer demand often increases at the end of the year.
* For 'Fraud' cases, July is the lowest month; the number then rises, reaching its peak in August. After the peak, the number of fraud transactions gradually decreases over the following months.
    * From August to October, the number of fraud transactions is high.
    
**Day of week**
* The two charts share a similar pattern: Sunday, Monday and Tuesday have the highest number of transactions for both fraud and non-fraud. This suggests we should pay more attention to transactions that happen on these days, as they are more likely to be fraud.

**By gender**
* In both types of transactions, women make more transactions than men, although the gap is not large.

**By category**
* For 'Not Fraud' cases, the top 3 categories are gas_transport, grocery_pos and home, with gas_transport the highest.
* For 'Fraud' cases, the top 3 categories are grocery_pos, shopping_net and misc_net.
* grocery_pos is a category we should keep an eye on, since it appears in the top 3 of both.

**By age**
* For both Fraud and Not Fraud, the distribution is quite similar: Adults (26-64) make the most transactions, while Teenagers make the fewest.
* This is quite predictable, as teenagers don't have a stable income yet and mainly depend on family support.
* Adults are still in their prime working age.


In [None]:
name = ['Not Fraud','Fraud']

# One line chart per class: number of transactions in each hour of the day
fig = plt.figure(figsize=(30, 5))
plt.suptitle("Distribution of transaction by time",fontsize=20,fontweight='bold')
hour_ticks = df['trans_time_group'].unique()
for i, panel_title in enumerate(name):
    plt.subplot(1, 2, i + 1)
    class_rows = df[df['is_fraud'] == i]
    # size() leaves the counts in a column labelled 0 after reset_index()
    temp = (
        class_rows
        .groupby('trans_time_group')
        .size()
        .reset_index()
        .sort_values(by='trans_time_group')
    )
    ax = sns.lineplot(data=temp, x="trans_time_group", y=0)
    plt.title(panel_title)
    plt.xlabel('Time')
    ax.set(xticks=hour_ticks)
plt.show()
    

From 0h to 11h, the number of transactions stays at a stable level. \
From 11h to 24h is the period when people actively make transactions, hence the significant jump compared with the previous period. \
For fraud transactions, the majority of cases happen late at night (22-24h) or in the early hours of the day (0-4h), which suggests that people with bad intentions are more likely to act at times with little human monitoring.

In [None]:
# Top 10 values by transaction count for job, state, city and merchant,
# with non-fraud on the left and fraud on the right.
columns = ['job','state','city','merchant']
columns_name = ['job','state','city','merchant']
fraud = ['Not Fraud','Fraud']

sns.set_palette("pastel")
for col, col_label in zip(columns, columns_name):
    fig = plt.figure(figsize=(30, 5))
    plt.suptitle("Top 10 transaction by " + col_label, fontsize=20, fontweight="bold")
    for i, fraud_label in enumerate(fraud):
        temp_df = df[df['is_fraud'] == i]
        top = temp_df.groupby(col).size().nlargest(10)
        plt.subplot(1, 2, 1 + i)
        ax = sns.barplot(data=top.reset_index(name='count'), x=col, y='count', color='#a1c9f4')
        # State codes are short enough to read horizontally; longer names are turned vertical.
        ax.tick_params(axis='x', rotation=0 if col == 'state' else 90)
        plt.ylabel('Count')
        plt.xlabel(col_label)
        plt.title(fraud_label)
    plt.show()

In [None]:
# Top 10 values by total transaction amount for job, state, city and merchant,
# with non-fraud on the left and fraud on the right.
columns = ['job','state','city','merchant']
columns_name = ['job','state','city','merchant']
fraud = ['Not Fraud','Fraud']

for col, col_label in zip(columns, columns_name):
    fig = plt.figure(figsize=(30, 5))
    plt.suptitle("Top 10 transaction amount by " + col_label, fontsize=20, fontweight="bold")

    for i, fraud_label in enumerate(fraud):
        temp_df = df[df['is_fraud'] == i]
        top = temp_df.groupby(col)['amt'].sum().nlargest(10)
        plt.subplot(1, 2, 1 + i)
        ax = sns.barplot(data=top.reset_index(), x=col, y='amt', color='#a1c9f4')
        # State codes are short enough to read horizontally; longer names are turned vertical.
        ax.tick_params(axis='x', rotation=0 if col == 'state' else 90)
        plt.ylabel('Amount')
        plt.xlabel(col_label)
        plt.title(fraud_label)
    plt.show()

Comparing the 'Top 10 transaction' and 'Top 10 transaction amount' charts, we can see that they share a similar pattern: groups with a high number of transactions also have a high total amount.

**Analysis on potential suspicious transaction**

In [None]:
# Fraud vs non-fraud row counts among cardholders aged 80 or older.
df[df['age'] >= 80]['is_fraud'].value_counts()

In [None]:
# Non-fraud transactions by cardholders aged 80 or older, latest time of day first
is_over_80 = df['age'] >= 80
is_not_flagged = df['is_fraud'] == 0
df_over80 = df[is_over_80 & is_not_flagged].sort_values(by="trans_time", ascending=False)
df_over80

In [None]:
# Number of rows per spending category in the 80-and-over, non-fraud subset.
df_over80['category'].value_counts()

In [None]:
# Count table with hour of day as rows and category as columns; missing combinations become 0.
# NOTE(review): no `values=` is passed, so every remaining column is counted, and the result
# is not displayed or used in the cells visible here — confirm it is still needed.
df_over80_category = df_over80.pivot_table(index="trans_time_group",columns="category",aggfunc='count').fillna(0)

In [None]:
# Number of rows per hour of day in the same subset.
df_over80['trans_time_group'].value_counts()

In [None]:
# Hourly transaction counts for the 80-and-over, non-fraud subset
temp = (
    df_over80
    .groupby('trans_time_group')
    .size()
    .reset_index()
    .sort_values(by='trans_time_group')
)

fig = plt.figure(figsize=(30, 5))
plt.suptitle("Distribution of transaction by time",fontsize=20,fontweight='bold')
# size() leaves the counts in a column labelled 0 after reset_index()
ax = sns.lineplot(data=temp, x="trans_time_group", y=0)
hours_present = temp['trans_time_group'].unique()
ax.set(xticks=hours_present)
plt.xlabel('Time')
plt.show()

It's quite unusual for people over 80 to make transactions at around 0-2h or 22-23h. We should consider taking a deeper look at these.

# **Correlation**

In [None]:
import scipy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Build the modelling frame as a new object and leave df untouched, so this cell
# can be re-run without a KeyError from dropping columns that are already gone.
df_train = df.drop(columns=['trans_date','trans_time','age_group','fraud'])

# Integer-encode every text column. Each column gets its own encoder, kept in
# label_encoders so the codes can later be mapped back to the original values.
label_encoders = {}
for col_name in df_train.columns:
    if df_train[col_name].dtype == 'object':
        le = LabelEncoder()
        df_train[col_name] = le.fit_transform(df_train[col_name])
        label_encoders[col_name] = le

In [None]:
# Pairwise correlation of the encoded columns, drawn as an annotated heatmap
correlation_matrix = df_train.corr()
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(correlation_matrix, annot=True, ax=ax)
plt.show()

# Training model

In [None]:
# Select the features (X) and the target (y)
model_features = ["gender","job",'category',"merchant", "state","age"]
target = ["is_fraud"]

X = df_train[model_features]
# Use a 1-D Series for the target: the estimators expect a 1-D y, not a one-column frame.
y = df_train[target[0]]

# Hold out 30% for testing. Stratify so that the rare fraud class keeps the same
# share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Non-fraud rows vastly outnumber fraud rows, so balance the TRAINING part only
# with SMOTE; the test part keeps its real class distribution.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_train = X_train_resampled
y_train = y_train_resampled

# Standardisation: fit the scaler on the training data, then apply that same
# transformation to the test data so both are on the scale the models are trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Model training: fit four classifiers on the same training data.
# The tree-based models are seeded so that results repeat from run to run.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5,metric='euclidean')
rfc = RandomForestClassifier(random_state=42)

for model in (lr, dt, knn, rfc):
    model.fit(X_train, y_train)


In [None]:
# Predict on the training and the test data with each fitted model
fitted_models = {"lr": lr, "dt": dt, "knn": knn, "rfc": rfc}
train_predictions = {key: model.predict(X_train) for key, model in fitted_models.items()}
test_predictions = {key: model.predict(X_test) for key, model in fitted_models.items()}

y_train_pred_lr, y_test_pred_lr = train_predictions["lr"], test_predictions["lr"]
y_train_pred_dt, y_test_pred_dt = train_predictions["dt"], test_predictions["dt"]
y_train_pred_knn, y_test_pred_knn = train_predictions["knn"], test_predictions["knn"]
y_train_pred_rfc, y_test_pred_rfc = train_predictions["rfc"], test_predictions["rfc"]

In [None]:
# Test-set accuracy of each model, each stored under its own name so that
# accuracy_logistic really holds the logistic-regression score.
accuracy_logistic = accuracy_score(y_test, y_test_pred_lr)
accuracy_decision_tree = accuracy_score(y_test, y_test_pred_dt)
accuracy_knn = accuracy_score(y_test, y_test_pred_knn)
accuracy_random_forest = accuracy_score(y_test, y_test_pred_rfc)

print("Accuracy of Logistic Regression:", accuracy_logistic)
print("Accuracy of DecisionTreeClassifier:", accuracy_decision_tree)
print("Accuracy of KNeighborsClassifier:", accuracy_knn)
print("Accuracy of RandomForestClassifier:", accuracy_random_forest)

In [None]:
# Precision / recall / F1 of every model on the training data
train_set_predictions = [
    ("Logistic Regression", y_train_pred_lr),
    ("DecisionTreeClassifier", y_train_pred_dt),
    ("KNeighborsClassifier", y_train_pred_knn),
    ("RandomForestClassifier", y_train_pred_rfc),
]
for model_title, predicted in train_set_predictions:
    print(model_title)
    print(classification_report(y_train, predicted))

In [None]:
# Confusion matrix of every model on the test data, drawn on a 2x2 grid
# (rows = actual class, columns = predicted class).
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

class_labels = ["Not Fraud", "Fraud"]
test_set_predictions = [
    ("Logistic Regression", y_test_pred_lr),
    ("Decision Tree Classifier", y_test_pred_dt),
    ("K-Nearest Neighbors Classifier", y_test_pred_knn),
    ("RandomForest Classifier", y_test_pred_rfc),
]

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (model_title, predicted) in zip(axes.flat, test_set_predictions):
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the predictions.
    data = confusion_matrix(y_test, predicted, labels=[0, 1])
    df_cm = pd.DataFrame(data, columns=class_labels, index=class_labels)
    df_cm.index.name = "Actual"
    df_cm.columns.name = "Predicted"
    sns.heatmap(df_cm, cmap="YlGnBu", annot=True, fmt="g", ax=ax)
    ax.set_title(model_title)

plt.show()