In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [4]:
# Read the transactions CSV into `df` and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="../input/fraudulent-transactions-prediction/Fraud.csv"
)
df.head(n=5)


In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [30]:
# Side-by-side count plots of the isFraud and isFlaggedFraud columns.

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 6))

for ax, column in zip(axes, ('isFraud', 'isFlaggedFraud')):
    sns.countplot(data=df, x=column, ax=ax)
    # Write each bar's count on top of it.
    ax.bar_label(ax.containers[0])

fig.suptitle('Count plot of isFraud and isFlaggedFraud')
plt.show()

In [7]:
# Number of rows per `type`, split by the isFraud label.

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='type', hue='isFraud', ax=ax)
# Label the bars of every container drawn on the axes.
for container in ax.containers:
    ax.bar_label(container)
ax.set_title('Count of type feature')
plt.show()

In [9]:
# Correlation matrix heat map.
#
# Only numeric/boolean columns are handed to `corr()`. At this point in the
# notebook `df` has not been label-encoded yet (that happens in a later cell),
# so it presumably still contains non-numeric columns such as `type`.
# pandas >= 2.0 raises on such columns instead of silently skipping them.

numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(8,6))
sns.heatmap(data=numeric_df.corr(),annot=True)
plt.title('Heatmap of correlation matrix')
plt.show()

In [11]:
# Replace the `type` column with integer codes produced by a LabelEncoder.
# NOTE(review): integer codes impose an arbitrary order on the categories, which
# a linear model will treat as numeric — consider one-hot encoding; confirm intent.

from sklearn.preprocessing import LabelEncoder

# Learn the category -> code mapping, then apply it to the same column.
label_encoder = LabelEncoder().fit(df["type"])
df["type"] = label_encoder.transform(df["type"])
df.head()



In [20]:
# Drop the nameOrig and nameDest columns; they are not used as features.
# Rebinding `df` with `errors="ignore"` (instead of an in-place drop) keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.

df = df.drop(columns=["nameOrig", "nameDest"], errors="ignore")

In [31]:
# from the above plot we can make out that the data is imbalanced as there are more number of non fraudlent transactions.
# since it is a classification problem we use SMOTE over sampling technique.

from imblearn.over_sampling import SMOTE

x , y = df.loc[:, df.columns != 'isFraud'], df['isFraud']

sm = SMOTE(random_state=42,k_neighbors=7)

x_os, y_os = sm.fit_resample(x, y)


In [32]:
# splitting the data into train and test

from sklearn.model_selection import train_test_split

x_train, x_test ,y_train , y_test = train_test_split(x_os,y_os,test_size=0.3,random_state=7)

In [33]:
# Standardise the features using statistics learned from the training split only.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
# Reuse the training mean/std for the test split. Re-fitting on the test data
# would put the two splits on different scales and leak test statistics into
# the evaluation.
x_test = sc.transform(x_test)


In [34]:
# First model: logistic regression with default settings.

from sklearn.linear_model import LogisticRegression

# `fit` returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(x_train, y_train)
pred = lr.predict(x_test)

In [35]:
from sklearn import metrics

# Accuracy alone can look good on fraud data even when the fraud class is
# poorly detected, so the confusion matrix and per-class precision/recall/F1
# are reported as well.
print ("Accuracy : ", metrics.accuracy_score(y_test, pred))
print("Confusion matrix :\n", metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred, digits=4))
