In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_style("whitegrid")

In [3]:
# Location of the credit-card CSV, relative to the notebook's working directory.
csv_path = "../input/creditcardfraud/creditcard.csv"
data = pd.read_csv(csv_path)

# Preview the first rows.
data.head()

**Exploratory Data Analysis**

In [4]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

In [5]:
# Summary statistics of the numeric columns, using pandas' default float display.
data.describe()

In [6]:
# Display floats with three decimals in all DataFrame output that follows.
# The full option name is spelled out: pandas accepts abbreviated names only
# as a convenience and warns that they can break if a similarly named option
# is added in a later release.
pd.set_option("display.float_format", "{:.3f}".format)
# Same summary statistics again, now rendered with the float format set above.
data.describe()

In [7]:
# Column labels of the loaded frame.
data.columns

**V1 to V28 are features obtained through a principal component analysis (PCA) transformation. The only columns that are not transformed are Time, Amount and Class (1: Fraud, 0: Not Fraud).**

We will analyse the data to examine:

1. The amount of money involved in each class of transaction.
2. The time frames during which fraudulent transactions occur.

In [8]:
# Count the transactions in each class. The counts are ordered by class label
# rather than by frequency so that bar 0 is always class 0 and bar 1 is always
# class 1; the positional tick labels below therefore cannot be swapped.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
count_classes = data['Class'].value_counts().sort_index()
Labels = ["Non-Fraud", "Fraud"]

ax = count_classes.plot(kind='bar', rot=0)
ax.set_title("Transaction Class Distribution")
ax.set_xticks(range(len(Labels)))
ax.set_xticklabels(Labels)
ax.set_xlabel("Class")
ax.set_ylabel("Count");

In [9]:
# Exact number of rows per class label.
data["Class"].value_counts()

Finding the number of fraud and valid transactions 

In [10]:
# Split the frame by label: rows with Class == 1 and rows with Class == 0.
fraud = data.loc[data["Class"].eq(1)]
normal = data.loc[data["Class"].eq(0)]

Amount used for different transactions 

In [11]:
# Summary statistics of Amount for the Class == 1 rows.
fraud["Amount"].describe()

In [12]:
# Summary statistics of Amount for the Class == 0 rows.
normal["Amount"].describe()

Time frame during which transactions occur 

In [13]:
# Summary statistics of Time for the Class == 1 rows.
fraud["Time"].describe()

In [14]:
# Summary statistics of Time for the Class == 0 rows.
normal["Time"].describe()

In [15]:
# Distributions of the two untransformed features, side by side.
# sns.distplot is deprecated, so each panel uses sns.histplot with a
# density-normalised histogram plus a KDE overlay. bins=50 keeps the bin count
# bounded, as distplot's automatic rule did, which matters for the long-tailed
# Amount column.
fig, (time_ax, amount_ax) = plt.subplots(1, 2, figsize=(16, 6))

sns.histplot(data['Time'], bins=50, kde=True, stat="density", color='blue', ax=time_ax)
time_ax.set_title('Time Distribution in secs')

# plot the amount feature
sns.histplot(data['Amount'], bins=50, kde=True, stat="density", color='red', ax=amount_ax)
amount_ax.set_title('Distribution of Amount');

Plot of transactions according to time

In [16]:

# Histogram of transaction time for each class, one panel per class.
# Each panel gets a title and axis labels so it can be read on its own, the
# classes use different colours, and the trailing ';' suppresses the Legend repr.
fig, (fraud_ax, normal_ax) = plt.subplots(1, 2, figsize=(16, 6))

data[data.Class == 1].Time.hist(
    bins=45, color='red', alpha=0.6, label="Fraud Transaction", ax=fraud_ax
)
fraud_ax.set_title("Fraud transactions over time")
fraud_ax.set_xlabel("Time (secs)")
fraud_ax.set_ylabel("Number of transactions")
fraud_ax.legend()

data[data.Class == 0].Time.hist(
    bins=45, color='blue', alpha=0.6, label="Non Fraud Transaction", ax=normal_ax
)
normal_ax.set_title("Non-fraud transactions over time")
normal_ax.set_xlabel("Time (secs)")
normal_ax.set_ylabel("Number of transactions")
normal_ax.legend();

Heatmap to find correlations

In [17]:
# Pairwise correlations between all columns, drawn as a heatmap.
correlations = data.corr()

plt.figure(figsize=(12, 12))
sns.heatmap(correlations, cmap="YlGnBu")
plt.show()

The correlation matrix shows that none of the components V1 to V28 is correlated with any other. This is expected, because principal components are uncorrelated by construction.

Train/test split and feature scaling (the scaler standardises every feature column, including Time and Amount)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

# Features are every column except the label; the label is the Class column.
X = data.drop('Class', axis=1)
y = data.Class

# Fraud is a very small minority, so the split is stratified on the label:
# both partitions keep the same fraud ratio instead of leaving it to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training partition only and reuse its statistics for
# the test partition, so no test information leaks into preprocessing.
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

# Share of each class among the training labels, computed once.
# NOTE: w_p is the share of class 0 (non-fraud) and w_n the share of class 1
# (fraud); the names are kept as they are in case later cells use them.
class_share = y_train.value_counts(normalize=True)
w_p = class_share[0]
w_n = class_share[1]

print(f"Fraudulant transaction weight: {w_n}")
print(f"Non-Fraudulant transaction weight: {w_p}")

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

def print_score(label, prediction, train=True):
    """Print accuracy, the classification report and the confusion matrix.

    Parameters
    ----------
    label : array-like
        Ground-truth class labels.
    prediction : array-like
        Predicted class labels, aligned with ``label``.
    train : bool, default True
        Selects the heading only: a truthy value prints "Train Result:",
        anything else prints "Test Result:".

    Returns
    -------
    None
        Everything is written to standard output.
    """
    clf_report = pd.DataFrame(classification_report(label, prediction, output_dict=True))

    print("Train Result:\n" if train else "Test Result:\n")
    print(f"Accuracy Score: {accuracy_score(label, prediction) * 100:.2f}%")
    print(f"Classification Report:\n{clf_report}")
    # Always compare against the labels that were passed in. The train branch
    # used to read the global y_train here, which gave a wrong matrix (or an
    # error) whenever the function was called with any other label set.
    print(f"Confusion Matrix: \n {confusion_matrix(label, prediction)}\n")

**USING RANDOM FOREST CLASSIFIER**

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Registry of F1 scores keyed by model name. No earlier cell creates it, so it
# is initialised here, before the first model is recorded; without this the
# assignment below raises NameError on a fresh kernel.
scores_dict = {}

# random_state fixes the forest's random sampling so the reported metrics are
# reproducible across runs; n_jobs=-1 only parallelises the work.
rf_clf = RandomForestClassifier(
    n_estimators=100, oob_score=False, random_state=42, n_jobs=-1
)
rf_clf.fit(X_train, y_train)

y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

scores_dict['Random Forest'] = {
    'Train': f1_score(y_train, y_train_pred),
    'Test': f1_score(y_test, y_test_pred),
}