# Veri Kümesi Hakkında
Veri kümesi, Avrupalı kart sahipleri tarafından Eylül 2013'te kredi kartlarıyla yapılan işlemleri içerir. Bu veri kümesi, iki gün içinde gerçekleşen ve 284.807 işlemden 492'sinin dolandırıcılık olduğu işlemleri sunar.

Veri kümesi oldukça dengesizdir, pozitif sınıf (dolandırıcılık) tüm işlemlerin %0,172'sini oluşturur.

Yalnızca bir PCA dönüşümünün sonucu olan sayısal girdi değişkenlerini içerir.

Ne yazık ki, gizlilik sorunları nedeniyle orijinal özellikleri ve verilerle ilgili daha fazla arka plan bilgisi sağlayamıyoruz. 

Özellikler V1, V2, ... V28, PCA ile elde edilen temel bileşenlerdir, PCA ile dönüştürülmeyen tek özellikler 'Time' ve 'Amount'dır. 

**'Time'** özelliği, her işlem ile veri kümesindeki ilk işlem arasında geçen saniyeleri içerir.

**'Amount'** özelliği işlem tutarıdır; bu özellik örneğin maliyete duyarlı öğrenme için kullanılabilir.

**'Class'** özelliği yanıt değişkenidir ve dolandırıcılık durumunda 1, aksi halde 0 değerini alır.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global random generator so the np.random calls made later in
# this notebook are repeatable from run to run.
np.random.seed(2)

In [None]:
# Base location of the course notebooks. The hosted GitHub URL is the active
# setting; the commented-out alternative below is presumably a mounted Google
# Drive path for running from Colab.
# ROOT_DIR = "/content/drive/MyDrive/CASGEM-Egitim/Egitim-Part2/Day11-DeepLearning/notebooks/"
ROOT_DIR = "https://media.githubusercontent.com/media/yapay-ogrenme/casgem-eu-project-training-on-data-mining/main/PART2/Day11-DeepLearning/notebooks/"

# Folder under ROOT_DIR that holds the CSV datasets (keeps the trailing slash
# so file names can be appended directly).
DATASET_PATH = "".join((ROOT_DIR, "datasets/"))

In [None]:
# Read the credit-card transactions CSV from the dataset folder.
csv_location = DATASET_PATH + 'creditcard.csv'
data = pd.read_csv(csv_location)
# Last expression of the cell: shows the loaded DataFrame.
data

## Data exploration

In [None]:
# Report the dataset dimensions. An f-string avoids the doubled space that the
# previous literal's trailing blank plus print's own separator produced.
n_rows, n_cols = data.shape
print(f"The DataSet has {n_rows} rows and {n_cols} Columns")

In [None]:
# Preview the first rows of the raw dataset.
data.head()

#### Checking For Null Values

In [None]:
# Count the missing values in each column.
data.isnull().sum()

#### Let's check how the data is dispersed!

In [None]:
# Summary statistics for every column, transposed so each feature is a row.
data.describe().T

In [None]:
# Split the transactions by label with boolean indexing so each frame holds
# only its own rows. (DataFrame.where keeps every row and fills the
# non-matching ones with NaN, which wastes memory and forces every later
# step to skip NaNs.)
fraud_transactions = data[data['Class'] == 1]
regular_transactions = data[data['Class'] == 0]

# Transaction amounts per class, used by the plots in the following cells.
regular_amount = regular_transactions['Amount']
fraud_amount = fraud_transactions['Amount']

# Cell output: summary statistics of the fraudulent amounts.
fraud_amount.describe()

### **Sahte işlemlerin %50'si 10$'dan az!**


---



In [None]:
# Histogram of the fraudulent transaction amounts.
fraud_amount.plot(kind='hist')

In [None]:
# Box plot of the 'Amount' column of the fraudulent transactions.
fraud_transactions.boxplot(column='Amount')

In [None]:
# Histogram of the regular (non-fraud) transaction amounts.
regular_amount.plot(kind='hist')

In [None]:
# Side-by-side distribution plots of transaction amount and transaction time.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

amount_val = data['Amount'].values
time_val = data['Time'].values

# ndarray.min()/max() are vectorised; the builtin min()/max() walk the whole
# array element by element in Python, which is needlessly slow at this size.
sns.distplot(amount_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
ax[0].set_xlim([amount_val.min(), amount_val.max()])

sns.distplot(time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])

In [None]:
# Collect the columns whose name contains 'V' (the PCA components).
V = data[[col for col in data.columns if 'V' in col]]

# Two plots per row. The number of rows is derived from the number of
# selected columns (ceiling division) instead of a hard-coded 14, so the grid
# and the figure height stay in sync if the feature set changes.
plots_per_row = 2
grid_rows = max(1, -(-len(V.columns) // plots_per_row))
f, ax = plt.subplots(ncols=plots_per_row, nrows=grid_rows,
                     figsize=(15, 2*len(V.columns)))

# zip stops at the shorter sequence, so a spare axis (odd column count) is
# simply left empty.
for axis, column in zip(ax.flatten(), V.columns):
    sns.distplot(V[column], ax=axis)

f.tight_layout()

## Çoğu Hileli ve Düzenli işlemler hangi saatte gerçekleşir?

In [None]:
def convert_totime(seconds):
    """Turn a seconds-since-epoch value into a naive ``datetime``.

    Delegates to ``datetime.datetime.fromtimestamp`` without a ``tz``
    argument, so the result is expressed in the machine's local time zone.

    Args:
        seconds: Number of seconds (int or float) to convert.

    Returns:
        The corresponding naive ``datetime.datetime``.
    """
    as_datetime = datetime.datetime.fromtimestamp(seconds)
    return as_datetime

# Work on a copy that holds only the columns needed for the hourly analysis.
timeAnalysis = data[['Time', 'Amount', 'Class']].copy()
timeAnalysis['datetime'] = timeAnalysis.Time.apply(convert_totime)

# As the max time is 172792 seconds and 172792 / (60*60) is about 48 hrs, we
# only have data for 2 days, so only plotting against hours makes sense.
#
# 'Time' is an offset in seconds from the first transaction, so the hour
# bucket is computed arithmetically. The previous approach converted to local
# time and then added `utcnow() - now()`; because that difference comes from
# two separate clock reads it is a few microseconds short of the real UTC
# offset, which pushed transactions lying exactly on an hour boundary into the
# previous hour, and it also depended on the machine's time zone / DST rules.
timeAnalysis['hour of the day'] = ((timeAnalysis['Time'] // 3600) % 24).astype(int)

# Number of transactions per (class, hour) pair.
timeAnalysisGrouped = timeAnalysis.groupby(['Class', 'hour of the day'])['Amount'].count()

In [None]:
# Bar chart of the hourly counts for class 0 (legitimate transactions).
plt.figure(figsize=(10, 6))
validTransactions = timeAnalysisGrouped[0].copy().rename('Number of transactions')
validTransactions.plot(
    kind='bar',
    title='No of legitimate credit card transactions per hour',
    legend=True,
)

In [None]:
# Bar chart of the hourly counts for class 1 (fraudulent transactions).
plt.figure(figsize=(10, 6))
fraudTransactions = timeAnalysisGrouped[1].copy().rename('Number of transactions')
fraudTransactions.plot(
    kind='bar',
    title='Number of fraud credit card transactions per hour',
    legend=True,
)

#### Saat 02.00 ve 11.00'de gerçekleştirilen dolandırıcılık işlemlerinin sayısında olağandışı bir artış var. Bu iki tepe noktası göz ardı edilirse, sahte işlemler normal işlemlere kıyasla gün içine daha eşit dağılmıştır.

#### Normal İşlemler, gündüz saatlerinde Yüksek İşlem Sayısı ile beklenen trendi gösteriyor.

# Model Building

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'Amount' into a new 'normalizedAmount' column, then drop the raw
# column so only the scaled version remains.
# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split, so test-set statistics influence the feature; consider
# fitting on the training rows only.
amount_scaler = StandardScaler()
amount_as_column = data['Amount'].values.reshape(-1, 1)
data['normalizedAmount'] = amount_scaler.fit_transform(amount_as_column)
data = data.drop(columns=['Amount'])

In [None]:
# Preview the frame after 'Amount' was replaced by 'normalizedAmount'.
data.head()

In [None]:
# Remove the 'Time' column so it is not part of the model features, then
# preview the result.
data = data.drop(columns=['Time'])
data.head()

In [None]:
# Features: every column except the label.
# Target: the 'Class' column, kept as a single-column DataFrame.
is_label_column = data.columns == 'Class'
X = data.loc[:, ~is_label_column]
y = data.loc[:, is_label_column]

In [None]:
# Preview the target frame.
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on the label
# because the fraud class is extremely rare: without stratification the share
# of fraud rows in the two parts is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

## Building a Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest. The single-column label frame is flattened to
# a 1-D array before being passed to fit().
random_forest = RandomForestClassifier(n_estimators=100)
train_labels = y_train.values.ravel()
random_forest.fit(X_train, train_labels)

In [None]:
# Predict the held-out rows (y_pred feeds the confusion matrix further down)
# and show the classifier's score on the same rows.
# NOTE(review): if this score is plain accuracy it says little on a dataset
# this imbalanced -- confirm and prefer the confusion matrix / F1.
y_pred = random_forest.predict(X_test)
random_forest.score(X_test,y_test)

In [None]:
import matplotlib.pyplot as plt
import itertools

from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: Square confusion matrix (array-like); rows are indexed as the
            true label and columns as the predicted label.
        classes: Tick labels, one per class, in matrix order.
        normalize: When True, each row is divided by its sum before printing
            and plotting. Rows that sum to zero are left as zeros.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        None. The matrix is printed and drawn on the current pyplot figure.
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)

    if normalize:
        # Row-normalise. A class with no true samples has a zero row sum;
        # dividing only where the sum is non-zero keeps that row at 0 instead
        # of producing NaN from 0/0.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for ratios, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the number stays
    # readable against the colour of the cell.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Confusion matrix of the random-forest predictions on the held-out rows,
# drawn with the plotting helper defined in the previous cell.
cnf_matrix = confusion_matrix(y_test,y_pred)
plot_confusion_matrix(cnf_matrix,classes=[0,1])

In [None]:
# Convert all four splits to plain NumPy arrays before they are passed to the
# Keras model.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

## Deep neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
# Small fully connected binary classifier: four ReLU hidden layers with one
# dropout layer, and a single sigmoid output unit.
# The input width is taken from the training matrix instead of being
# hard-coded to 29, so the network keeps matching the data if the feature set
# changes.
n_features = X_train.shape[1]
model = Sequential([
    Dense(units=16, input_dim=n_features, activation='relu'),
    Dense(units=24, activation='relu'),
    Dropout(0.5),
    Dense(20, activation='relu'),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

## Training

In [None]:
# Configure the network for binary classification (Adam optimiser,
# cross-entropy loss, accuracy metric), then train for 5 epochs with
# mini-batches of 15 rows.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X_train, y_train, epochs=5, batch_size=15)

In [None]:
# Evaluate the trained network on the held-out rows.
# Presumably returns [loss, accuracy] given the metrics passed to compile().
score = model.evaluate(X_test, y_test)

In [None]:
# Show the evaluation result.
print(score)

In [None]:
import matplotlib.pyplot as plt
import itertools

from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix

# Re-definition of the plotting helper that already appears earlier in this
# notebook; it is kept here so this cell can be run on its own.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: Square confusion matrix (array-like); rows are indexed as the
            true label and columns as the predicted label.
        classes: Tick labels, one per class, in matrix order.
        normalize: When True, each row is divided by its sum before printing
            and plotting. Rows that sum to zero are left as zeros.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        None. The matrix is printed and drawn on the current pyplot figure.
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)

    if normalize:
        # Row-normalise. A class with no true samples has a zero row sum;
        # dividing only where the sum is non-zero keeps that row at 0 instead
        # of producing NaN from 0/0.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for ratios, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the number stays
    # readable against the colour of the cell.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Network outputs for the held-out rows (sigmoid values, rounded to labels in
# the following cells); the true labels are wrapped in a DataFrame.
y_pred = model.predict(X_test)
y_test = pd.DataFrame(y_test)

In [None]:
# Round the sigmoid outputs to 0/1 and compare them with the true labels.
cnf_matrix = confusion_matrix(y_test, y_pred.round())

In [None]:
# Show the raw confusion-matrix counts.
print(cnf_matrix)

In [None]:
# Draw the network's confusion matrix as a heat map.
plot_confusion_matrix(cnf_matrix, classes=[0,1])

In [None]:
from sklearn.metrics import f1_score

# F1 score of the network on the held-out rows, using the sigmoid outputs
# rounded to 0/1 as the predicted labels.
predicted_labels = y_pred.round()
f1_test = f1_score(y_test, predicted_labels)
print('The f1 score for the testing data:', f1_test)

## Undersampling

In [None]:
# Index labels of every fraudulent transaction, and how many there are.
fraud_indices = data.index[data.Class == 1].to_numpy()
number_records_fraud = fraud_indices.size
print(number_records_fraud)

In [None]:
# Draw, without replacement, as many regular transactions as there are frauds.
# np.random.choice already returns an ndarray, so no extra conversion is
# needed.
normal_indices = data.index[data.Class == 0]
random_normal_indices = np.random.choice(
    normal_indices, size=number_records_fraud, replace=False
)
print(len(random_normal_indices))

In [None]:
# Fraud labels first, followed by the sampled regular labels.
under_sample_indices = np.concatenate((fraud_indices, random_normal_indices))
print(under_sample_indices.size)

In [None]:
# under_sample_indices holds index *labels* taken from data.index, so the
# rows are selected with label-based .loc. Positional .iloc only returned the
# same rows while the frame still had its default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

# Features are all columns except the label; the target stays a
# single-column DataFrame.
X_undersample = under_sample_data.drop(columns='Class')
y_undersample = under_sample_data[['Class']]

# 70/30 split of the balanced sample.
X_train, X_test, y_train, y_test = train_test_split(X_undersample,y_undersample, test_size=0.3)

In [None]:
# Convert the undersampled splits to plain NumPy arrays for the Keras model.
X_train, X_test, y_train, y_test = map(
    np.array, (X_train, X_test, y_train, y_test)
)

In [None]:
# Print the layer-by-layer summary of the network again.
model.summary()

In [None]:
# Start from a freshly initialised copy of the architecture. compile() does
# not reset weights, so fitting `model` as-is would continue training the
# network that was already fitted on the full dataset -- and that earlier
# training set overlaps with the rows of this undersampled test split, which
# would inflate the scores below.
model = keras.models.clone_model(model)
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5)

In [None]:
# Confusion matrix of the network on the undersampled test split.
y_pred = model.predict(X_test)
y_expected = pd.DataFrame(y_test)
rounded_predictions = y_pred.round()  # sigmoid output -> 0/1 label
cnf_matrix = confusion_matrix(y_expected, rounded_predictions)
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()

## SMOTE - oversampling

In [None]:
%%bash
pip install -U imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

# Split FIRST, then oversample only the training part. Running SMOTE on the
# whole dataset before splitting puts synthetic points interpolated from
# test rows into the training set (data leakage) and evaluates on synthetic
# rows instead of real transactions.
labels = y.values.ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, stratify=labels
)

# X_resample / y_resample now hold the balanced *training* data.
X_resample, y_resample = oversample.fit_resample(X_train, y_train)
y_resample = pd.DataFrame(y_resample)
X_resample = pd.DataFrame(X_resample)
###########################################################################################
# Plain NumPy arrays for Keras; labels are kept as a single column, the same
# shape the earlier cells pass to the model.
X_train = np.array(X_resample)
X_test = np.array(X_test)
y_train = np.array(y_resample)
y_test = np.array(y_test).reshape(-1, 1)

In [None]:
# Start from a freshly initialised copy of the architecture. compile() does
# not reset weights, so fitting `model` as-is would continue from the weights
# learned in the earlier experiments, whose training rows overlap with this
# test split.
model = keras.models.clone_model(model)
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=15,epochs=5)

In [None]:
# Confusion matrix of the network trained on the SMOTE-resampled data.
y_pred = model.predict(X_test)
y_expected = pd.DataFrame(y_test)
rounded_predictions = y_pred.round()  # sigmoid output -> 0/1 label
cnf_matrix = confusion_matrix(y_expected, rounded_predictions)
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()