In [1]:
import pandas as pd
import numpy as np


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [3]:
# Load the transactions dataset; the CSV is looked up relative to the
# notebook's current working directory.
DATA_PATH = 'creditcard.csv'
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'creditcard.csv'

In [None]:
# Print the column dtypes / non-null counts first, then leave `df.head()` as
# the cell's final expression. Jupyter only renders the value of the last
# expression in a cell, so a bare `df.head()` on an earlier line is discarded.
df.info()
df.head()

In [None]:
# Overall distribution of the target: number of rows per value of `Class`.
df['Class'].value_counts()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Share of each class, expressed as a fraction of all rows.
n_rows = len(df)
ratio_ones = df.Class.mean()
ratio_zeros = (n_rows - df.Class.sum()) / n_rows

# One bar per class, in the same order as the annotations below.
for bar_name, bar_height, legend_name in (
    ('Regular transactions', ratio_zeros, 'Not Fraud'),
    ('Fraud transactions', ratio_ones, 'Fraud'),
):
    plt.bar(bar_name, height=bar_height, label=legend_name)

plt.xlabel("Target")
plt.ylabel('Percentage of total')

# Annotate each bar with its share formatted as a percentage.
plt.text(-0.10, 0.5, f"{ratio_zeros * 100:.3f} %", fontsize=12)
plt.text(0.95, 0.1, f"{ratio_ones * 100:.3f} %", fontsize=12)

plt.legend()
plt.tight_layout()

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns).
df.shape

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Fractions of fraudulent and regular transactions (the mean of `Class` is
# used as the fraud share, and its complement as the non-fraud share).
fraud_share = df.Class.mean()
ratios = [fraud_share, 1 - fraud_share]
labels = ['Fraud', 'Not Fraud']
colors = ['red', 'blue']

# Draw the pie chart with the percentage printed on each wedge.
plt.pie(
    ratios,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
)
# Equal axis scaling so the pie is drawn as a circle rather than an ellipse.
plt.axis('equal')
plt.title('Transaction Type Distribution')
plt.show()

In [None]:
# Separate the target column from the predictors.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise correlations between columns of `features`.

    Parameters
    ----------
    labels : sequence
        Tick labels used on both axes; they must line up with the columns of
        ``features.corr()``.
    features : object exposing ``.corr()``
        Data whose correlation matrix is plotted (the notebook passes the
        full DataFrame).
    rotation : float
        Rotation, in degrees, applied to the x-axis tick labels.

    Notes
    -----
    Correlation coefficients lie in [-1, 1]. The colour scale therefore spans
    that whole range and is centred on 0 with a diverging colormap; a lower
    bound of 0 would render every negative correlation identically to
    "no correlation".
    """
    corr = features.corr()
    # NOTE: this changes seaborn's global font scale for later plots as well.
    sns.set(font_scale=1.2)
    g = sns.heatmap(
        corr,
        xticklabels=labels,
        yticklabels=labels,
        vmin=-1,
        vmax=1,
        center=0,
        cmap="RdBu_r")
    g.set_xticklabels(labels, rotation=rotation)
    g.set_title("Variables Similarity")


# Create the (square) figure that the heatmap is drawn into, then plot.
plt.figure(figsize=(10, 10))
plot_similarity(df.columns, df, 90)

In [None]:
# Histogram of the `Time` column using 200 bins.
df['Time'].hist(bins=200)

In [None]:
# Flag transactions whose amount is strictly above the threshold
# (1 = above, 0 = at or below) and count how many fall on each side.
transaction_amount_threshold = 1000
amount_flag = df['Amount'].gt(transaction_amount_threshold).astype(int)
amount_flag.value_counts()

In [None]:
from sklearn.metrics import confusion_matrix

# Build the confusion matrix of the amount flag against the fraud label:
# rows correspond to `amount_flag`, columns to `Class`.
cm = confusion_matrix(y_true=amount_flag, y_pred=df['Class'])

In [None]:
# Rows of `cm` follow `amount_flag`, which is 1 only when Amount is strictly
# greater than the threshold, so row 0 covers "Amount <= threshold". The tick
# labels state that explicitly (a plain "<" would exclude amounts equal to the
# threshold from both rows).
amount_labels = ['<= ' + str(transaction_amount_threshold),
                 '> ' + str(transaction_amount_threshold)]
sns.heatmap(cm, annot=True, fmt='g',
            yticklabels=amount_labels,
            xticklabels=['Not Fraud', 'Fraud'])
# Assigning the returned Text objects keeps them out of the cell output.
yl = plt.ylabel("Amount")
xl = plt.xlabel("Class")
# Explicit y-limits covering both rows of the 2x2 matrix.
# NOTE(review): presumably a workaround for matplotlib versions that cropped
# the first/last heatmap row - confirm whether it is still needed.
plt.ylim([0,2])
plt.tight_layout()

## Cross-Validation with K-Fold and Hyperparameter Tuning


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing.
# - `stratify=y` keeps the fraud / non-fraud proportion the same in both
#   splits; with a heavily imbalanced target an unstratified split can leave
#   the test set with a noticeably different fraud rate.
# - `random_state` fixes the shuffle so that Restart & Run All reproduces the
#   same split (and therefore the same downstream metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)



In [None]:
# Number of training rows per class label.
y_train.value_counts()

In [None]:
from collections import Counter
# Same class tally as a Counter mapping label -> number of occurrences.
Counter(y_train)

## Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Randomly duplicate minority-class rows of the training split until the
# minority/majority ratio reaches 0.75. Only the training data is resampled;
# the test split is left untouched. A fixed `random_state` makes the
# resampled set identical on every run.
rs = RandomOverSampler(sampling_strategy=0.75, random_state=42)
X_train_ns, y_train_ns = rs.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the oversampled training data. The forest draws
# bootstrap samples and feature subsets at random, so `random_state` is fixed
# to make the fitted model (and the metrics reported later) reproducible.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_ns, y_train_ns)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sınıf dağılımlarını hesapla
total_samples = len(y_train_ns)
fraud_samples = np.sum(y_train_ns == 1)
non_fraud_samples = np.sum(y_train_ns == 0)

# Yüzde olarak ifade et
fraud_percentage = (fraud_samples / total_samples) * 100
non_fraud_percentage = (non_fraud_samples / total_samples) * 100

# Çubuk grafik oluştur
plt.figure(figsize=(8, 6))
plt.bar(['Fraud', 'Non-Fraud'], [fraud_percentage, non_fraud_percentage], color=['red', 'blue'])
plt.ylabel('Percentage of Samples')
plt.title('Class Distribution in Training Data')
plt.show()

In [None]:
# Predict on the held-out split and print, in order, the confusion matrix,
# the accuracy and the per-class classification report.
y_pred = classifier.predict(X_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt

def _plot_class_scatter(features, target, title):
    """Scatter the first two columns of `features`, one colour per class in `target`."""
    plt.figure(figsize=(10, 6))
    # Class 0 uses the default colour, class 1 is drawn in red.
    for class_value, extra_kwargs in ((0, {}), (1, {'c': 'r'})):
        subset = features[target == class_value]
        plt.scatter(subset.iloc[:, 0], subset.iloc[:, 1],
                    label="Class #{}".format(class_value),
                    alpha=0.5, linewidth=0.15, **extra_kwargs)
    plt.title(title)
    plt.legend()
    plt.show()


# Original dataset
_plot_class_scatter(X_train, y_train, 'Original dataset')

# Apply SMOTETomek
# NOTE: this rebinds `X_train_ns` / `y_train_ns`, replacing the
# RandomOverSampler output assigned earlier in the notebook; cells run after
# this one see the SMOTETomek-resampled data.
# `random_state` is fixed so the synthetic samples are reproducible.
from imblearn.combine import SMOTETomek
sm = SMOTETomek(sampling_strategy=1, random_state=42)
X_train_ns, y_train_ns = sm.fit_resample(X_train, y_train)

# Resampled dataset
_plot_class_scatter(X_train_ns, y_train_ns, 'Resampled dataset')