In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset CSV into a DataFrame and take a first look at it.
file_path = r"C:\Users\evilk\Downloads\HTC Intern\Dataset\creditcard.csv"
df = pd.read_csv(file_path)

print("Shape of the dataset:", df.shape)
print(df.head())
# Per-column count of missing values.
print(df.isnull().sum())

# Standardise 'Amount' and then 'Time' in place; each column gets its own
# freshly fitted StandardScaler, which expects a 2-D (n_samples, 1) input.
for column_name in ('Amount', 'Time'):
    column_2d = df[column_name].values.reshape(-1, 1)
    df[column_name] = StandardScaler().fit_transform(column_2d)
print(df.head())

# Unsupervised outlier detection: run three detectors over the feature
# columns and report how many rows each one flags. scikit-learn outlier
# detectors label inliers as 1 and outliers as -1.
#
# The 'Class' target is dropped first so the detectors cannot use the label
# itself as an input feature (fitting on the full frame leaks the target).
outlier_features = df.drop(columns='Class')

outlier_detectors = [
    (
        "Isolation Forest",
        IsolationForest(
            n_estimators=100,
            max_samples='auto',
            contamination=0.01,
            max_features=1.0,
            random_state=42,
        ),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(n_neighbors=20, contamination=0.01),
    ),
    (
        "One-class SVM",
        OneClassSVM(kernel='rbf', gamma=0.001, nu=0.01),
    ),
]

for detector_name, clf in outlier_detectors:
    # fit_predict fits on the data and labels the same rows; it is the only
    # prediction entry point LocalOutlierFactor offers in its default
    # (novelty=False) mode, and the other two detectors provide it as well.
    y_pred = clf.fit_predict(outlier_features)
    # Count directly on the 1-D prediction vector instead of masking the
    # DataFrame with a reshaped (n, 1) boolean array.
    n_outliers = int((y_pred == -1).sum())
    print(f"Number of outliers with {detector_name}:", n_outliers)

# Separate the predictor columns from the 'Class' target.
X = df.drop('Class', axis=1)
y = df['Class']

# Hold out 30% of the rows for evaluation. The split is stratified on the
# target so both partitions keep the same class proportions.
# NOTE(review): this matters most if 'Class' is heavily imbalanced, which is
# presumed for this dataset - confirm with y.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Candidate models, in the same order as their hyper-parameter grids below.
classifiers = [
    # The default 'lbfgs' solver rejects penalty='l1', which would make half
    # of the logistic-regression grid fail to fit; 'liblinear' supports both
    # the 'l1' and 'l2' penalties searched here.
    LogisticRegression(solver='liblinear'),
    # Fixed seeds keep the tree-based results reproducible, matching the
    # random_state=42 used elsewhere in this script.
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
]

# One grid per classifier; param_grids[i] belongs to classifiers[i].
lr_params = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7]}
rf_params = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7]}
param_grids = [lr_params, dt_params, rf_params]

# Tune each classifier with 5-fold cross-validated grid search on the training
# split, then report its performance on the held-out test split.
for classifier, param_grid in zip(classifiers, param_grids):
    # Select hyper-parameters by F1 on the positive class rather than the
    # default accuracy, which is dominated by the majority class.
    # NOTE(review): assumes 'Class' is binary with a rare positive label -
    # confirm against the data.
    clf = GridSearchCV(classifier, param_grid, cv=5, scoring='f1')
    clf.fit(X_train, y_train)
    print(type(classifier).__name__)
    print(clf.best_params_)

    # GridSearchCV refits the best configuration on the whole training split
    # and predicts with that refitted estimator.
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    if isinstance(classifier, RandomForestClassifier):
        # Keep the tuned forest available under its own name.
        rf_model = clf.best_estimator_

        # y_pred above already comes from best_estimator_, so the headline
        # metrics are computed from it without predicting a second time.
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f"Accuracy: {acc}")
        print(f"Precision: {prec}")
        print(f"Recall: {rec}")
        print(f"F1 Score: {f1}")


Shape of the dataset: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010  