<a href="https://colab.research.google.com/github/yehiayaser/Anomly-Detection/blob/main/Anomly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
credit_data = pd.read_csv('/content/creditcard.csv', nrows=50000) # https://www.kaggle.com/mlg-ulb/creditcardfraud
standardized_data_without_class = StandardScaler().fit_transform(credit_data.loc[:,credit_data.columns!='Class'])
data_50k_new = standardized_data_without_class[0:50000]
data_50k_df = pd.DataFrame(data=data_50k_new)
# Separate features and target variable
X = credit_data.drop(columns=['Class'])
y = credit_data['Class']

In [3]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13949,24754,1.252924,-0.182189,-0.802716,-0.210981,1.916713,3.643624,-0.778711,0.818295,1.706962,...,-0.001336,-0.497088,-1.211285,0.043809,0.964159,0.442030,0.261483,-0.051402,0.005112,23.74
13950,24756,-0.346979,-2.103284,-0.685061,1.961605,-0.401125,0.473632,1.133816,-0.256528,0.893409,...,1.437054,0.359662,-0.316275,-0.864259,-0.279881,0.491802,-0.353996,-0.149931,0.129795,794.20
13951,24759,-6.053652,-5.988723,0.810413,-0.011811,1.308135,-0.590803,-0.725838,-0.234840,1.624646,...,-2.982379,-0.771970,1.474668,3.176363,-0.302410,0.052529,-0.373871,-0.700463,2.508443,60.00
13952,24759,1.169121,-1.284945,0.032717,-0.681670,0.660598,4.412578,-1.913115,1.076592,1.501230,...,-0.265041,-0.557596,-0.882435,-0.041523,0.975445,0.297229,0.550515,0.015029,0.032067,90.00


In [4]:
y

Unnamed: 0,Class
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
13949,0.0
13950,0.0
13951,0.0
13952,0.0


In [6]:
# Drop rows with NaN values
data_50k_df_cleaned = data_50k_df.dropna()

# Train OneClassSVM with RBF kernel
clf_svm_rbf = OneClassSVM(kernel="rbf", degree=3, gamma=0.1, nu=0.01)
y_predict_rbf = clf_svm_rbf.fit_predict(data_50k_df_cleaned)

# Print the number of outliers detected by RBF kernel
print(f"Number of outliers detected (RBF kernel): {np.sum(y_predict_rbf == -1)}")

Number of outliers detected (RBF kernel): 717


In [7]:
# Train OneClassSVM with Sigmoid kernel
clf_svm_sigmoid = OneClassSVM(kernel="sigmoid", degree=3, gamma=0.1, nu=0.01)
y_predict_sigmoid = clf_svm_sigmoid.fit_predict(data_50k_df_cleaned)

# Print the number of outliers detected by Sigmoid kernel
print(f"Number of outliers detected (Sigmoid kernel): {np.sum(y_predict_sigmoid == -1)}")

Number of outliers detected (Sigmoid kernel): 141


In [10]:
from sklearn.metrics import classification_report

# Align the true labels with the cleaned data
y_cleaned = y.loc[data_50k_df_cleaned.index]

# Map true labels to match predicted labels (0 for normal, -1 for outlier)
y_cleaned_mapped = y_cleaned.apply(lambda x: -1 if x == 1 else 0)

# Map predicted labels to match true labels (1 to 0 for normal)
y_predict_rbf_mapped = np.where(y_predict_rbf == 1, 0, y_predict_rbf)
y_predict_sigmoid_mapped = np.where(y_predict_sigmoid == 1, 0, y_predict_sigmoid)


# Generate classification report for RBF kernel
print("Classification Report (RBF kernel):")
print(classification_report(y_cleaned_mapped, y_predict_rbf_mapped, target_names=['Normal', 'Outlier'], zero_division=0))

# Generate classification report for Sigmoid kernel
print("\nClassification Report (Sigmoid kernel):")
print(classification_report(y_cleaned_mapped, y_predict_sigmoid_mapped, target_names=['Normal', 'Outlier'], zero_division=0))

Classification Report (RBF kernel):
              precision    recall  f1-score   support

      Normal       0.02      0.27      0.04        56
     Outlier       1.00      0.95      0.97     13897

    accuracy                           0.95     13953
   macro avg       0.51      0.61      0.51     13953
weighted avg       0.99      0.95      0.97     13953


Classification Report (Sigmoid kernel):
              precision    recall  f1-score   support

      Normal       0.03      0.07      0.04        56
     Outlier       1.00      0.99      0.99     13897

    accuracy                           0.99     13953
   macro avg       0.51      0.53      0.52     13953
weighted avg       0.99      0.99      0.99     13953



In [12]:
# Train OneClassSVM with Linear kernel
clf_svm_linear = OneClassSVM(kernel="linear", gamma="auto", nu=0.01)
y_predict_linear = clf_svm_linear.fit_predict(data_50k_df_cleaned)

# Print the number of outliers detected by Linear kernel
print(f"Number of outliers detected (Linear kernel): {np.sum(y_predict_linear == -1)}")

# Train OneClassSVM with Poly kernel
clf_svm_poly = OneClassSVM(kernel="poly", degree=3, gamma="auto", nu=00.01)
y_predict_poly = clf_svm_poly.fit_predict(data_50k_df_cleaned)

# Print the number of outliers detected by Poly kernel
print(f"Number of outliers detected (Poly kernel): {np.sum(y_predict_poly == -1)}")

# Map predicted labels to match true labels (1 to 0 for normal)
y_predict_linear_mapped = np.where(y_predict_linear == 1, 0, y_predict_linear)
y_predict_poly_mapped = np.where(y_predict_poly == 1, 0, y_predict_poly)

# Generate classification report for Linear kernel
print("\nClassification Report (Linear kernel):")
print(classification_report(y_cleaned_mapped, y_predict_linear_mapped, target_names=['Normal', 'Outlier'], zero_division=0))

# Generate classification report for Poly kernel
print("\nClassification Report (Poly kernel):")
print(classification_report(y_cleaned_mapped, y_predict_poly_mapped, target_names=['Normal', 'Outlier'], zero_division=0))

Number of outliers detected (Linear kernel): 10130
Number of outliers detected (Poly kernel): 475

Classification Report (Linear kernel):
              precision    recall  f1-score   support

      Normal       0.00      0.11      0.00        56
     Outlier       0.99      0.27      0.43     13897

    accuracy                           0.27     13953
   macro avg       0.49      0.19      0.21     13953
weighted avg       0.98      0.27      0.42     13953


Classification Report (Poly kernel):
              precision    recall  f1-score   support

      Normal       0.00      0.02      0.00        56
     Outlier       1.00      0.97      0.98     13897

    accuracy                           0.96     13953
   macro avg       0.50      0.49      0.49     13953
weighted avg       0.99      0.96      0.98     13953



In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle potential NaN values in the target variable for the training set
# We will drop rows where y_train is NaN
nan_mask_train = y_train.isna()
X_train_cleaned = X_train[~nan_mask_train]
y_train_cleaned = y_train[~nan_mask_train]

# Handle potential NaN values in the target variable for the testing set
# We will drop rows where y_test is NaN
nan_mask_test = y_test.isna()
X_test_cleaned = X_test[~nan_mask_test]
y_test_cleaned = y_test[~nan_mask_test]


# Train a Support Vector Machine (SVM) classifier
# Using 'linear' kernel for simplicity, can be changed to 'rbf', etc.
# Due to the imbalanced nature of the dataset, we might need to adjust class_weight
svm_classifier = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_classifier.fit(X_train_cleaned, y_train_cleaned)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test_cleaned)

# Generate classification report
print("Classification Report (SVM Classifier):")
print(classification_report(y_test_cleaned, y_pred, zero_division=0))

Classification Report (SVM Classifier):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4165
         1.0       0.47      0.76      0.58        21

    accuracy                           0.99      4186
   macro avg       0.73      0.88      0.79      4186
weighted avg       1.00      0.99      1.00      4186

