# Importing Libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Exploratory Data Analysis


In [None]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = "/content/drive/MyDrive/Colab Notebooks/CryptocoinsHeistData.csv"
df = pd.read_csv(file_path)

In [None]:
df.shape

(2916697, 10)

In [None]:
df.head(6)

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky
5,112AmFATxzhuSpvtz1hfpa3Zrw3BG276pc,2016,96,144,0.084614,2821,0,1,50000000.0,princetonLocky


In [None]:
df.describe()

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income
count,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0
mean,2014.475,181.4572,45.00859,0.5455192,721.6446,238.5067,2.206516,4464889000.0
std,2.257398,104.0118,58.98236,3.674255,1689.676,966.3217,17.91877,162686000000.0
min,2011.0,1.0,0.0,3.606469e-94,1.0,0.0,1.0,30000000.0
25%,2013.0,92.0,2.0,0.02148438,1.0,0.0,1.0,74285590.0
50%,2014.0,181.0,8.0,0.25,1.0,0.0,2.0,199998500.0
75%,2016.0,271.0,108.0,0.8819482,56.0,0.0,2.0,994000000.0
max,2018.0,365.0,144.0,1943.749,14497.0,14496.0,12920.0,49964400000000.0


In [None]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
white,2875284
paduaCryptoWall,12390
montrealCryptoLocker,9315
princetonCerber,9223
princetonLocky,6625
montrealCryptXXX,2419
montrealNoobCrypt,483
montrealDMALockerv3,354
montrealDMALocker,251
montrealSamSam,62


So only a small fraction of transactions are fraudulent. Our anomaly detection can work well.


In [None]:
categorical_cols = df.select_dtypes(include="object").columns

In [None]:
numerical_cols = df.select_dtypes(include="number").columns

In [None]:
df.isnull().sum()

Unnamed: 0,0
address,0
year,0
day,0
length,0
weight,0
count,0
looped,0
neighbors,0
income,0
label,0


Woah! **No null values** :)


In [None]:
df["address"].nunique()

2631095

In [None]:
old_df = df.copy()

# Feature Engineering


In [None]:
df.drop(columns=["address", "year", "day"], axis=1, inplace=True)

In [None]:
for col in df.columns[:-1]:
    df[col] = df[col]

In [None]:
X = df.drop(columns=["label"])
y = df["label"]

In [None]:
X.columns

Index(['length', 'weight', 'count', 'looped', 'neighbors', 'income'], dtype='object')

In [None]:
df.head()

Unnamed: 0,length,weight,count,looped,neighbors,income,label
0,18,0.008333,1,0,2,100050000.0,princetonCerber
1,44,0.000244,1,0,1,100000000.0,princetonLocky
2,0,1.0,1,0,2,200000000.0,princetonCerber
3,72,0.003906,1,0,2,71200000.0,princetonCerber
4,144,0.072848,456,0,1,200000000.0,princetonLocky


## Number of Instances


In [None]:
new_df = pd.DataFrame()
grouped = df.groupby("label")

In [None]:
new_df["num_of_instances"] = grouped.size()

In [None]:
new_df

Unnamed: 0_level_0,num_of_instances
label,Unnamed: 1_level_1
montrealAPT,11
montrealComradeCircle,1
montrealCryptConsole,7
montrealCryptXXX,2419
montrealCryptoLocker,9315
montrealCryptoTorLocker2015,55
montrealDMALocker,251
montrealDMALockerv3,354
montrealEDA2,6
montrealFlyper,9


## Average


In [None]:
df.columns

Index(['length', 'weight', 'count', 'looped', 'neighbors', 'income', 'label'], dtype='object')

In [None]:
for col in X.columns:
    new_df[f"{col}_avg"] = grouped[col].mean()

In [None]:
new_df.shape

(29, 7)

In [None]:
new_df.head()

Unnamed: 0_level_0,num_of_instances,length_avg,weight_avg,count_avg,looped_avg,neighbors_avg,income_avg
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
montrealAPT,11,67.636364,0.707728,2047.0,734.090909,2.545455,371987300.0
montrealComradeCircle,1,144.0,0.051214,1241.0,0.0,2.0,203320000.0
montrealCryptConsole,7,43.428571,0.593306,831.714286,0.0,2.0,45463340.0
montrealCryptXXX,2419,47.447706,0.367505,791.848284,61.022323,2.011988,135534300.0
montrealCryptoLocker,9315,30.674396,0.888878,308.328824,100.981535,2.885346,1840825000.0


## Standard Deviation


In [None]:
for col in X.columns:
    new_df[f"{col}_std"] = grouped[col].agg(np.std).fillna(0)

  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)


## Minimum


In [None]:
for col in X.columns:
    new_df[f"{col}_min"] = grouped[col].min()

## Maximum


In [None]:
for col in X.columns:
    new_df[f"{col}_max"] = grouped[col].max()

In [None]:
new_df = new_df.reset_index()
new_df.head()

Unnamed: 0,label,num_of_instances,length_avg,weight_avg,count_avg,looped_avg,neighbors_avg,income_avg,length_std,weight_std,...,count_min,looped_min,neighbors_min,income_min,length_max,weight_max,count_max,looped_max,neighbors_max,income_max
0,montrealAPT,11,67.636364,0.707728,2047.0,734.090909,2.545455,371987300.0,73.145434,0.600596,...,1,0,1,57142857.0,144,1.666667,8076,8073,6,1088599000.0
1,montrealComradeCircle,1,144.0,0.051214,1241.0,0.0,2.0,203320000.0,0.0,0.0,...,1241,0,2,203320001.0,144,0.051214,1241,0,2,203320000.0
2,montrealCryptConsole,7,43.428571,0.593306,831.714286,0.0,2.0,45463340.0,68.747987,0.410909,...,1,0,2,30000000.0,144,1.0,3191,0,2,50300000.0
3,montrealCryptXXX,2419,47.447706,0.367505,791.848284,61.022323,2.011988,135534300.0,58.187904,0.434143,...,1,0,1,30000000.0,144,3.458951,9262,8489,15,1080000000.0
4,montrealCryptoLocker,9315,30.674396,0.888878,308.328824,100.981535,2.885346,1840825000.0,50.731602,1.555608,...,1,0,1,30000000.0,144,31.108593,6423,6418,79,445000000000.0


# Data Preprocessing


## Undersampling


We undersample the data to **200000 samples** by keeping the minority samples (ransomware) and undersampling the majority (white transactions).


In [None]:
black_rows = df[df["label"] != "white"]

In [None]:
white_sample = df[df["label"] == "white"].sample(n=158587, random_state=42)

In [None]:
sampled_df = pd.concat([black_rows, white_sample]).sample(frac=1, random_state=50)
sampled_df.head()

Unnamed: 0,length,weight,count,looped,neighbors,income,label
526724,14,0.003906,1,0,2,3216000000.0,white
2825235,144,0.19697,1805,0,2,55783520.0,white
7558,14,0.015625,1,0,2,635531100.0,montrealCryptoLocker
1141393,6,1.0,1,0,2,292510000.0,white
1484054,142,0.065711,2568,0,2,136000000.0,white


## Label Encoding


We do label encoding of:

> - White label: 0
> - Ransomware: 1


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

sampled_df["label"] = label_encoder.fit_transform(sampled_df["label"])

sampled_df["label"] = sampled_df["label"].apply(
    lambda x: 0 if x == label_encoder.transform(["white"])[0] else 1
)

sampled_df["label"]

Unnamed: 0,label
526724,0
2825235,0
7558,1
1141393,0
1484054,0
...,...
1363905,0
1934713,0
2481475,0
2039010,0


In [None]:
sampled_df

Unnamed: 0,length,weight,count,looped,neighbors,income,label
526724,14,0.003906,1,0,2,3.216000e+09,0
2825235,144,0.196970,1805,0,2,5.578352e+07,0
7558,14,0.015625,1,0,2,6.355311e+08,1
1141393,6,1.000000,1,0,2,2.925100e+08,0
1484054,142,0.065711,2568,0,2,1.360000e+08,0
...,...,...,...,...,...,...,...
1363905,144,0.019912,1974,1974,2,1.368772e+09,0
1934713,6,0.062500,1,0,2,1.000000e+08,0
2481475,64,0.000008,1,0,2,1.415672e+10,0
2039010,14,0.195312,13,0,1,1.000000e+08,0


## Z-score Anomaly Filteration


We calculate the **Z-score** and filter out the **anomalies** as per a threshold.


In [None]:
z_score_df = pd.DataFrame()
for col in sampled_df.columns:
    z_score_df[f"{col}_z_score"] = (
        sampled_df[col] - sampled_df[col].mean()
    ) / sampled_df[col].std()
z_score_df.head()

Unnamed: 0,length_z_score,weight_z_score,count_z_score,looped_z_score,neighbors_z_score,income_z_score,label_z_score
526724,-0.513441,-0.114174,-0.424426,-0.234473,-0.009561,-0.00492,-0.511015
2825235,1.695481,-0.075626,0.677535,-0.234473,-0.009561,-0.028981,-0.511015
7558,-0.513441,-0.111834,-0.424426,-0.234473,-0.009561,-0.024567,1.956881
1141393,-0.649374,0.084711,-0.424426,-0.234473,-0.009561,-0.027178,-0.511015
1484054,1.661498,-0.101834,1.143608,-0.234473,-0.009561,-0.02837,-0.511015


In [None]:
import pandas as pd

filtered_df = z_score_df[
    z_score_df.apply(lambda row: all(-3 <= val <= 3 for val in row), axis=1)
]

filtered_df.head()

Unnamed: 0,length_z_score,weight_z_score,count_z_score,looped_z_score,neighbors_z_score,income_z_score,label_z_score
526724,-0.513441,-0.114174,-0.424426,-0.234473,-0.009561,-0.00492,-0.511015
2825235,1.695481,-0.075626,0.677535,-0.234473,-0.009561,-0.028981,-0.511015
7558,-0.513441,-0.111834,-0.424426,-0.234473,-0.009561,-0.024567,1.956881
1141393,-0.649374,0.084711,-0.424426,-0.234473,-0.009561,-0.027178,-0.511015
1484054,1.661498,-0.101834,1.143608,-0.234473,-0.009561,-0.02837,-0.511015


## Train Test Split


> Training data: 66%
> Test data: 33%


In [None]:
X = sampled_df.drop("label", axis=1)
y = (sampled_df["label"] > 0).astype("int")

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33, shuffle=True
)

## Standardization


We perform **normalization** on the train data and then scale test data accordingly.


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
import joblib

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

# Training


We use the **Random Forest Classifier** to train on our dataset.
**Random search** is performed to optimize the hyperparameters of the classifier.


**This might take few minutes to run depending upon the Laptop performance.**


In [None]:
import numpy as np
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier()


param_space = {
    "n_estimators": randint(100, 1000),
    "max_depth": randint(1, 20),
    "max_features": uniform(0.1, 0.9),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
    "bootstrap": [True, False],
}


random_search = RandomizedSearchCV(
    model,
    param_distributions=param_space,
    n_iter=10,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)


random_search.fit(X_train_scaled, y_train)


print("Best parameters found: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'colsample_bytree': 0.7456880086910325, 'learning_rate': 0.2223825852053257, 'max_depth': 4, 'n_estimators': 364, 'subsample': 0.6291358772643185}
Best score:  0.8394402985074627


# Evaluation


We use the following metrics for evaluation:

> - accuracy
> - precision
> - recall
> - f1 score


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
joblib.dump(best_model, "model.pkl")

['model.pkl']

# Testing


In [None]:
"""
# White Transaction
raw_data = {
    'length': [2],
    'weight': [0.25],
    'count': [1],
    'looped': [0],
    'neighbors': [2],
    'income': [87411400000]
}

"""

# Fraud Transaction
raw_data = {
    "length": [144],
    "weight": [0.099877597],
    "count": [8120],
    "looped": [0],
    "neighbors": [1],
    "income": [50057953],
}

new_data = pd.DataFrame(raw_data)

print("New Transaction Data:\n", new_data)

New Transaction Data:
    length    weight  count  looped  neighbors    income
0     144  0.099878   8120       0          1  50057953


In [65]:
new_data_scaled = scaler.transform(new_data)

new_data_scaled = pd.DataFrame(new_data_scaled, columns=new_data.columns)

print("Standardized Data:\n", new_data_scaled)

Standardized Data:
      length    weight     count    looped  neighbors    income
0  1.692901 -0.082828  4.535015 -0.235459  -0.037667 -0.041899


In [66]:
prediction = best_model.predict(new_data_scaled)

print("Prediction (0: Non-Fraud, 1: Fraud):", prediction)

Prediction (0: Non-Fraud, 1: Fraud): [1]
