In [88]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

from sklearn.metrics import classification_report, plot_confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from scipy.stats import f, ttest_ind, chi2

In [2]:
# Load the BitcoinHeist table and rename the `count` column to `counted`,
# the name every later cell refers to.
data = pd.read_csv('BitcoinHeistData.csv').rename(columns={'count': 'counted'})

In [3]:
# Binary target: 0.0 for 'white' addresses, 1.0 for every other label.
data['blabel'] = (data['label'] != 'white').astype(float)

In [4]:
# redefine date variable
# Build a calendar date from the integer `year` and `day` columns:
#   * `year` is cast to datetime64[Y] and 1970 is subtracted, which offsets
#     the fact that the integers are counted from the 1970 epoch;
#   * `day` is cast to a day-resolution timedelta and 1 is subtracted, so
#     day 1 maps to 1 January (the recorded tail() output shows
#     year 2018 / day 330 -> 2018-11-26).
data['date'] = (np.asarray(data['year'], dtype='datetime64[Y]')-1970)+(np.asarray(data['day'], dtype='timedelta64[D]')-1)

In [36]:
# Preview the last rows to sanity-check the derived `blabel` and `date` columns.
data.tail()

Unnamed: 0,address,year,day,length,weight,counted,looped,neighbors,income,label,blabel,date
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1255809000.0,white,0.0,2018-11-26
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.0,1,0,1,44096990.0,white,0.0,2018-11-26
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.0,6,6,35,2398267000.0,white,0.0,2018-11-26
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.5,1,0,1,178042700.0,white,0.0,2018-11-26
2916696,3LFFBxp15h9KSFtaw55np8eP5fv6kdK17e,2018,330,144,0.073972,6800,0,2,112350000.0,white,0.0,2018-11-26


In [74]:
class RollingBinaryClassification:
    """Walk-forward, one-vs-rest QDA evaluation over a dated table.

    For every entry of ``families`` plus ``'white'`` a fresh
    ``QuadraticDiscriminantAnalysis`` model is fitted on the rows dated in the
    ``train_period`` days before a cut-off and used to predict the rows dated
    in the following ``test_period`` days. The cut-off then moves forward by
    ``test_period`` days until it reaches the last date in the data.

    Parameters
    ----------
    X : pandas.DataFrame
        Input table. Must provide a ``label`` column, a ``date`` column and
        the six feature columns listed in ``varList``.
    families : sequence of str
        Label values to model; ``'white'`` is appended automatically.
    train_period : int, default 120
        Length of the training window in days.
    test_period : int, default 30
        Length of the test window (and of the rolling step) in days.
    num_sample : int, default 200
        Per-day row budget. A window holding more than
        ``period * num_sample`` rows is undersampled.
    """

    def __init__(self, X, families, train_period = 120, test_period = 30, num_sample = 200):
        self.data = X
        # list(...) lets callers pass any sequence, not only a list.
        self.familyList = list(families) + ['white']
        self.train_period = train_period
        self.test_period = test_period
        self.num_sample = num_sample
        self.varList = ['length', 'weight', 'counted', 'looped', 'neighbors', 'income']
        self.random_seed = 42
        self.processed = None

    def preprocessing(self):
        """Keep only the modelled labels and one-hot encode ``label``.

        Replaces ``self.data`` with the filtered rows and stores the encoded
        frame (columns ``label_<name>``) in ``self.processed``.
        """
        self.data = self.data.loc[self.data['label'].isin(self.familyList)]
        self.processed = pd.get_dummies(self.data, columns=['label'])
        # A family with no rows gets no dummy column; add an all-zero one so
        # later column look-ups do not fail with KeyError.
        for name in self.familyList:
            column = 'label_' + name
            if column not in self.processed.columns:
                self.processed[column] = 0

    def undersampling(self, data, method, label_name):
        """Down-sample the background rows of one window.

        Parameters
        ----------
        data : pandas.DataFrame
            Window to reduce; must contain ``label_name``.
        method : {'train', 'test'}
            Selects the row budget: ``train_period * num_sample`` or
            ``test_period * num_sample``.
        label_name : str
            Indicator column being modelled. For ``'label_white'`` the rows
            equal to 1 are down-sampled and the rows equal to 0 are kept in
            full; for every other column it is the other way round.

        Returns
        -------
        pandas.DataFrame
            Sampled background rows plus all kept rows, shuffled with the
            fixed ``random_seed``.

        Raises
        ------
        ValueError
            If ``method`` is neither ``'train'`` nor ``'test'``.
        """
        if method == 'train':
            budget = self.train_period * self.num_sample
        elif method == 'test':
            budget = self.test_period * self.num_sample
        else:
            raise ValueError("Invalid input of method. Please choose from 'train' or 'test'.")

        sampled_value = 1 if label_name == 'label_white' else 0
        background = data.loc[data[label_name] == sampled_value]
        kept = data.loc[data[label_name] == 1 - sampled_value]

        # The caller only checks the total window size, so the background
        # class alone may be smaller than the budget; cap the sample size
        # instead of letting sample() raise.
        background = background.sample(min(budget, len(background)),
                                       random_state=self.random_seed)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        combined = pd.concat([background, kept])
        return combined.sample(frac=1, random_state=self.random_seed)

    def _select_window(self, start, end, method, label_name):
        """Return ``(features, target)`` for rows with ``start <= date < end``."""
        dates = self.processed['date']
        window = self.processed.loc[(dates >= start) & (dates < end)]

        period = self.train_period if method == 'train' else self.test_period
        if window.shape[0] > period * self.num_sample:
            window = self.undersampling(data=window, method=method, label_name=label_name)

        # Plain selection instead of pop(): nothing is mutated in place.
        return window[self.varList], window[label_name]

    def split_train_test(self, date, label_name):
        """Build the train and test sets around the cut-off ``date``.

        Training rows are dated in ``[date - train_period, date)`` and test
        rows in ``[date, date + test_period)``; either window is undersampled
        when it exceeds its row budget.

        Returns
        -------
        tuple
            ``(x_train, y_train, x_test, y_test)`` where the ``x`` frames hold
            the ``varList`` columns and the ``y`` series hold ``label_name``.
        """
        start_date = date - np.timedelta64(self.train_period, 'D')
        end_date = date + np.timedelta64(self.test_period, 'D')

        x_train, y_train = self._select_window(start_date, date, 'train', label_name)
        x_test, y_test = self._select_window(date, end_date, 'test', label_name)
        return x_train, y_train, x_test, y_test

    def fit(self):
        """Run the rolling evaluation for every family.

        Prints the family name and every 50th cut-off date as progress.

        Returns
        -------
        pandas.DataFrame
            Per-family blocks stacked row-wise. Each row carries values only
            in its own family's ``y_true_<name>`` / ``y_pred_<name>`` pair,
            plus a ``Date`` column looked up from the input data by index.
            Rows that were never scored are dropped.
        """
        self.preprocessing()

        dates = np.unique(self.data.date)
        end_date = dates[-1]
        result_list = []

        for name in self.familyList:
            print(name)

            label = 'label_' + name
            true_name = 'y_true_' + name
            pred_name = 'y_pred_' + name

            comp = pd.DataFrame(columns=[true_name, pred_name], index=self.data.index)
            # First cut-off: the (train_period + 2)-th distinct date.
            current_date = dates[self.train_period + 1]
            i = 0

            while current_date < end_date:
                if i % 50 == 0:
                    print(current_date)
                x_train, y_train, x_test, y_test = self.split_train_test(current_date, label)
                y_train = y_train.astype('int')
                y_test = y_test.astype('int')

                current_date += np.timedelta64(self.test_period, 'D')
                i += 1

                # Skip windows where either class has at most 2 training rows
                # (minlength=2 also covers a missing class and an empty window).
                if np.bincount(y_train, minlength=2).min() <= 2:
                    continue
                # Nothing to score in this window.
                if x_test.empty:
                    continue

                # NOTE(review): the recorded runs emitted numerical warnings
                # from inside the QDA code; a non-zero reg_param may help.
                qda = QuadraticDiscriminantAnalysis()
                qda.fit(x_train, y_train)
                y_pred = qda.predict(x_test)

                comp.loc[y_test.index, true_name] = y_test
                comp.loc[y_test.index, pred_name] = y_pred

            # Drop unscored rows per family so the concat below does not have
            # to stack one full-length, mostly-NaN frame per family.
            result_list.append(comp.dropna(how='all'))

        result = pd.concat(result_list)
        result = result.dropna(how='all')
        result['Date'] = self.data['date']
        return result

In [75]:
# Ransomware families that get their own one-vs-rest model.
common_ransom_list = [
    'princetonCerber',
    'princetonLocky',
    'montrealCryptoLocker',
    'montrealCryptXXX',
    'paduaCryptoWall',
    'montrealNoobCrypt',
    'montrealDMALockerv3',
    'montrealDMALocker',
]

In [76]:
# Rolling QDA run: 120-day training windows, 30-day test windows.
X = data
rolling_lr = RollingBinaryClassification(
    X,
    common_ransom_list,
    train_period=120,
    test_period=30,
    num_sample=200,
)
result1 = rolling_lr.fit()

princetonCerber
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


princetonLocky
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


montrealCryptoLocker
2011-05-02T00:00:00.000000000




2015-06-10T00:00:00.000000000
montrealCryptXXX
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000
paduaCryptoWall
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000
montrealNoobCrypt
2011-05-02T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


2015-06-10T00:00:00.000000000




montrealDMALockerv3
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


montrealDMALocker
2011-05-02T00:00:00.000000000
2015-06-10T00:00:00.000000000




white
2011-05-02T00:00:00.000000000




2015-06-10T00:00:00.000000000




In [77]:
# Show the stacked per-family predictions returned by the 120/30-day run.
result1

Unnamed: 0,y_true_princetonCerber,y_pred_princetonCerber,y_true_princetonLocky,y_pred_princetonLocky,y_true_montrealCryptoLocker,y_pred_montrealCryptoLocker,y_true_montrealCryptXXX,y_pred_montrealCryptXXX,y_true_paduaCryptoWall,y_pred_paduaCryptoWall,y_true_montrealNoobCrypt,y_pred_montrealNoobCrypt,y_true_montrealDMALockerv3,y_pred_montrealDMALockerv3,y_true_montrealDMALocker,y_pred_montrealDMALocker,y_true_white,y_pred_white,Date
0,1,1,,,,,,,,,,,,,,,,,2017-01-11
2,1,1,,,,,,,,,,,,,,,,,2016-09-02
3,1,1,,,,,,,,,,,,,,,,,2016-11-17
6,1,1,,,,,,,,,,,,,,,,,2016-08-12
7,1,1,,,,,,,,,,,,,,,,,2016-11-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610682,,,,,,,,,,,,,,,,,1,1,2018-01-24
2610685,,,,,,,,,,,,,,,,,1,1,2018-01-24
2610686,,,,,,,,,,,,,,,,,1,1,2018-01-24
2610692,,,,,,,,,,,,,,,,,1,1,2018-01-24


In [78]:
# Persist the 120/30-day run's predictions to CSV.
result1.to_csv('multiclass_qda_result.csv')

In [100]:
def _binary_confusion(y_true, y_pred):
    """Return the confusion matrix with rows/columns pinned to labels 0 and 1.

    Pinning ``labels`` keeps the matrix 2x2 even when only one class occurs
    in the inputs; without it the ``[0, 1]``-style indexing below would raise
    IndexError on a 1x1 matrix.
    """
    return confusion_matrix(y_true, y_pred, labels=[0, 1])


def tn(y_true, y_pred):
    """Count of true negatives (true 0, predicted 0)."""
    return _binary_confusion(y_true, y_pred)[0, 0]


def fp(y_true, y_pred):
    """Count of false positives (true 0, predicted 1)."""
    return _binary_confusion(y_true, y_pred)[0, 1]


def fn(y_true, y_pred):
    """Count of false negatives (true 1, predicted 0)."""
    return _binary_confusion(y_true, y_pred)[1, 0]


def tp(y_true, y_pred):
    """Count of true positives (true 1, predicted 1)."""
    return _binary_confusion(y_true, y_pred)[1, 1]

def precision_heist(tp, fp):
    """Precision of the positive class: ``tp / (tp + fp)``.

    Returns 0.0 when nothing was predicted positive, instead of dividing
    by zero.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive if predicted_positive else 0.0


def recall_heist(tp, fn):
    """Recall of the positive class: ``tp / (tp + fn)``.

    Returns 0.0 when there are no actual positives.
    """
    actual_positive = tp + fn
    return tp / actual_positive if actual_positive else 0.0


def f1_heist(p, r):
    """Harmonic mean of precision ``p`` and recall ``r``.

    Returns 0.0 when both are zero.
    """
    total = p + r
    return 2 * p * r / total if total else 0.0


def accur(tp, tn, fp, fn):
    """Accuracy: ``(tp + tn) / (tp + tn + fp + fn)``.

    Returns 0.0 when all four counts are zero.
    """
    total = tp + tn + fp + fn
    return (tp + tn) / total if total else 0.0

In [108]:
# Per-family metrics for the 120/30-day run.
# Printed for each family, in order: accuracy, precision, recall, F1
# (rounded to 2 decimals); for 'white' the false-negative count follows.
for name in common_ransom_list+['white']:
    print(name)
    true_label = 'y_true_'+name
    pred_label = 'y_pred_'+name
    # Keep only the rows that were actually scored for this family.
    df_temp = result1[[true_label, pred_label]].dropna(how='any')

    y_true = df_temp[true_label].astype('int')
    y_pred = df_temp[pred_label].astype('int')

    # One pass over the data instead of four; labels are pinned so the
    # matrix is always 2x2 and unpacks as tn, fp, fn, tp.
    _tn, _fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    acc = accur(_tp, _tn, _fp, _fn)
    prec = precision_heist(_tp,_fp)
    rec = recall_heist(_tp, _fn)
    f1 = f1_heist(prec, rec)
    print(round(acc,2))
    print(round(prec,2))
    print(round(rec,2))
    print(round(f1,2))
    if name == 'white':
        print(_fn)
    

princetonCerber
0.46
0.1
0.96
0.19
princetonLocky
0.32
0.07
0.97
0.13
montrealCryptoLocker
0.52
0.05
0.59
0.09
montrealCryptXXX
0.39
0.04
0.95
0.08
paduaCryptoWall
0.45
0.09
0.79
0.16
montrealNoobCrypt
0.65
0.0
0.69
0.01
montrealDMALockerv3
0.45
0.01
0.82
0.01
montrealDMALocker
0.66
0.0
0.54
0.01
white
0.41
0.95
0.38
0.54
287766


In [103]:
# Rolling QDA run with shorter windows: 60-day training, 15-day test.
X = data
rolling_lr2 = RollingBinaryClassification(
    X,
    common_ransom_list,
    train_period=60,
    test_period=15,
    num_sample=200,
)
result2 = rolling_lr2.fit()

princetonCerber
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000




2017-04-30T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


princetonLocky
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000
2017-04-30T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


montrealCryptoLocker
2011-03-03T00:00:00.000000000




2013-03-22T00:00:00.000000000




2015-04-11T00:00:00.000000000
2017-04-30T00:00:00.000000000
montrealCryptXXX
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000




2017-04-30T00:00:00.000000000
paduaCryptoWall
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000




2015-04-11T00:00:00.000000000




2017-04-30T00:00:00.000000000
montrealNoobCrypt
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.d

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


2015-04-11T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


2017-04-30T00:00:00.000000000
montrealDMALockerv3
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000
2017-04-30T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


montrealDMALocker
2011-03-03T00:00:00.000000000
2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


2017-04-30T00:00:00.000000000
white
2011-03-03T00:00:00.000000000




2013-03-22T00:00:00.000000000
2015-04-11T00:00:00.000000000
2017-04-30T00:00:00.000000000




In [104]:
# Persist the 60/15-day run's predictions to CSV.
result2.to_csv('multiclass_qda_result2.csv')

In [109]:
# Per-family metrics for the 60/15-day run.
# Printed for each family, in order: accuracy, precision, recall, F1
# (rounded to 2 decimals); for 'white' the false-negative count follows.
for name in common_ransom_list+['white']:
    print(name)
    true_label = 'y_true_'+name
    pred_label = 'y_pred_'+name
    # Keep only the rows that were actually scored for this family.
    df_temp = result2[[true_label, pred_label]].dropna(how='any')

    y_true = df_temp[true_label].astype('int')
    y_pred = df_temp[pred_label].astype('int')

    # One pass over the data instead of four; labels are pinned so the
    # matrix is always 2x2 and unpacks as tn, fp, fn, tp.
    _tn, _fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    acc = accur(_tp, _tn, _fp, _fn)
    prec = precision_heist(_tp,_fp)
    rec = recall_heist(_tp, _fn)
    f1 = f1_heist(prec, rec)
    print(round(acc,2))
    print(round(prec,2))
    print(round(rec,2))
    print(round(f1,2))
    if name == 'white':
        print(_fn)
    

princetonCerber
0.47
0.11
0.96
0.21
princetonLocky
0.42
0.1
0.95
0.18
montrealCryptoLocker
0.56
0.03
0.33
0.06
montrealCryptXXX
0.4
0.07
0.93
0.14
paduaCryptoWall
0.5
0.11
0.85
0.2
montrealNoobCrypt
0.61
0.0
0.75
0.01
montrealDMALockerv3
0.54
0.01
0.79
0.01
montrealDMALocker
0.71
0.0
0.45
0.01
white
0.45
0.95
0.43
0.59
255261


In [108]:
# NOTE(review): `comparison_matrix` is not defined in any cell visible in this
# notebook — presumably left over from an earlier or deleted cell. Unless it
# is defined elsewhere, this cell raises NameError on a fresh kernel. TODO confirm.
repo_full = classification_report(y_true = comparison_matrix['y_true'], y_pred = comparison_matrix['y_pred'])

In [109]:
# classification_report returns plain text; print it so the table layout is kept.
print(repo_full)

              precision    recall  f1-score   support

           0       0.99      0.40      0.57   1920000
           1       0.03      0.76      0.05     41404

    accuracy                           0.41   1961404
   macro avg       0.51      0.58      0.31   1961404
weighted avg       0.97      0.41      0.56   1961404



In [50]:
# Grid search over the QDA regularisation strength.
model = QuadraticDiscriminantAnalysis()
params = [
    {'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]},
]

# 5 stratified folds, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Candidates are ranked by F1; n_jobs=-1 is passed through to GridSearchCV.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    n_jobs=-1,
    cv=cv,
    scoring='f1',
)

In [51]:
# Fit the grid search on the full table: six features against the binary
# `blabel` target. Note that `X` is rebound here from the whole frame to the
# feature columns only.
X = data[['length','weight','counted','looped','neighbors','income']]
y = data['blabel']
grid_result = grid.fit(X, y)

In [52]:
# Logistic regression with the positive class (blabel == 1) weighted 200x.
# NOTE(review): the features are passed unscaled, and the recorded data shows
# `income` on the order of 1e8-1e9 next to single-digit columns. StandardScaler
# and make_pipeline are imported but not used here — consider standardising
# before fitting. TODO confirm whether this explains the 0.00 recall reported
# for class 1.
lr = LogisticRegression(class_weight = {0: 1, 1: 200})
lr.fit(X,y)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 200}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# NOTE(review): predictions are made on the same `X` that `lr` was fitted on
# (assuming the cells ran in the order shown), so this is an in-sample report,
# not a held-out estimate.
y_pred = lr.predict(X)
# zero_division=0 reports 0 for undefined precision/recall.
class_repo = classification_report(y_true=y, y_pred=y_pred, zero_division=0)
print(class_repo)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99   2875284
         1.0       0.00      0.00      0.00     41413

    accuracy                           0.99   2916697
   macro avg       0.49      0.50      0.50   2916697
weighted avg       0.97      0.99      0.98   2916697



In [53]:
# report the best configuration
# best_score_ is measured with the scoring chosen when `grid` was built ('f1').
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.030209 using {'reg_param': 0.5}


In [54]:
# Print the mean and standard deviation of the test score for every candidate.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for idx in range(len(params)):
    mean, stdev, param = means[idx], stds[idx], params[idx]
    print("%f (%f) with: %r" % (mean, stdev, param))

0.029975 (0.000117) with: {'reg_param': 0.1}
0.030036 (0.000102) with: {'reg_param': 0.2}
0.030075 (0.000092) with: {'reg_param': 0.3}
0.030110 (0.000112) with: {'reg_param': 0.4}
0.030209 (0.000114) with: {'reg_param': 0.5}
