In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [4]:
# The dataset is expected one directory above the current working directory.
# Path components are passed separately so os.path.join picks the platform's
# separator; a hard-coded '..\\' only resolves on Windows.
CSV_PATH = os.path.join(os.getcwd(), '..', 'final.csv')
ug = pd.read_csv(CSV_PATH)
ug.head()

Unnamed: 0,ID,.text:,.Pav:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,.tls:,...,esi,eax,ebx,ecx,edi,ebp,esp,eip,Class,size
0,00000fa1585e99fcb5e8728b96f173ff61b08fc152e2f5...,55216,0,0,53660,0,0,0,26554,0,...,12360,28050,7434,9660,11738,10124,4018,0,SmokeLoader,4.97094
1,000285eadc4e6a68f32343409159c1b44aed8a0bc1c0cd...,108948,0,0,35594,0,33524,0,17770,0,...,17483,43039,9501,16834,13807,16826,8436,0,Loki,6.326924
2,00037a0cc29f3c99e88aeb57af189e291c6fe38b8b2527...,311819,0,0,0,0,0,0,64656,0,...,30109,84735,16436,15685,44207,11887,20710,0,njrat,13.3333
3,0004033aedd01d2928c1c31abb57633f1c493a213eae73...,800645,0,0,0,0,0,0,19250,0,...,78108,202422,43124,46524,74276,29626,19958,0,Loki,30.803483
4,000410685bd62172ae00cafa761c1420f219d323deade1...,106442,0,0,52749,0,0,0,22424,0,...,10939,37113,6741,17573,11825,24366,8272,0,Loki,6.98138


In [9]:
# (rows, columns) of the loaded frame.
ug.shape

(31324, 50)

In [10]:
# Per-column dtype and non-null count.
ug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31324 entries, 0 to 31323
Data columns (total 50 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       31324 non-null  object 
 1   .text:   31324 non-null  int64  
 2   .Pav:    31324 non-null  int64  
 3   .idata:  31324 non-null  int64  
 4   .data:   31324 non-null  int64  
 5   .bss:    31324 non-null  int64  
 6   .rdata:  31324 non-null  int64  
 7   .edata:  31324 non-null  int64  
 8   .rsrc:   31324 non-null  int64  
 9   .tls:    31324 non-null  int64  
 10  .reloc:  31324 non-null  int64  
 11  .BSS:    31324 non-null  int64  
 12  .CODE    31324 non-null  int64  
 13  jmp      31324 non-null  int64  
 14  mov      31324 non-null  int64  
 15  retf     31324 non-null  int64  
 16  push     31324 non-null  int64  
 17  pop      31324 non-null  int64  
 18  xor      31324 non-null  int64  
 19  retn     31324 non-null  int64  
 20  nop      31324 non-null  int64  
 21  sub      313

Columns whose values are all zero carry no information and will be dropped.

In [11]:
# Names of the columns in which no entry is truthy
# (for the numeric count columns here: every value is zero).
null_op = ug.loc[:, ~ug.any()].columns.tolist()
null_op

['.Pav:', '.BSS:', '.CODE', 'retn', 'jnb', 'rtn', 'eip']

In [12]:
# Remove the all-zero columns collected in `null_op`.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone would otherwise raise KeyError.
ug.drop(columns=null_op, inplace=True, errors="ignore")
ug.head()

Unnamed: 0,ID,.text:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,.tls:,.reloc:,...,edx,esi,eax,ebx,ecx,edi,ebp,esp,Class,size
0,00000fa1585e99fcb5e8728b96f173ff61b08fc152e2f5...,55216,0,53660,0,0,0,26554,0,0,...,6114,12360,28050,7434,9660,11738,10124,4018,SmokeLoader,4.97094
1,000285eadc4e6a68f32343409159c1b44aed8a0bc1c0cd...,108948,0,35594,0,33524,0,17770,0,8190,...,8806,17483,43039,9501,16834,13807,16826,8436,Loki,6.326924
2,00037a0cc29f3c99e88aeb57af189e291c6fe38b8b2527...,311819,0,0,0,0,0,64656,0,7,...,14566,30109,84735,16436,15685,44207,11887,20710,njrat,13.3333
3,0004033aedd01d2928c1c31abb57633f1c493a213eae73...,800645,0,0,0,0,0,19250,0,14,...,59048,78108,202422,43124,46524,74276,29626,19958,Loki,30.803483
4,000410685bd62172ae00cafa761c1420f219d323deade1...,106442,0,52749,0,0,0,22424,0,5326,...,15134,10939,37113,6741,17573,11825,24366,8272,Loki,6.98138


In [13]:
# Count of missing values in each remaining column.
ug.isna().sum()

ID         0
.text:     0
.idata:    0
.data:     0
.bss:      0
.rdata:    0
.edata:    0
.rsrc:     0
.tls:      0
.reloc:    0
jmp        0
mov        0
retf       0
push       0
pop        0
xor        0
nop        0
sub        0
inc        0
dec        0
add        0
imul       0
xchg       0
or         0
shr        0
cmp        0
call       0
shl        0
ror        0
rol        0
jz         0
lea        0
movzx      0
edx        0
esi        0
eax        0
ebx        0
ecx        0
edi        0
ebp        0
esp        0
Class      0
size       0
dtype: int64

No null values are present in any column.

In [14]:
# Summary statistics for the numeric columns.
ug.describe()

Unnamed: 0,.text:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,.tls:,.reloc:,jmp,...,movzx,edx,esi,eax,ebx,ecx,edi,ebp,esp,size
count,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,...,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0,31324.0
mean,82409.42,108.893085,13908.88,0.02787,1883.237,3.316754,24552.17,0.12674,559.781286,1654.0423,...,82.968395,7530.171498,10326.16,24697.73,6886.316147,7935.006289,10468.68,6364.161,4030.785947,4.390388
std,210054.0,688.494662,70782.23,1.721417,21386.33,451.927348,159037.6,1.433483,3119.936228,3907.199323,...,350.465587,26664.000166,25297.78,56507.07,18359.181219,18721.528033,25979.99,15914.99,11536.918679,9.885021
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,20.0,49.0,5.0,7.0,14.0,6.0,1.0,0.008611
50%,16559.0,0.0,0.0,0.0,0.0,0.0,595.0,0.0,0.0,350.0,...,2.0,1737.0,2394.0,6958.5,1582.5,1746.0,2103.0,1544.0,657.0,1.071865
75%,63535.5,0.0,4561.0,0.0,0.0,0.0,15350.25,0.0,6.0,2170.0,...,64.0,7752.5,12860.5,29502.75,7954.5,10934.0,12635.75,9371.0,5090.0,5.578336
max,12547180.0,14934.0,3507668.0,273.0,2095617.0,77470.0,10842560.0,132.0,178516.0,168243.0,...,22266.0,843771.0,1121213.0,1977632.0,982545.0,821239.0,1247508.0,1155001.0,627893.0,456.585502


The 'ID' column is not used as a feature, so it is dropped as well.

In [20]:
# Drop the 'ID' column. errors="ignore" keeps the cell idempotent:
# re-running it after 'ID' is already gone would otherwise raise KeyError.
ug.drop(columns='ID', inplace=True, errors="ignore")

In [21]:
# Separate the target column from the feature columns.
y = ug["Class"]
X = ug.drop(columns="Class")

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# NOTE(review): the split is not stratified; given the uneven class counts,
# stratify=y may be worth considering (it would change every reported metric).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=36,
)

In [84]:
# Fit the scaler on the training split only, then apply that same
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [23]:
# Show the standardized training matrix with the original feature names as column labels.
pd.DataFrame(data=X_train_scaled, columns=X_train.columns)

Unnamed: 0,.text:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,.tls:,.reloc:,jmp,...,movzx,edx,esi,eax,ebx,ecx,edi,ebp,esp,size
0,-0.403635,-0.157240,-0.178618,-0.023966,-0.073876,-0.007865,-0.055563,-0.083233,-0.181270,-0.374169,...,-0.236208,-0.244672,-0.351344,-0.388618,-0.309821,-0.372958,-0.348847,-0.335182,-0.280375,-0.385084
1,-0.407747,11.660116,-0.078966,-0.023966,-0.100860,-0.007865,0.607450,-0.083233,7.655949,-0.098207,...,-0.230540,0.002019,0.859009,0.020936,0.117841,0.035900,0.114822,-0.026474,0.098276,0.124810
2,-0.407747,-0.157240,-0.187165,-0.023966,-0.101990,-0.007865,-0.151444,-0.083233,-0.181270,-0.428537,...,-0.236208,-0.285512,-0.407523,-0.441188,-0.370779,-0.426315,-0.406695,-0.392133,-0.345138,-0.448785
3,0.929803,-0.157240,-0.187165,-0.023966,-0.101990,-0.007865,-0.097091,-0.083233,-0.179677,0.341375,...,-0.213536,0.461029,0.493611,0.462263,0.448365,0.468792,0.570089,0.250030,0.424896,0.564298
4,-0.407747,-0.157240,-0.187165,-0.023966,-0.101990,-0.007865,-0.151444,-0.083233,-0.181270,-0.428537,...,-0.236208,-0.285512,-0.407523,-0.441188,-0.370779,-0.426315,-0.406695,-0.392133,-0.345138,-0.448785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25054,-0.303653,-0.157240,0.469780,-0.023966,-0.101990,-0.007865,-0.126838,-0.083233,-0.181270,-0.129642,...,-0.026496,-0.141408,-0.126823,-0.197388,-0.124657,-0.140355,-0.139578,-0.091919,-0.113076,-0.172210
25055,-0.376952,-0.157240,-0.134409,-0.023966,0.119420,-0.007865,-0.146774,-0.083233,-0.172667,-0.390402,...,-0.148356,-0.276712,-0.405567,-0.431688,-0.363479,-0.416861,-0.405269,-0.390177,-0.340894,-0.403831
25056,-0.407747,-0.157240,-0.187165,-0.023966,-0.101990,-0.007865,-0.151444,-0.083233,-0.181270,-0.428537,...,-0.236208,-0.285512,-0.407523,-0.441188,-0.370779,-0.426315,-0.406695,-0.392133,-0.345138,-0.448785
25057,-0.035324,-0.157240,0.513180,-0.023966,-0.101990,-0.007865,0.378123,-0.083233,0.777830,0.488245,...,0.506286,0.066775,0.285717,0.357353,0.217803,0.396528,0.261040,0.536189,0.350371,0.357780


In [26]:
# Number of samples per class label, most frequent first.
ug_vc = ug['Class'].value_counts()
ug_vc

Loki              8686
SnakeKeyLogger    7611
SmokeLoader       7579
njrat             3763
Amadey            2703
non-malicious      982
Name: Class, dtype: int64

In [27]:
# Score each feature with f_classif and keep the 12 highest-scoring ones.
# The selector is fit on the training split only.
best = SelectKBest(f_classif, k=12)
best.fit(X=X_train, y=y_train)

SelectKBest(k=12)

In [28]:
# Positional indices of the features retained by the selector.
best_cols = best.get_support(indices=True)
best_cols

array([ 0,  1,  2,  4,  7, 17, 21, 25, 30, 31, 32, 34], dtype=int64)

In [29]:
# Per-feature selector scores and p-values, in the column order of X.
scores = best.scores_
pvalues = best.pvalues_
cols = X.columns

# Map feature name -> score, ordered from highest to lowest score so the
# dict actually matches its name. sort_values places NaN scores last.
best_sorted = (
    pd.Series(scores, index=cols)
    .sort_values(ascending=False)
    .to_dict()
)

Naive Bayes  
Logistic Regression (OVR)  
Decision Tree  
Random Forest  
SVM  
AdaBoost  

In [85]:
def get_results(model, X_train, X_test, train_labels=None, test_labels=None):
    """Print classification reports for a fitted model on the train and test sets.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, X_test
        Feature matrices, in the same representation the model was fit on.
    train_labels, test_labels
        Optional true labels for the two sets. When omitted, the notebook-level
        ``y_train`` / ``y_test`` are used, so existing three-argument calls
        behave exactly as before.

    Returns
    -------
    None
        The two reports are printed.
    """
    # Fall back to the notebook-level split labels unless the caller supplies
    # its own, so the function is no longer tied to one particular split.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print("\t"*2, "*"*8, "RESULTS FOR TRAIN SET", "*"*8)
    print(classification_report(train_labels, y_pred_train))
    print("\n")
    print("\t"*2, "*"*8, "RESULTS FOR TEST SET", "*"*8)
    print(classification_report(test_labels, y_pred_test))

### Base Model - Naive Bayes

In [31]:
# Multinomial Naive Bayes with default hyperparameters.
clf_nb = MultinomialNB()

In [55]:
# Fit on the unscaled training features.
clf_nb.fit(X_train, y_train)

MultinomialNB()

In [86]:
# Report on the unscaled features, matching what the model was fit on.
get_results(clf_nb, X_train, X_test)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       0.61      0.02      0.04      2175
          Loki       0.18      0.18      0.18      6917
   SmokeLoader       0.63      0.51      0.57      6053
SnakeKeyLogger       0.28      0.39      0.33      6101
         njrat       0.16      0.13      0.14      3045
 non-malicious       0.26      0.70      0.38       768

      accuracy                           0.31     25059
     macro avg       0.35      0.32      0.27     25059
  weighted avg       0.35      0.31      0.30     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.70      0.01      0.03       528
          Loki       0.19      0.20      0.19      1769
   SmokeLoader       0.63      0.49      0.55      1526
SnakeKeyLogger       0.29      0.40      0.34      1510
         njrat       0.17      0.15      0.16       718
 non-maliciou

### Logistic Regression (OVR)

In [88]:
# Base logistic-regression estimator with the iteration cap set to 2000.
logre = LogisticRegression(max_iter=2000)

In [89]:
# Wrap the logistic regression in a one-vs-rest multiclass scheme.
clf_OVR_logre = OneVsRestClassifier(logre)

In [90]:
# Fit on the standardized training features.
clf_OVR_logre.fit(X_train_scaled, y_train)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=2000))

In [91]:
# Report on the standardized features, matching what the model was fit on.
get_results(clf_OVR_logre, X_train_scaled, X_test_scaled)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       0.68      0.03      0.06      2175
          Loki       0.37      0.79      0.51      6917
   SmokeLoader       0.62      0.58      0.60      6053
SnakeKeyLogger       0.60      0.40      0.48      6101
         njrat       0.55      0.07      0.13      3045
 non-malicious       0.92      0.41      0.57       768

      accuracy                           0.48     25059
     macro avg       0.63      0.38      0.39     25059
  weighted avg       0.55      0.48      0.44     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.48      0.03      0.05       528
          Loki       0.37      0.77      0.50      1769
   SmokeLoader       0.63      0.56      0.59      1526
SnakeKeyLogger       0.59      0.40      0.48      1510
         njrat       0.45      0.07      0.12       718
 non-maliciou

### Decision Tree

In [92]:
# Decision tree with default hyperparameters and a fixed seed.
clf_decte = DecisionTreeClassifier(random_state=0)

In [97]:
# Fit on the unscaled training features.
clf_decte.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [98]:
# Report on the unscaled features, matching what the model was fit on.
get_results(clf_decte, X_train, X_test)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       1.00      1.00      1.00      2175
          Loki       1.00      0.88      0.93      6917
   SmokeLoader       1.00      0.61      0.76      6053
SnakeKeyLogger       0.65      1.00      0.79      6101
         njrat       1.00      0.98      0.99      3045
 non-malicious       1.00      0.98      0.99       768

      accuracy                           0.87     25059
     macro avg       0.94      0.91      0.91     25059
  weighted avg       0.91      0.87      0.87     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.87      0.89      0.88       528
          Loki       0.84      0.74      0.78      1769
   SmokeLoader       0.91      0.54      0.68      1526
SnakeKeyLogger       0.55      0.85      0.67      1510
         njrat       0.86      0.83      0.84       718
 non-maliciou

### Random Forest

In [105]:
# Random forest with tree depth capped at 40 and a fixed seed.
clf_ranfo = RandomForestClassifier(max_depth=40, random_state=0)

In [106]:
# Fit on the unscaled training features.
clf_ranfo.fit(X_train, y_train)

RandomForestClassifier(max_depth=40, random_state=0)

In [107]:
# Report on the unscaled features, matching what the model was fit on.
get_results(clf_ranfo, X_train, X_test)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       1.00      1.00      1.00      2175
          Loki       1.00      0.88      0.93      6917
   SmokeLoader       1.00      0.61      0.76      6053
SnakeKeyLogger       0.65      1.00      0.79      6101
         njrat       1.00      0.98      0.99      3045
 non-malicious       1.00      0.98      0.99       768

      accuracy                           0.87     25059
     macro avg       0.94      0.91      0.91     25059
  weighted avg       0.91      0.87      0.87     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.95      0.90      0.92       528
          Loki       0.91      0.75      0.82      1769
   SmokeLoader       0.94      0.56      0.70      1526
SnakeKeyLogger       0.57      0.94      0.71      1510
         njrat       0.93      0.87      0.90       718
 non-maliciou

### SVM

In [109]:
# Support vector classifier with gamma='auto'; all other settings are left at their defaults.
clf_svm = SVC(gamma='auto')

In [110]:
# Fit on the standardized training features.
clf_svm.fit(X_train_scaled, y_train)

SVC(gamma='auto')

In [111]:
# Report on the standardized features, matching what the model was fit on.
get_results(clf_svm, X_train_scaled, X_test_scaled)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       0.80      0.80      0.80      2175
          Loki       0.39      0.84      0.54      6917
   SmokeLoader       0.89      0.51      0.65      6053
SnakeKeyLogger       0.67      0.38      0.48      6101
         njrat       0.75      0.18      0.29      3045
 non-malicious       0.94      0.59      0.73       768

      accuracy                           0.56     25059
     macro avg       0.74      0.55      0.58     25059
  weighted avg       0.68      0.56      0.55     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.78      0.80      0.79       528
          Loki       0.40      0.83      0.54      1769
   SmokeLoader       0.89      0.48      0.62      1526
SnakeKeyLogger       0.65      0.38      0.48      1510
         njrat       0.65      0.17      0.26       718
 non-maliciou

### AdaBoost

In [145]:
# Depth-limited decision tree that is handed to AdaBoost as its base estimator.
clf_decte_2 = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=0,
)

In [146]:
# Boost 8 copies of the depth-limited tree.
# The seed is set on the ensemble itself: AdaBoost re-seeds each base
# estimator from its own random state, so the random_state given to
# clf_decte_2 alone does not make the boosted fit reproducible.
clf_adab = AdaBoostClassifier(
    clf_decte_2,
    n_estimators=8,
    learning_rate=1.0,
    random_state=0,
)

In [147]:
# Fit on the standardized training features.
clf_adab.fit(X_train_scaled, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=15,
                                                         random_state=0),
                   n_estimators=8)

In [148]:
# Report on the standardized features, matching what the model was fit on.
get_results(clf_adab, X_train_scaled, X_test_scaled)

		 ******** RESULTS FOR TRAIN SET ********
                precision    recall  f1-score   support

        Amadey       1.00      0.99      0.99      2175
          Loki       0.99      0.87      0.93      6917
   SmokeLoader       1.00      0.61      0.76      6053
SnakeKeyLogger       0.65      0.99      0.78      6101
         njrat       0.99      0.98      0.99      3045
 non-malicious       1.00      0.98      0.99       768

      accuracy                           0.87     25059
     macro avg       0.94      0.90      0.91     25059
  weighted avg       0.91      0.87      0.87     25059



		 ******** RESULTS FOR TEST SET ********
                precision    recall  f1-score   support

        Amadey       0.87      0.89      0.88       528
          Loki       0.83      0.74      0.78      1769
   SmokeLoader       0.93      0.54      0.68      1526
SnakeKeyLogger       0.54      0.87      0.67      1510
         njrat       0.88      0.82      0.85       718
 non-maliciou