In [1]:
#Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

#specific imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Switch on seaborn's default plot styling; it applies globally to any
# matplotlib figure created after this point.
sns.set()

In [2]:
# Load the two sample sets, label them, and build the combined frame.
#
# The shuffles are seeded: without a fixed seed the row order (and therefore
# every downstream split and score) changes on each Restart & Run All.
SHUFFLE_SEED = 12

# One CSV per class.
df1 = pd.read_csv('ransom_csv.csv')
df2 = pd.read_csv('benign_csv.csv')

# Attach the class label; it is added last, so it ends up as the final column.
df1['label'] = 'Ransomware'
df2['label'] = 'Benign'

# Drop the extra first column (presumably a saved row index -- confirm against
# the CSVs) and the string-valued 'Name' column, which is not a usable feature.
df1 = df1.drop(df1.columns[0], axis=1).drop('Name', axis=1)
df2 = df2.drop(df2.columns[0], axis=1).drop('Name', axis=1)

# Combined frame; the original per-file row indices are kept as-is.
frames = [df1, df2]
df3 = pd.concat(frames)

# Shuffle reproducibly.
df3 = df3.sample(frac=1, random_state=SHUFFLE_SEED)
df1 = df1.sample(frac=1, random_state=SHUFFLE_SEED)
df2 = df2.sample(frac=1, random_state=SHUFFLE_SEED)

In [3]:
# Preview the first rows of the combined, shuffled frame as a sanity check
# on the load/label step above.
df3.head()
# Disabled alternative view: summary statistics of the benign frame.
#df2.describe()

Unnamed: 0,READ_CALENDAR,WRITE_CALENDAR,CAMERA,READ_CONTACTS,WRITE_CONTACTS,GET_ACCOUNTS,ACCESS_FINE_LOCATION,ACCESS_COARSE_LOCATION,RECORD_AUDIO,READ_PHONE_STATE,...,shareIntent,readPhoneNumber,FileInputStream,getSharedPreferences,telephony,getMessageBody,getDisplayOriginatingAddress,wallpaper,crypto,label
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Benign
1311,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Ransomware
1380,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Ransomware
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,13,0,0,0,0,5,Benign
750,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Benign


In [4]:
# Hold-out validation: reserve 20% of the rows for a final check of the
# chosen model; the remaining 80% is used for training / cross-validation.
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the label.
array = df3.values
X, y = array[:, :-1], array[:, -1]

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=12,
)

In [5]:
# Spot-check several classifiers with 10-fold stratified cross-validation on
# the training split and print mean accuracy (std) for each.
#
# Estimators that draw random numbers (CART, RF) are seeded so the printed
# scores are reproducible across kernel restarts.
models = [
    # The deprecated `multi_class` argument is omitted: the labels are
    # two-class, and liblinear already resolves the default to one-vs-rest.
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=1)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('RF', RandomForestClassifier(random_state=1)),
]

# The splitter is loop-invariant: with an integer random_state it yields the
# same folds on every call, so all models are scored on identical folds.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Evaluate each model in turn; `results` and `names` stay index-aligned.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))



LR: 0.980533 (0.007019)
LDA: 0.974186 (0.007588)
KNN: 0.980529 (0.005740)
CART: 0.985963 (0.004272)
NB: 0.916226 (0.014301)
SVM: 0.977357 (0.009701)
RF: 0.987320 (0.004440)


In [6]:
# Fit a random forest (the top scorer in the recorded spot check) on the full
# training split and predict the held-out validation rows.
# random_state is fixed so the reported validation metrics are reproducible;
# an unseeded forest gives slightly different predictions on every run.
model = RandomForestClassifier(random_state=1)
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [7]:
# Score the validation predictions: overall accuracy, then the confusion
# matrix, then the per-class precision/recall/F1 report.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

0.9945750452079566
[[194   0]
 [  3 356]]
              precision    recall  f1-score   support

      Benign       0.98      1.00      0.99       194
  Ransomware       1.00      0.99      1.00       359

    accuracy                           0.99       553
   macro avg       0.99      1.00      0.99       553
weighted avg       0.99      0.99      0.99       553

