In [164]:
# import packages
import numpy as np
import pandas as pd
import keras.models
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [165]:
# Reading the csv file into a pandas dataframe
dataframe = pd.read_csv('data.csv', sep="|", low_memory = False)
dataframe.head()

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18,1


In [166]:
dataframe.shape

(138047, 57)

In [167]:
# These features have a different dtype and aren't very useful for prediction so they can be removed
X = dataframe.drop(['Name','md5','legitimate'], axis=1)
X.head()

Unnamed: 0,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,...,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
0,332,224,258,9,0,361984,115712,0,6135,4096,...,0,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16
1,332,224,3330,9,0,130560,19968,0,81778,4096,...,0,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18
2,332,224,3330,9,0,517120,621568,0,350896,4096,...,1,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18
3,332,224,258,9,0,585728,369152,0,451258,4096,...,1,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18
4,332,224,258,9,0,294912,247296,0,217381,4096,...,1,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18


In [168]:
categorical_columns = ['ExportNb','SizeOfOptionalHeader', 'MajorLinkerVersion','Machine', 'MinorLinkerVersion', 
                      'VersionInformationSize']
X_categorical = X[categorical_columns]
X = X.drop(categorical_columns, axis=1)

In [169]:
# Storing the labels into a seperate list
y = dataframe['legitimate'].values
print(y)

[1 1 1 ..., 0 0 0]


In [170]:
from sklearn.decomposition import PCA
from sklearn import model_selection, preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [174]:
# Apply one hot encoding ot the categorical features
hot_enc = OneHotEncoder()
X_one_hot = hot_enc.fit_transform(X_categorical)
# X_one_hot.shape
print(X_one_hot)

  (0, 791)	1.0
  (0, 719)	1.0
  (0, 716)	1.0
  (0, 684)	1.0
  (0, 670)	1.0
  (0, 0)	1.0
  (1, 793)	1.0
  (1, 719)	1.0
  (1, 716)	1.0
  (1, 684)	1.0
  (1, 670)	1.0
  (1, 0)	1.0
  (2, 793)	1.0
  (2, 719)	1.0
  (2, 716)	1.0
  (2, 684)	1.0
  (2, 670)	1.0
  (2, 1)	1.0
  (3, 793)	1.0
  (3, 719)	1.0
  (3, 716)	1.0
  (3, 684)	1.0
  (3, 670)	1.0
  (3, 1)	1.0
  (4, 793)	1.0
  :	:
  (138042, 0)	1.0
  (138043, 790)	1.0
  (138043, 743)	1.0
  (138043, 716)	1.0
  (138043, 677)	1.0
  (138043, 670)	1.0
  (138043, 0)	1.0
  (138044, 789)	1.0
  (138044, 719)	1.0
  (138044, 716)	1.0
  (138044, 685)	1.0
  (138044, 670)	1.0
  (138044, 0)	1.0
  (138045, 781)	1.0
  (138045, 743)	1.0
  (138045, 716)	1.0
  (138045, 677)	1.0
  (138045, 670)	1.0
  (138045, 0)	1.0
  (138046, 781)	1.0
  (138046, 719)	1.0
  (138046, 716)	1.0
  (138046, 686)	1.0
  (138046, 670)	1.0
  (138046, 0)	1.0


In [172]:
# It's best to normalize the data before applying pca so that all of the values are in the range 0 to 1
X_normalize = pd.DataFrame(preprocessing.normalize(X))
X_normalize.shape

(138047, 48)

In [173]:
X = pd.concat([X_normalize, X_one_hot])

TypeError: cannot concatenate a non-NDFrame object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_one_hot,y,test_size=0.2)

In [None]:
model = Sequential()

model.add(Dense(64,input_dim=55, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
num_epochs = 10
model.fit(X_train, y_train, batch_size=32, initial_epoch=num_epochs)

In [None]:
val_score = model.evaluate(X_test, y_test, batch_size=128)
print(val_score)