-
Notifications
You must be signed in to change notification settings - Fork 0
/
Microsoft_Malware_Prediction.py
228 lines (219 loc) · 15 KB
/
Microsoft_Malware_Prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#Importing Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Explicit per-column dtypes handed to pd.read_csv so that every column of
# the Kaggle CSV files is parsed straight into a compact type.
# NOTE(review): float16 represents integers exactly only up to 2048 and int8
# spans -128..127 -- confirm that the identifier columns fit these ranges.
dtypes = dict(
    MachineIdentifier='category',
    ProductName='category',
    EngineVersion='category',
    AppVersion='category',
    AvSigVersion='category',
    IsBeta='int8',
    RtpStateBitfield='float16',
    IsSxsPassiveMode='int8',
    DefaultBrowsersIdentifier='float16',
    AVProductStatesIdentifier='float32',
    AVProductsInstalled='float16',
    AVProductsEnabled='float16',
    HasTpm='int8',
    CountryIdentifier='int16',
    CityIdentifier='float32',
    OrganizationIdentifier='float16',
    GeoNameIdentifier='float16',
    LocaleEnglishNameIdentifier='int8',
    Platform='category',
    Processor='category',
    OsVer='category',
    OsBuild='int16',
    OsSuite='int16',
    OsPlatformSubRelease='category',
    OsBuildLab='category',
    SkuEdition='category',
    IsProtected='float16',
    AutoSampleOptIn='int8',
    PuaMode='category',
    SMode='float16',
    IeVerIdentifier='float16',
    SmartScreen='category',
    Firewall='float16',
    UacLuaenable='float32',
    Census_MDC2FormFactor='category',
    Census_DeviceFamily='category',
    Census_OEMNameIdentifier='float16',
    Census_OEMModelIdentifier='float32',
    Census_ProcessorCoreCount='float16',
    Census_ProcessorManufacturerIdentifier='float16',
    Census_ProcessorModelIdentifier='float16',
    Census_ProcessorClass='category',
    Census_PrimaryDiskTotalCapacity='float32',
    Census_PrimaryDiskTypeName='category',
    Census_SystemVolumeTotalCapacity='float32',
    Census_HasOpticalDiskDrive='int8',
    Census_TotalPhysicalRAM='float32',
    Census_ChassisTypeName='category',
    Census_InternalPrimaryDiagonalDisplaySizeInInches='float16',
    Census_InternalPrimaryDisplayResolutionHorizontal='float16',
    Census_InternalPrimaryDisplayResolutionVertical='float16',
    Census_PowerPlatformRoleName='category',
    Census_InternalBatteryType='category',
    Census_InternalBatteryNumberOfCharges='float32',
    Census_OSVersion='category',
    Census_OSArchitecture='category',
    Census_OSBranch='category',
    Census_OSBuildNumber='int16',
    Census_OSBuildRevision='int32',
    Census_OSEdition='category',
    Census_OSSkuName='category',
    Census_OSInstallTypeName='category',
    Census_OSInstallLanguageIdentifier='float16',
    Census_OSUILocaleIdentifier='int16',
    Census_OSWUAutoUpdateOptionsName='category',
    Census_IsPortableOperatingSystem='int8',
    Census_GenuineStateName='category',
    Census_ActivationChannel='category',
    Census_IsFlightingInternal='float16',
    Census_IsFlightsDisabled='float16',
    Census_FlightRing='category',
    Census_ThresholdOptIn='float16',
    Census_FirmwareManufacturerIdentifier='float16',
    Census_FirmwareVersionIdentifier='float32',
    Census_IsSecureBootEnabled='int8',
    Census_IsWIMBootEnabled='float16',
    Census_IsVirtualDevice='float16',
    Census_IsTouchEnabled='int8',
    Census_IsPenCapable='int8',
    Census_IsAlwaysOnAlwaysConnectedCapable='float16',
    Wdft_IsGamer='float16',
    Wdft_RegionIdentifier='float16',
    HasDetections='int8',
)
# Load the training data (downloaded from Kaggle) and time the read. The
# dtype map is passed so columns are parsed directly into the listed types.
%time train_df = pd.read_csv('train.csv', dtype=dtypes)
# Work on a copy of the loaded frame. Reading the CSV takes a long time, so
# the untouched original is kept in case the raw data is needed again.
%time train_df_copy = train_df.copy()
# Put every column whose dtype is not 'category' into its own dataframe.
%time train_df_Noncategory = train_df_copy.select_dtypes(exclude='category')
# Count the NaN values in each non-categorical column.
%time train_df_Noncategory.isnull().sum()
# Function to fill NaN values with the most frequent value of the column
def fill_with_max(df):
    """Fill the missing values of a column in place with its most frequent value.

    Parameters
    ----------
    df : pandas.Series
        The column to fill (the callers in this file pass a single column,
        despite the parameter name). It is modified in place.

    Returns
    -------
    None
        A column that holds no non-missing value at all is left unchanged,
        because there is nothing to fill it with.
    """
    if df.count() == 0:
        # count() only counts non-missing entries: nothing observed, nothing to do.
        return
    # idxmax() returns the value itself (the index label of the highest count).
    # argmax() returns an integer position on current pandas; value_counts()
    # sorts descending, so that position is always 0 and every gap would be
    # filled with 0 instead of the most frequent value.
    most_frequent = df.value_counts().idxmax()
    df.fillna(most_frequent, inplace=True)
# Choose some columns that have lowest number of Nan
# There is no logic behind this selection.
fill_with_max(train_df_Noncategory['AVProductStatesIdentifier'])
fill_with_max(train_df_Noncategory['AVProductsInstalled'])
fill_with_max(train_df_Noncategory['AVProductsEnabled'])
fill_with_max(train_df_Noncategory['Firewall'])
fill_with_max(train_df_Noncategory['Census_FirmwareManufacturerIdentifier'])
fill_with_max(train_df_Noncategory['Census_FirmwareVersionIdentifier'])
#Create NoNull Non-categorical dataframe that will be used in model
%time train_df_Noncategory_NoNull = train_df_Noncategory[['IsBeta','IsSxsPassiveMode','AVProductStatesIdentifier','AVProductsInstalled','AVProductsEnabled','HasTpm','CountryIdentifier','LocaleEnglishNameIdentifier','OsBuild','OsSuite','AutoSampleOptIn','Firewall','Census_HasOpticalDiskDrive','Census_OSBuildNumber','Census_OSBuildRevision','Census_OSUILocaleIdentifier','Census_IsPortableOperatingSystem','Census_FirmwareManufacturerIdentifier','Census_FirmwareVersionIdentifier','Census_IsSecureBootEnabled','Census_IsTouchEnabled','Census_IsPenCapable']].copy()
# Split the categorical data and copy it to another dataframe
train_df_category = train_df_copy.select_dtypes(include='category')
# Check the number of Nan values of each columns
train_df_category.isnull().sum()
# As i have limited computational power. I choose some of the Nan value included column and fill the Nan values.
fill_with_max(train_df_category['Census_PrimaryDiskTypeName'])
fill_with_max(train_df_category['Census_PowerPlatformRoleName'])
# Create the NoNull Categorical dataframe to use in model
train_df_category.drop(['MachineIdentifier','EngineVersion','AppVersion','OsVer','OsBuildLab','AvSigVersion','PuaMode','SmartScreen','Census_ProcessorClass','Census_ChassisTypeName','Census_InternalBatteryType','Census_OSVersion', 'Census_OSBranch', 'Census_OSEdition','Census_OSSkuName'],axis =1,inplace=True)
# Join the Categorical and non-categorical dataframes.
%time train_df_final = train_df_category.join(train_df_Noncategory_NoNull)
#Label Encoding to categorical columns
start = time.time()
for i in range(0,len(train_df_category.columns)):
labelencoder_X_i = LabelEncoder()
train_df_final[train_df_category.columns[i]] = labelencoder_X_i.fit_transform(train_df_final[train_df_category.columns[i]])
print(train_df_final.columns[i])
end = time.time()
print(end-start)
# Check the data type of the columns
train_df_final.info()
#MAke categorical columns int8 to lower the computation time.
for i in range(0,len(train_df_category.columns)):
train_df_final[train_df_category.columns[i]] = train_df_final[train_df_category.columns[i]].astype(np.int8)
# OneHotEncoding to categorical columns
start = time.time()
onehotencoder = OneHotEncoder(categorical_features=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],sparse = False)
train_df_final_coded = onehotencoder.fit_transform(train_df_final)
end = time.time()
print(end-start)
# Create X and y arrays before feed the model
%time X = train_df_final.iloc[:].values
%time y = train_df_copy['HasDetections'].values
# Split arrays to Train and Test
%time X_train, X_test, y_train, y_test = train_test_split(train_df_final_coded, y, test_size = 0.2, random_state = 0)
# Import necessary libraries for model. We use RandomForest from sklearn.
from sklearn.ensemble import RandomForestClassifier
# Classifier model with parameters. You can use other parameters. But be ready for long computational time.
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=100, max_features=0.5, n_jobs=-1, oob_score=False)
# fit the model to train values.
%time m.fit(X_train, y_train)
# Confusion matrix to see how good our model.
cm = confusion_matrix(y_test, m.predict(X_test))
# Check the Confusion matrix
cm
# Before Submission we need to apply model to TEST Data
# Import the test data
%time X_submission_test = pd.read_csv('test.csv', dtype=dtypes)
# Next 12 lines are the same process that we did on train set.
%time test_df_copy = X_submission_test.copy()
%time test_df_Noncategory = test_df_copy.select_dtypes(exclude='category')
%time test_df_category = test_df_copy.select_dtypes(include='category')
fill_with_max(test_df_Noncategory['AVProductStatesIdentifier'])
fill_with_max(test_df_Noncategory['AVProductsInstalled'])
fill_with_max(test_df_Noncategory['AVProductsEnabled'])
fill_with_max(test_df_Noncategory['Firewall'])
fill_with_max(test_df_Noncategory['Census_FirmwareManufacturerIdentifier'])
fill_with_max(test_df_Noncategory['Census_FirmwareVersionIdentifier'])
%time test_df_Noncategory_NoNull = test_df_Noncategory[['IsBeta','IsSxsPassiveMode','AVProductStatesIdentifier','AVProductsInstalled','AVProductsEnabled','HasTpm','CountryIdentifier','LocaleEnglishNameIdentifier','OsBuild','OsSuite','AutoSampleOptIn','Firewall','Census_HasOpticalDiskDrive','Census_OSBuildNumber','Census_OSBuildRevision','Census_OSUILocaleIdentifier','Census_IsPortableOperatingSystem','Census_FirmwareManufacturerIdentifier','Census_FirmwareVersionIdentifier','Census_IsSecureBootEnabled','Census_IsTouchEnabled','Census_IsPenCapable']].copy()
fill_with_max(test_df_category['Census_PrimaryDiskTypeName'])
fill_with_max(test_df_category['Census_PowerPlatformRoleName'])
test_df_category.drop(['MachineIdentifier','EngineVersion','AppVersion','OsVer','OsBuildLab','AvSigVersion','PuaMode','SmartScreen','Census_ProcessorClass','Census_ChassisTypeName','Census_InternalBatteryType','Census_OSVersion', 'Census_OSBranch','Census_OSEdition','Census_OSSkuName'],axis =1,inplace=True)
# Test data have some differencies with Train data
# 2 categorical columns have 1 more category than train data. As we have more categories when we apply
# OneHotEncoding we have more columns than train data. This situation causes an eror
# when we try to apply our model to test data.
# So we need to decrease category number. I look the data as extra actegory has small number of repetation,
# I change the category to the nearst value in the column.
test_df_category['Census_MDC2FormFactor'].replace('Other','IoTOther',inplace=True)
test_df_category['Census_FlightRing'].replace('CBCanary','Canary',inplace=True)
test_df_category['Census_GenuineStateName'].fillna('IS_GENUINE',inplace=True)
# Same as train set create the final dataframe.
%time test_df_final = test_df_category.join(test_df_Noncategory_NoNull)
# Encode the test categoricals with the mapping learned from the TRAINING
# data. Fitting fresh encoders on the test set would assign integer codes
# (and therefore one-hot columns) independently of the ones the model was
# trained on, so the same category could end up in a different column.
start = time.time()
for column in test_df_category.columns:
    # train_df_category still holds the original category values. A
    # LabelEncoder only needs the distinct values to learn its mapping, so
    # fitting on them gives the same codes that were used for training.
    train_encoder = LabelEncoder().fit(np.asarray(train_df_category[column].unique()))
    # transform() raises ValueError for a value that never occurred in the
    # training data, instead of silently giving it an unrelated code.
    test_df_final[column] = train_encoder.transform(test_df_final[column])
    print(column)
end = time.time()
print(end-start)
# Store the encoded categorical columns as int8, as was done for the train set.
for column in test_df_category.columns:
    test_df_final[column] = test_df_final[column].astype(np.int8)
start = time.time()
# Reuse the one-hot encoder that was fitted on the training frame, so that
# the test matrix gets exactly the columns the model was trained on.
test_df_final_coded = onehotencoder.transform(test_df_final)
end = time.time()
print(end-start)
# Predict with the trained model. predict_proba is used because the
# competition asks for the probability that a machine is infected by malware.
pred_prob = m.predict_proba(test_df_final_coded)
# The last column of pred_prob is the probability of infection (class 1).
# It is stored as HasDetections to match the submission format.
infection_probability = pred_prob[:, -1]
test_df_copy['HasDetections'] = infection_probability
# The submission format consists of MachineIdentifier and HasDetections only.
submission_columns = ['MachineIdentifier', 'HasDetections']
test_df_copy = test_df_copy[submission_columns]
# Write the submission file that is uploaded to the competition.
test_df_copy.to_csv('submission.csv', index=False)