Microsoft_Malware_Prediction.py

# -*- coding: utf-8 -*-
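"""
Microsoft Malware Prediction (Kaggle).

Trains a random forest on train.csv and writes the predicted HasDetections
probabilities for test.csv to submission.csv.

Exported from an IPython/Spyder session: lines starting with %time are
IPython magics, so the script has to be run under IPython/Jupyter.
"""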
#Importing Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Column name -> dtype mapping passed to pd.read_csv so the CSV is parsed
# straight into compact dtypes (small ints/floats and 'category') instead of
# the pandas defaults.
# NOTE(review): float16 represents integers exactly only up to 2048, so any
# identifier column stored as float16 is rounded if its values go beyond
# that -- confirm the value ranges against the data.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }
# Load the training data (downloaded from Kaggle) with the compact dtypes
# defined above. %time is an IPython magic that reports how long the
# statement takes.
%time train_df = pd.read_csv('train.csv', dtype=dtypes)
# Work on a copy of the loaded frame: reading the CSV is slow, so the
# untouched original is kept around in case the raw data is needed again.
%time train_df_copy = train_df.copy()
# Keep only the columns whose dtype is not 'category' in a separate frame.
%time train_df_Noncategory = train_df_copy.select_dtypes(exclude='category')
# Count the NaN values in each non-categorical column.
%time train_df_Noncategory.isnull().sum()
# Function to fill NaN values with the most frequent value of the column
def fill_with_max(df):
    """Fill the NaN entries of a Series in place with its most frequent value.

    Args:
        df: the pandas Series to modify (a single column, despite the name).

    Returns:
        The same Series object, so the result can also be assigned back to
        its parent frame (``frame[col] = fill_with_max(frame[col])``).
        A Series with no non-NaN values is returned unchanged.
    """
    counts = df.value_counts()
    if counts.empty:
        # No non-NaN value exists, so there is nothing to fill with.
        return df
    # idxmax() returns the *label* of the highest count. argmax() returns its
    # integer position in current pandas, which is always 0 because
    # value_counts() sorts in descending order.
    most_frequent = counts.idxmax()
    df.fillna(most_frequent, inplace=True)
    return df
# Choose some columns that have lowest number of Nan
# There is no logic behind this selection. 
fill_with_max(train_df_Noncategory['AVProductStatesIdentifier'])
fill_with_max(train_df_Noncategory['AVProductsInstalled'])
fill_with_max(train_df_Noncategory['AVProductsEnabled'])
fill_with_max(train_df_Noncategory['Firewall'])
fill_with_max(train_df_Noncategory['Census_FirmwareManufacturerIdentifier'])
fill_with_max(train_df_Noncategory['Census_FirmwareVersionIdentifier'])
#Create NoNull Non-categorical dataframe that will be used in model
%time train_df_Noncategory_NoNull = train_df_Noncategory[['IsBeta','IsSxsPassiveMode','AVProductStatesIdentifier','AVProductsInstalled','AVProductsEnabled','HasTpm','CountryIdentifier','LocaleEnglishNameIdentifier','OsBuild','OsSuite','AutoSampleOptIn','Firewall','Census_HasOpticalDiskDrive','Census_OSBuildNumber','Census_OSBuildRevision','Census_OSUILocaleIdentifier','Census_IsPortableOperatingSystem','Census_FirmwareManufacturerIdentifier','Census_FirmwareVersionIdentifier','Census_IsSecureBootEnabled','Census_IsTouchEnabled','Census_IsPenCapable']].copy()
# Split the categorical data and copy it to another dataframe
train_df_category = train_df_copy.select_dtypes(include='category')
# Check the number of Nan values of each columns
train_df_category.isnull().sum()
# As i have limited computational power. I choose some of the Nan value included column and fill the Nan values.
fill_with_max(train_df_category['Census_PrimaryDiskTypeName'])
fill_with_max(train_df_category['Census_PowerPlatformRoleName'])
# Create the NoNull Categorical dataframe to use in model
train_df_category.drop(['MachineIdentifier','EngineVersion','AppVersion','OsVer','OsBuildLab','AvSigVersion','PuaMode','SmartScreen','Census_ProcessorClass','Census_ChassisTypeName','Census_InternalBatteryType','Census_OSVersion', 'Census_OSBranch', 'Census_OSEdition','Census_OSSkuName'],axis =1,inplace=True)
# Join the Categorical and non-categorical dataframes.
%time train_df_final = train_df_category.join(train_df_Noncategory_NoNull)

# Replace every categorical column of the joined frame by integer codes,
# using a fresh LabelEncoder per column, and report the elapsed seconds.
start = time.time()
for i, column_name in enumerate(train_df_category.columns):
    labelencoder_X_i = LabelEncoder()
    encoded_column = labelencoder_X_i.fit_transform(train_df_final[column_name])
    train_df_final[column_name] = encoded_column
    # Progress output: name of the column that was just encoded.
    print(train_df_final.columns[i])
end = time.time()
print(end-start)

# Show the dtype of every column of the final frame.
train_df_final.info()

# Store the label-encoded categorical columns as int8 to lower the
# computation time. int8 only holds values up to 127.
for i, column_name in enumerate(train_df_category.columns):
    narrowed_column = train_df_final[column_name].astype(np.int8)
    train_df_final[column_name] = narrowed_column
# One-hot encode the 15 label-encoded categorical columns (positions 0-14)
# and keep the remaining columns unchanged to their right.
# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22 (and `sparse` in 1.4); ColumnTransformer is the
# replacement and produces the same layout: encoded columns first, then the
# untouched ones. It requires scikit-learn >= 0.20.
from sklearn.compose import ColumnTransformer

start = time.time()
onehotencoder = ColumnTransformer(
    [('onehot', OneHotEncoder(categories='auto'),
      [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14])],
    remainder='passthrough',
    # sparse_threshold=0 forces a dense ndarray, like sparse=False did.
    sparse_threshold=0,
)
train_df_final_coded = onehotencoder.fit_transform(train_df_final)
end = time.time()
print(end-start)
# Create the feature and target arrays before feeding the model.
# NOTE(review): X is not referenced again in the visible part of this script;
# the split below uses the one-hot encoded train_df_final_coded instead.
%time X = train_df_final.iloc[:].values
%time y = train_df_copy['HasDetections'].values
# Split the encoded features and the target into train (80%) and test (20%)
# parts; random_state=0 makes the split repeatable.
%time X_train, X_test, y_train, y_test = train_test_split(train_df_final_coded, y, test_size = 0.2, random_state = 0)
# Import the model class. A RandomForest from sklearn is used.
from sklearn.ensemble import RandomForestClassifier
# Classifier with 40 trees, at least 100 samples per leaf, half of the
# features considered per split, and all CPU cores (n_jobs=-1).
# Other parameters can be used, but be ready for long computation times.
# NOTE(review): no random_state is set, so results can differ between runs.
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=100, max_features=0.5, n_jobs=-1, oob_score=False)
# Fit the model to the training part of the split.
%time m.fit(X_train, y_train)
# Confusion matrix of the predictions on the held-out part, to see how good the model is.
cm = confusion_matrix(y_test, m.predict(X_test))
# Display the confusion matrix (notebook-style bare expression).
cm
# Before Submission we need to apply model to TEST Data
# Import the test data 
%time X_submission_test = pd.read_csv('test.csv', dtype=dtypes)
# Next 12 lines are the same process that we did on train set.
%time test_df_copy = X_submission_test.copy()
%time test_df_Noncategory = test_df_copy.select_dtypes(exclude='category')
%time test_df_category = test_df_copy.select_dtypes(include='category')
fill_with_max(test_df_Noncategory['AVProductStatesIdentifier'])
fill_with_max(test_df_Noncategory['AVProductsInstalled'])
fill_with_max(test_df_Noncategory['AVProductsEnabled'])
fill_with_max(test_df_Noncategory['Firewall'])
fill_with_max(test_df_Noncategory['Census_FirmwareManufacturerIdentifier'])
fill_with_max(test_df_Noncategory['Census_FirmwareVersionIdentifier'])

%time test_df_Noncategory_NoNull = test_df_Noncategory[['IsBeta','IsSxsPassiveMode','AVProductStatesIdentifier','AVProductsInstalled','AVProductsEnabled','HasTpm','CountryIdentifier','LocaleEnglishNameIdentifier','OsBuild','OsSuite','AutoSampleOptIn','Firewall','Census_HasOpticalDiskDrive','Census_OSBuildNumber','Census_OSBuildRevision','Census_OSUILocaleIdentifier','Census_IsPortableOperatingSystem','Census_FirmwareManufacturerIdentifier','Census_FirmwareVersionIdentifier','Census_IsSecureBootEnabled','Census_IsTouchEnabled','Census_IsPenCapable']].copy()
fill_with_max(test_df_category['Census_PrimaryDiskTypeName'])
fill_with_max(test_df_category['Census_PowerPlatformRoleName'])
test_df_category.drop(['MachineIdentifier','EngineVersion','AppVersion','OsVer','OsBuildLab','AvSigVersion','PuaMode','SmartScreen','Census_ProcessorClass','Census_ChassisTypeName','Census_InternalBatteryType','Census_OSVersion', 'Census_OSBranch','Census_OSEdition','Census_OSSkuName'],axis =1,inplace=True)
# Test data have some differencies with Train data
# 2 categorical columns have 1 more category than train data. As we have more categories when we apply 
# OneHotEncoding we have more columns than train data. This situation causes an eror 
# when we try to apply our model to test data.
# So we need to decrease category number. I look the data as extra actegory has small number of repetation, 
# I change the category to the nearst value in the column. 
test_df_category['Census_MDC2FormFactor'].replace('Other','IoTOther',inplace=True)
test_df_category['Census_FlightRing'].replace('CBCanary','Canary',inplace=True)
test_df_category['Census_GenuineStateName'].fillna('IS_GENUINE',inplace=True)
# Same as train set create the final dataframe.
%time test_df_final = test_df_category.join(test_df_Noncategory_NoNull)

# Label-encode the categorical test columns with the mapping of the TRAINING
# data. Fitting a new encoder on the test set can assign different codes to
# the same label whenever the two sets do not contain exactly the same
# labels, which would feed the model misaligned features.
start = time.time()
for i, column_name in enumerate(test_df_category.columns):
    labelencoder_X_test_i = LabelEncoder()
    # The distinct training labels are enough to rebuild the mapping: a
    # LabelEncoder sorts the unique labels it is given.
    labelencoder_X_test_i.fit(np.asarray(train_df_category[column_name].unique()))
    # transform() raises ValueError for a label that never occurred in
    # training instead of silently encoding it.
    test_df_final[column_name] = labelencoder_X_test_i.transform(test_df_final[column_name])
    print(column_name)
end = time.time()
print(end-start)

# Store the encoded categorical columns as int8, as was done for the train set.
for i, column_name in enumerate(test_df_category.columns):
    test_df_final[column_name] = test_df_final[column_name].astype(np.int8)

start = time.time()
# Reuse the one-hot encoder that was fitted on the training data (transform,
# not fit_transform) so the test matrix has exactly the columns the model
# was trained on.
test_df_final_coded = onehotencoder.transform(test_df_final)
end = time.time()
print(end-start)
# Predict with the trained model. predict_proba is used because the
# competition asks for the probability that a machine is infected by malware.
pred_prob = m.predict_proba(test_df_final_coded)
# The last column of pred_prob is taken as the infection probability
# (presumably the probability of class 1 -- verify against m.classes_).
infection_probability = pred_prob[:, -1]
test_df_copy['HasDetections'] = infection_probability
# The submission format consists of MachineIdentifier and HasDetections only.
submission_columns = ['MachineIdentifier', 'HasDetections']
test_df_copy = test_df_copy.loc[:, submission_columns]
# Write the submission file without the index column.
test_df_copy.to_csv('submission.csv', index=False)