<a href="https://colab.research.google.com/github/vijayrgopu/ml-kaggle/blob/master/MicrosoftMalwareDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Microsoft Malware Detection**

https://www.kaggle.com/c/microsoft-malware-prediction

In [0]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import missingno.missingno as ms
import lightgbm as lgb 
import gc

# **Download Data from Kaggle**



In [0]:
if not os.path.isfile('./train.csv') and not os.path.isfile('./test.csv'):
  # Install Kaggle API for download competition data
  !pip3 install -q kaggle
  # enter your Kaggle credentionals here
  os.environ['KAGGLE_USERNAME']="Your Kaggel User"
  os.environ['KAGGLE_KEY']="Your Kaggle Key"
  # If you are unable to download the competition dataset, check to see if you have 
  # accepted the user agreement on the competition website. 
  !kaggle competitions download -c microsoft-malware-prediction -q
  !unzip test.csv.zip 
  !unzip train.csv.zip
  !chmod 777 *.csv

In [0]:
def lines_in_file(file_name):
  """Return the number of lines in the text file ``file_name``.

  The count includes the header line, if the file has one. The file is
  streamed line by line, so memory use is independent of file size.

  Raises:
    OSError: if ``file_name`` cannot be opened for reading.
  """
  # Open the file that was asked for (not a fixed path) and count lazily.
  with open(file_name, 'r') as file:
    return sum(1 for _ in file)

In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (int8, category, object, ...) is left alone.
    Integer columns move to the first of int8/int16/int32/int64 whose range
    contains [min, max] (bounds inclusive). Float columns move to float16 or
    float32 when the range fits, otherwise float64.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when True, print the resulting memory usage and the reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to measure, so keep
            # the current dtype instead of falling through to an upcast.
            continue

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value range is checked. float16 has an
            # 11-bit significand, so integer-valued floats above 2048 are
            # rounded by this cast - confirm that is acceptable for the data.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
#using datatypes so we can load the data instead of letting pandas predict
dtypes = {
'MachineIdentifier':                                    'category',
'ProductName':                                          'category',
'EngineVersion':                                        'category',
'AppVersion':                                           'category',
'AvSigVersion':                                         'category',
'IsBeta':                                               'int8',
'RtpStateBitfield':                                     'float16',
'IsSxsPassiveMode':                                     'int8',
'DefaultBrowsersIdentifier':                            'float16',
'AVProductStatesIdentifier':                            'float32',
'AVProductsInstalled':                                  'float16',    

'AVProductsEnabled':                                    'float16',
'HasTpm':                                               'int8',
'CountryIdentifier':                                    'int16',
'CityIdentifier':                                       'float32',
'OrganizationIdentifier':                               'float16',
'GeoNameIdentifier':                                    'float16',
'LocaleEnglishNameIdentifier':                          'int8',
'Platform':                                             'category',
'Processor':                                            'category',
'OsVer':                                                'category',
    
'OsBuild':                                              'int16',
'OsSuite':                                              'int16',
'OsPlatformSubRelease':                                 'category',
'OsBuildLab':                                           'category',
'SkuEdition':                                           'category',
'IsProtected':                                          'float16',
'AutoSampleOptIn':                                      'int8',
'PuaMode':                                              'category',
'SMode':                                                'float16',
'IeVerIdentifier':                                      'float16',
    
'SmartScreen':                                          'category',
'Firewall':                                             'float16',
'UacLuaenable':                                         'float32',
'Census_MDC2FormFactor':                                'category',
'Census_DeviceFamily':                                  'category',
'Census_OEMNameIdentifier':                             'float16',
'Census_OEMModelIdentifier':                            'float32',
'Census_ProcessorCoreCount':                            'float16',
'Census_ProcessorManufacturerIdentifier':               'float16',
'Census_ProcessorModelIdentifier':                      'float16',
'Census_ProcessorClass':                                'category',
'Census_PrimaryDiskTotalCapacity':                      'float32',
'Census_PrimaryDiskTypeName':                           'category',
'Census_SystemVolumeTotalCapacity':                     'float32',
'Census_HasOpticalDiskDrive':                           'int8',
'Census_TotalPhysicalRAM':                              'float32',
'Census_ChassisTypeName':                               'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
'Census_PowerPlatformRoleName':                         'category',
'Census_InternalBatteryType':                           'category',
'Census_InternalBatteryNumberOfCharges':                'float32',
'Census_OSVersion':                                     'category',
'Census_OSArchitecture':                                'category',
'Census_OSBranch':                                      'category',
'Census_OSBuildNumber':                                 'int16',
'Census_OSBuildRevision':                               'int32',
'Census_OSEdition':                                     'category',
'Census_OSSkuName':                                     'category',
'Census_OSInstallTypeName':                             'category',
'Census_OSInstallLanguageIdentifier':                   'float16',
'Census_OSUILocaleIdentifier':                          'int16',
'Census_OSWUAutoUpdateOptionsName':                     'category',
'Census_IsPortableOperatingSystem':                     'int8',
'Census_GenuineStateName':                              'category',
'Census_ActivationChannel':                             'category',
'Census_IsFlightingInternal':                           'float16',
'Census_IsFlightsDisabled':                             'float16',
'Census_FlightRing':                                    'category',
'Census_ThresholdOptIn':                                'float16',
'Census_FirmwareManufacturerIdentifier':                'float16',
'Census_FirmwareVersionIdentifier':                     'float32',
'Census_IsSecureBootEnabled':                           'float16',
'Census_IsWIMBootEnabled':                              'float16',    
'Census_IsVirtualDevice':                               'float16',
'Census_IsTouchEnabled':                                'int8',
'Census_IsPenCapable':                                  'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
'Wdft_IsGamer':                                         'float16',
'Wdft_RegionIdentifier':                                'float16',
'HasDetections':                                        'int8'
}

### Although some columns are stored as int or float, they are still categorical (e.g. CityIdentifier, CountryIdentifier), so it makes sense to treat all Identifier columns as categorical.

In [0]:
# Candidate features to drop: each one scored a feature importance below 100
# in an earlier training run.
# Deliberately kept out of the drop list (left here for reference):
#   LocaleEnglishNameIdentifier, AVProductsInstalled,
#   Census_IsAlwaysOnAlwaysConnectedCapable, Census_IsVirtualDevice
feat_lt_100_imp = [
    'IsSxsPassiveMode',
    'Census_OSArchitecture',
    'Platform',
    'RtpStateBitfield',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_IsPenCapable',
    'Census_OSBranch',
    'Census_OSEdition',
    'Census_FlightRing',
    'OsBuild',
    'Census_ChassisTypeName',
    'Census_MDC2FormFactor',
    'OrganizationIdentifier',
    'Census_PrimaryDiskTypeName',
    'Census_ProcessorClass',
    'UacLuaenable',
    'Census_IsFlightsDisabled',
    'SkuEdition',
    'ProductName',
    'AVProductsEnabled',
    'Census_OSBuildNumber',
    'Census_OSSkuName',
    'OsPlatformSubRelease',
    'HasTpm',
    'OsSuite',
    'Census_ProcessorManufacturerIdentifier',
    'Census_InternalBatteryType',
    'Census_PowerPlatformRoleName',
    'Census_IsPortableOperatingSystem',
    'IsBeta',
    'Census_DeviceFamily',
    'OsVer',
    'AutoSampleOptIn',
    'MachineIdentifier',
]


In [0]:
# Split the declared columns by storage type: a column is "numerical" when its
# declared dtype string is one of the numeric widths below, otherwise it is
# "categorical". Both lists follow the key order of `dtypes`.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [name for name in dtypes if dtypes[name] in numerics]
categorical_columns = [name for name in dtypes if dtypes[name] not in numerics]

In [0]:
# Numerically stored columns that are nevertheless categorical in meaning
# (identifiers, build numbers, bit fields, small counts). Each name is listed once.
identifier_like_columns = [
    'DefaultBrowsersIdentifier', 'AVProductStatesIdentifier', 'CountryIdentifier', 'CityIdentifier',
    'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'IeVerIdentifier',
    'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier', 'Census_OSInstallLanguageIdentifier', 'Census_OSUILocaleIdentifier',
    'Census_FirmwareManufacturerIdentifier', 'Census_FirmwareVersionIdentifier', 'Wdft_RegionIdentifier',
    'OsBuild', 'OsSuite', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'RtpStateBitfield',
    'AVProductsInstalled', 'AVProductsEnabled',
]
# Append only names that are not present yet, so re-running this cell does not
# keep growing the list with duplicates.
categorical_columns += [c for c in identifier_like_columns if c not in categorical_columns]

# PuaMode holds only a single value, so it is excluded. The membership check
# keeps the cell re-runnable (list.remove raises ValueError when the item is gone).
if 'PuaMode' in categorical_columns:
    categorical_columns.remove('PuaMode')

In [0]:
#removing unimportant features from categorical columns
#categorical_columns = [c for c in categorical_columns if c not in feat_lt_100_imp]



# > **Feature Engineering: Factorizing Categorical Columns**

Note: There are columns that have data as float but are still categorical



In [0]:
# The identifier and object-typed columns were originally factorized into new columns
# prefixed with fac_, which were later renamed back to the original column names.
# The dictionary created for that renaming is reused below. These columns are factorized
# because they are categorical in nature.

In [0]:
# Maps the temporary name of each label-encoded column ("fac_<column>") back to
# the original column name. The prefix is derived rather than typed out per entry.
# MachineIdentifier is intentionally not part of this mapping.
rename_fac_cols = {
    "fac_" + original_name: original_name
    for original_name in (
        "ProductName",
        "EngineVersion",
        "AppVersion",
        "AvSigVersion",
        "Platform",
        "Processor",
        "OsVer",
        "OsPlatformSubRelease",
        "OsBuildLab",
        "SkuEdition",
        "SmartScreen",
        "Census_MDC2FormFactor",
        "Census_DeviceFamily",
        "Census_ProcessorClass",
        "Census_PrimaryDiskTypeName",
        "Census_ChassisTypeName",
        "Census_PowerPlatformRoleName",
        "Census_InternalBatteryType",
        "Census_OSVersion",
        "Census_OSArchitecture",
        "Census_OSBranch",
        "Census_OSEdition",
        "Census_OSSkuName",
        "Census_OSInstallTypeName",
        "Census_OSWUAutoUpdateOptionsName",
        "Census_GenuineStateName",
        "Census_ActivationChannel",
        "Census_FlightRing",
    )
}

In [0]:
#rename_fac_cols = {k: v for k,v in rename_fac_cols.items() if v in categorical_columns}

In [0]:
#Label Encoder
# Fit ONE LabelEncoder on the distinct values of every column in rename_fac_cols,
# taken from both train.csv and test.csv, so the same value gets the same code in
# training and prediction. Done in its own cell for memory management: the test
# frame is released again as soon as the encoder is fitted.
from sklearn import preprocessing

tot_train_rows = lines_in_file('train.csv')
tot_test_rows = lines_in_file('test.csv')
skip_rows = 1
# The whole training file is loaded as a single chunk.
chunk_size = tot_train_rows
# Per column: the most frequent training value, reused later to fill missing
# values in the test data.
fillna_val_by_feature = {}

le = preprocessing.LabelEncoder()

data = pd.read_csv('train.csv',nrows=chunk_size,skiprows=range(1,skip_rows),dtype=dtypes)
data = reduce_mem_usage(data)

# PuaMode is dropped from the training frame; MachineIdentifier is kept.
data.drop(labels=['PuaMode'],axis=1,inplace=True)

data2 = pd.read_csv('test.csv',nrows=tot_test_rows,skiprows=range(1,skip_rows),dtype=dtypes)
data2 = reduce_mem_usage(data2)

# Original column name -> "fac_" name; later cells read this dictionary.
inv_fac_col_dict = {value: key for key, value in rename_fac_cols.items()}

# A set keeps each label once; LabelEncoder.fit de-duplicates and sorts anyway.
encoder_values = set()
for fac_col in rename_fac_cols.values():
  encoder_values.update(data[fac_col].unique().tolist())
  encoder_values.update(data2[fac_col].unique().tolist())
  fillna_val_by_feature[fac_col] = data[fac_col].value_counts().keys()[0]

le.fit(list(encoder_values))

# Release the test frame and the label collection before training.
data2 = pd.DataFrame()
del encoder_values
gc.collect()
  


# Create Train, Test Split and Train the model

In [0]:
# LightGBM training parameters. Every value is passed as a string.
# Values tried earlier: max_depth '11', max_bin '31', lambda_l2 '0.4'.
# NOTE(review): 'num_trees' and 'num_boost_round' both look like aliases of the
# number of boosting iterations (see the LightGBM parameter docs), and the
# training cell additionally passes num_round=10000 to lgb.train. Which of the
# three wins may depend on the LightGBM version - confirm and keep only one.
params = dict(
    num_leaves='200',
    boosting_type='gbdt',
    objective='binary',
    num_trees='200',
    metric='auc',
    num_threads='1',
    device_type='cpu',
    max_bin='65',
    seed='42',
    max_depth='6',            # previous value 11
    bagging_fraction='0.897',
    bagging_freq='1',         # previous value is 4
    bagging_seed='10',
    feature_fraction='0.66',
    lambda_l1='0.1',
    lambda_l2='0.1',          # added newly
    num_boost_round='1000',
    early_stopping_round='200',
)



1.   Best iteration is: [35]	valid_0's auc: 0.697111
2.   Best iteration is: [37]	valid_0's auc: 0.697268
3.   Best iteration is: [33]	valid_0's auc: 0.698462 with feature fraction = 0.7
4.  Best iteration is: [41]	  valid_0's auc: 0.698692 with feature fraction = 0.6
5.  Best iteration is: [41]	  valid_0's auc: 0.69909 with feature fraction = 0.65
6.  Best iteration is: [41]	  valid_0's auc: 0.699402 with feature fraction = 0.66, Bagging_Seed=13
7. Best iteration is: [41] valid_0's auc: 0.699744 with feature fraction = 0.66, Bagging_Seed=10
8. Best iteration is: [41] valid_0's auc: 0.699955 with feature fraction = 0.66, Bagging_Seed=10, Bagging_Fraction = 0.8, Bagging_Freq = 2

9.   Best iteration is: [41] valid_0's auc: 0.700314 with feature fraction = 0.66, Bagging_Seed=10, Bagging_Fraction = 0.857, Bagging_Freq = 3
10.   Best iteration is: [44] valid_0's auc: 0.700776 with feature fraction = 0.66, Bagging_Seed=10, Bagging_Fraction = 0.89, Bagging_Freq = 3


11.   Best iteration is: [41] valid_0's auc: 0.701124 with feature fraction = 0.66, Bagging_Seed=10, Bagging_Fraction = 0.897, Bagging_Freq = 4
12.   Changing the data types to the appropriate types also increased the AUC score to 0.701542








In [0]:
# Label-encode the training frame, split it into train/validation parts and
# train the LightGBM model.
from sklearn import model_selection

# `data` is not reloaded inside the loop; because chunk_size equals
# tot_train_rows the body runs exactly once.
while skip_rows <= tot_train_rows:

  # Original column name -> temporary "fac_" name for the encoded copy.
  inv_fac_col_dict = {value: key for key, value in rename_fac_cols.items()}

  for fac_col in rename_fac_cols.values():
    new_fac_col = inv_fac_col_dict[fac_col]

    # Fill missing values with the column's most frequent value. The result is
    # assigned back to the frame: calling fillna(inplace=True) on `data[fac_col]`
    # is chained assignment and does not update `data` under pandas Copy-on-Write.
    data[fac_col] = data[fac_col].fillna(data[fac_col].value_counts().keys()[0])

    # The encoded copy is appended as a new column and the source column is
    # dropped, so encoded columns end up at the END of the frame. The prediction
    # cell builds its frame the same way; keep both in step so the feature order
    # seen by the model matches.
    data[new_fac_col] = le.transform(data[fac_col])
    data.drop(labels=fac_col,axis=1,inplace=True)

  # Give the encoded columns their original names back.
  data.rename(mapper=rename_fac_cols,axis=1,inplace=True)

  # Shrink again now that the encoded columns are integers.
  data = reduce_mem_usage(data)

  # Stratified 85/15 split on the target.
  # NOTE(review): MachineIdentifier is not dropped here, so it is handed to
  # LightGBM as a feature - confirm that this is intended.
  X_train, X_test, y_train, y_test = model_selection.train_test_split(
                                                   data.drop(labels='HasDetections',axis=1),
                                                   data['HasDetections'],
                                                   test_size=0.15, #testsize=0.25 earlier
                                                   random_state=42,
                                                   shuffle=True,
                                                   stratify=data['HasDetections'])
  # Memory management: the full frame is no longer needed.
  data = pd.DataFrame()

  y_train = y_train.to_frame()
  y_test = y_test.to_frame()
  col_names = X_train.columns.tolist()

  # Training set for LightGBM.
  train_data = lgb.Dataset(X_train,label=y_train['HasDetections'],
                         feature_name=col_names,
                         categorical_feature=categorical_columns,
                         free_raw_data=False)
  # Memory management
  X_train = pd.DataFrame()
  y_train = pd.DataFrame()

  # Validation set for LightGBM.
  test_data = lgb.Dataset(X_test,label=y_test['HasDetections'],
                         feature_name=col_names,
                         categorical_feature=categorical_columns,
                         free_raw_data=False)
  # Memory management
  X_test = pd.DataFrame()
  y_test = pd.DataFrame()

  gc.collect()

  # Train, evaluating on the validation set.
  num_round = 10000
  model = lgb.train(params,train_data,num_round,valid_sets=[test_data])

  skip_rows += chunk_size

# Release the LightGBM datasets.
train_data = ''
test_data = ''
gc.collect()




# **> Predict on Validation/Production Dataset**



In [0]:
#Prediction and generating submission file
!rm submit.csv
submission = pd.DataFrame()

tot_test_rows = lines_in_file('test.csv')
skip_rows = 1
chunk_size = 500000

while skip_rows <= tot_test_rows:
  data2 = pd.read_csv('test.csv',nrows=chunk_size,skiprows=range(1,skip_rows),dtype=dtypes)
  data2 = reduce_mem_usage(data2)
  
  submission = pd.DataFrame()

  submission['MachineIdentifier'] = data2['MachineIdentifier']

  data2.drop(labels=['PuaMode'],axis=1,inplace=True)

  for fac_col in rename_fac_cols.values():
    new_fac_col = inv_fac_col_dict[fac_col]

    #fillna
    #print(fac_col)
    fna_val = fillna_val_by_feature.get(fac_col)
    data2[fac_col].fillna(fna_val,inplace=True)
    #data2[fac_col].fillna(data2[fac_col].value_counts().keys()[0],inplace=True)

    #transforming data using label encoder
    data2[new_fac_col] = le.transform(data2[fac_col])

    #dropping transformed columns and retaining encoded columns
    data2.drop(labels=fac_col,axis=1,inplace=True)

  #renaming column names of encoded columns  
  data2.rename(mapper=rename_fac_cols,axis=1,inplace=True)

  #reducing memory after transforming using label Encoder
  data2 = reduce_mem_usage(data2)

  #Predict  
  submission['HasDetections'] = model.predict(data2,num_iteration=model.best_iteration)
  
  #save results
  if skip_rows == 1:
    submission.to_csv('submit.csv',mode='w',index=False)
  else:
    submission.to_csv('submit.csv',mode='a',index=False,header=False)
  submission = submission.drop(labels=['MachineIdentifier','HasDetections'],axis=1,inplace=True)
  
  #incrementing lines to be read in the next loop
  skip_rows += chunk_size



# > Download & Submit Submission File to Kaggle



In [0]:
!pip3 install -q kaggle
  # enter your Kaggle credentionals here
os.environ['KAGGLE_USERNAME']="vijaygopu"
os.environ['KAGGLE_KEY']="6be506f7edd4deb262d9721e2d02d11f"
!kaggle competitions submit -c microsoft-malware-prediction -f submit.csv -m RevertedL1L2Values

In [0]:
# Pair each model feature importance with the column name at the same position
# of the last prediction frame.
df_feat_imp = pd.DataFrame({
    'Feature_Importance': model.feature_importance(),
    'Feature': data2.columns,
})

In [0]:
df_feat_imp.sort_values(by='Feature_Importance',ascending=False)

In [0]:
# Features whose importance is below 100, highest importance first.
df_imp_feat = (
    df_feat_imp
    .loc[df_feat_imp['Feature_Importance'] < 100]
    .sort_values(by='Feature_Importance', ascending=False)
)

In [0]:
# Names of the low-importance features as a plain Python list.
feat_lt_100 = list(df_imp_feat['Feature'])

In [0]:
# Keep MachineIdentifier out of the drop list. The membership check avoids a
# ValueError when its importance is 100 or more, or when the cell is re-run.
if 'MachineIdentifier' in feat_lt_100:
    feat_lt_100.remove('MachineIdentifier')

In [0]:
feat_lt_100

In [0]:
model.best_score

In [0]:
#predict[:5]

In [0]:
y_test.head(5)

In [0]:
gc.collect()

In [0]:
!ls -ltr

# **GridSearchCV Results**

**param_grid={'learning_rate': [0.1, 0.01, 0.05], 'n_estimators': [40, 200], 'cv': [3, 7], 'num_leaves': [7, 15, 40]}**

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=3, num_leaves=31, objective='binary',
        random_state=42, reg_alpha=5, reg_lambda=10, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'learning_rate': [0.1, 0.01, 0.05], 'n_estimators': [40, 200], 'cv': [3, 7], 'num_leaves': [7, 15, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [0]:
# Base LightGBM classifier for the grid search. num_leaves and n_estimators are
# left at their defaults here; the search grid supplies them.
tune_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    max_depth=-1,
    learning_rate=0.05,
    objective='binary',
    random_state=42,
    reg_alpha=5,
    reg_lambda=10,
    n_jobs=3,
)

In [0]:
# Hyper-parameter grid for GridSearchCV. Every key must be a parameter of the
# estimator. 'cv' is not one: it is an argument of GridSearchCV itself, and
# listing it here only multiplied the number of fits (LGBMClassifier accepts
# unknown keyword parameters, so no error was raised). Pass cv=... to
# GridSearchCV to control the number of folds.
params_grid = {
    'learning_rate': [0.04,0.05,0.06],
    'n_estimators': [40,200],
    'num_leaves':[7,15,40]
    }

In [0]:
# Wrap the base estimator in a ROC-AUC-scored grid search over params_grid.
# Note that the name tune_model is rebound from the estimator to the search object.
tune_model = model_selection.GridSearchCV(
    estimator=tune_model,
    param_grid=params_grid,
    scoring='roc_auc',
    n_jobs=3,
)

In [0]:
# Run the grid search.
# NOTE(review): as the notebook stands this cell cannot run top to bottom. The
# training cell replaces X_train with an empty DataFrame after building the
# LightGBM dataset, and X_train never contained 'HasDetections' (the label was
# dropped before the split and returned separately as y_train). The features and
# labels need to be re-created or kept in memory before this call - TODO.
tune_model.fit(X_train.drop(labels='HasDetections',axis=1),X_train['HasDetections'])

In [0]:
feature_importances = tune_model.best_estimator_.feature_importances_

In [0]:
data.columns