### Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading train data

In [2]:
# Load the labelled training set. low_memory=False asks pandas not to parse the file in
# chunks, which avoids the mixed-dtype columns chunked parsing can produce.
df = pd.read_csv("../input/malware-detection-va/train.csv", low_memory=False)
# Detach the target: pop() returns the column as a Series and removes it from df.
y = df.pop('HasDetections')

In [3]:
# df["Census_ThresholdOptIn"].value_counts()

### Reading test data

In [4]:
# Load the submission (test) set with the same parser setting as the training file.
test=pd.read_csv("../input/malware-detection-va/test_to_give.csv",low_memory=False)

### Merging test and train data for consistent preprocessing

In [5]:
# Tag every row with its origin in a helper column "ind", then stack the test rows first and
# the train rows second; ignore_index=True renumbers the combined frame from 0.
maldata=pd.concat([test.assign(ind="test"), df.assign(ind="train")],axis=0,ignore_index=True)

In [6]:
# df["HasDetctions"]

In [7]:
# maldata

In [8]:
# maldata.head()

In [9]:
# maldata.shape

In [10]:
# print(len(maldata.columns))

### Observing null values in dataset

In [33]:
# Percentage of missing entries per column, stored as a one-column frame "null values".
null_fraction = maldata.isnull().sum() / len(maldata)
tp = (null_fraction * 100).to_frame(name="null values")
# Display only the columns where more than 65% of the rows are missing.
tp.loc[tp["null values"] > 65]

Unnamed: 0,null values
DefaultBrowsersIdentifier,94.831223
PuaMode,99.983108
Census_ProcessorClass,99.623818
Census_InternalBatteryType,70.533498
Census_IsFlightingInternal,82.772923


### Dropping features with more than 70% null values

In [34]:
# Columns listed in the null-percentage table above; each is missing in over 70% of rows.
mostly_null_columns = [
    "PuaMode",
    "Census_ProcessorClass",
    "DefaultBrowsersIdentifier",
    "Census_IsFlightingInternal",
    "Census_InternalBatteryType",
]
maldata = maldata.drop(columns=mostly_null_columns)

### Dropping features with highly imbalanced value_counts()

In [35]:
# Columns removed because their value distribution is highly imbalanced.
imbalanced_columns = [
    "AutoSampleOptIn",
    "Census_IsFlightsDisabled",
    "Census_IsPortableOperatingSystem",
    "SMode",
    "IsBeta",
]
maldata = maldata.drop(columns=imbalanced_columns)

In [36]:
# maldata.isnull().sum().sort_values(ascending = False)[:43]

In [37]:
# maldata.skew().sort_values(ascending = False)[:43]

In [38]:
# maldata.drop("MachineIdentifier",axis=1,inplace=True)

In [39]:
# percent = (maldata.isnull().sum()/maldata.shape[0]) * 100
# new_train_data= pd.DataFrame(data=percent,columns=['nullvaluesPercentage'])
# new_train_data = new_train_data.sort_values(by='nullvaluesPercentage',ascending=False)
# print(new_train_data.head(40))

In [40]:
# for i in maldata.columns:
#     print(i)
#     print(maldata[i].value_counts())

In [41]:
# maldata.Census_IsWIMBootEnabled.value_counts()

In [42]:
# maldata.drop("Census_IsWIMBootEnabled",axis=1,inplace=True)

In [43]:
# percent = (maldata.isnull().sum()/maldata.shape[0]) * 100
# new_train_data= pd.DataFrame(data=percent,columns=['nullvaluesPercentage'])
# new_train_data = new_train_data.sort_values(by='nullvaluesPercentage',ascending=False)
# print(new_train_data.head(15))

In [44]:
# maldata.OrganizationIdentifier.value_counts()

In [45]:
maldata.Census_ThresholdOptIn.value_counts()

0.0    298735
1.0        81
Name: Census_ThresholdOptIn, dtype: int64

In [46]:
# The value counts displayed for this column show only 81 rows with a non-zero value,
# so it is removed as well.
maldata = maldata.drop(columns='Census_ThresholdOptIn')

In [47]:
maldata['SmartScreen'].value_counts()

RequireAdmin    427050
ExistsNotSet     54060
Off              17119
Warn             10931
Prompt            3240
Block             1982
off                 98
On                  81
&#x02;              41
&#x01;              34
on                  15
requireadmin         3
prompt               1
Enabled              1
Name: SmartScreen, dtype: int64

### This function handles the duplicate upper and lower case values in the smartscreen feature

In [48]:

# Lower-cased raw SmartScreen value -> canonical category label.
# The label 'ExistNotSet' (without the "s") is kept exactly as before so the
# encoded column names produced downstream do not change.
_SMARTSCREEN_CATEGORIES = {
    'block': 'Block',
    'existsnotset': 'ExistNotSet',
    'off': 'Off',
    'prompt': 'Prompt',
    'requireadmin': 'RequireAdmin',
    'warn': 'Warn',
    'on': 'On',
}


def handle_screen(val):
    ''' cleaning category values to reduce number of categories for smartscreen feature

    The value is converted to ``str``, stripped of surrounding whitespace and matched
    case-insensitively, so every casing of a known category (``off``/``Off``/``OFF``,
    ``warn``/``Warn``, ...) collapses into one label.

    Parameters
    ----------
    val : any
        Raw SmartScreen entry; missing values (NaN) are accepted and become ``'nan'``
        after the ``str`` conversion.

    Returns
    -------
    str
        One of 'Block', 'ExistNotSet', 'Off', 'Prompt', 'RequireAdmin', 'Warn', 'On',
        or 'Unknown' for anything that is not a recognised category (including NaN).
    '''
    normalised = str(val).strip().lower()
    return _SMARTSCREEN_CATEGORIES.get(normalised, 'Unknown')

In [49]:
# maldata.SmartScreen.isnull().sum()

In [50]:
# Normalise every SmartScreen entry element-wise with handle_screen.
maldata['SmartScreen'] = maldata['SmartScreen'].map(handle_screen)

In [51]:
# maldata['SmartScreen'].isnull().sum()

In [52]:
# maldata['SmartScreen'].value_counts()

In [53]:
# percent = (maldata.isnull().sum()/maldata.shape[0]) * 100
# new_train_data= pd.DataFrame(data=percent,columns=['nullvaluesPercentage'])
# new_train_data = new_train_data.sort_values(by='nullvaluesPercentage',ascending=False)
# print(new_train_data.head(15))

In [54]:
# maldata['OrganizationIdentifier'].value_counts()

### Filling remaining null values with mode

In [55]:
# Impute only the columns that actually contain nulls. Computing a mode for every column
# is wasted work on complete columns (notably the all-unique MachineIdentifier strings),
# and fillna on a complete column changes nothing, so the result is the same.
null_counts = maldata.isnull().sum()
for feature in null_counts[null_counts > 0].index:
    if feature == "Census_IsWIMBootEnabled":
        # This column is filled with the constant 1 rather than its mode.
        fill_value = 1
    else:
        # mode() returns the most frequent value(s); [0] takes the first one.
        fill_value = maldata[feature].mode()[0]
    maldata[feature] = maldata[feature].fillna(fill_value)

In [56]:
# percent = (maldata.isnull().sum()/maldata.shape[0]) * 100
# new_train_data= pd.DataFrame(data=percent,columns=['nullvaluesPercentage'])
# new_train_data = new_train_data.sort_values(by='nullvaluesPercentage',ascending=False)
# print(new_train_data.head(15))

### Extracting object type columns

In [57]:
# Names of the object-dtype columns, leaving out the row-origin tag "ind" and the
# "MachineIdentifier" column.
cat = [
    column
    for column in maldata.columns
    if maldata[column].dtype == "object" and column not in ('ind', 'MachineIdentifier')
]
cat

['ProductName',
 'EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'Platform',
 'Processor',
 'OsVer',
 'OsPlatformSubRelease',
 'OsBuildLab',
 'SkuEdition',
 'SmartScreen',
 'Census_MDC2FormFactor',
 'Census_DeviceFamily',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSSkuName',
 'Census_OSInstallTypeName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_GenuineStateName',
 'Census_ActivationChannel',
 'Census_FlightRing']

In [58]:
len(cat)

26

### Replacing values with low count with mode in ProductName

In [59]:
def replace_ProductName(val):
    '''Fold the listed product names into "win8defender"; return any other value unchanged.'''
    folded_names = ("win8defender", "scep", "windowsintune", "fep")
    return "win8defender" if val in folded_names else val

In [60]:
# Collapse the listed product names element-wise with replace_ProductName.
maldata["ProductName"] = maldata["ProductName"].map(replace_ProductName)

In [61]:
# def replace_EngineVersion(val):
#     if val=="1.1.11502.0":
#         return "1.1.15200.1"
#     else:
#         return val

In [62]:
# def replace_ChasisType(val):
#     if val.isnumeric() or val=="ExpansionChassis" or val=="DockingStation" or val=="CompactPCI":
#         return "Notebook"
    
#     else:
#         return val

In [63]:
# def replace_OSEdition(val):
#     if val[-1]=="N":
#         val=val[:-1]
#     if val[-1]=="S":
#         val=val[:-1]
#     elif val=="Home" or val=="Pro" or  val=="ServerDatacenterACor" or val=="ProfessionalSingleLanguage":
#         val="Core"
#     else:
#         val=val
#     return val

In [64]:
# maldata["EngineVersion"]=maldata["EngineVersion"].apply(replace_EngineVersion)

In [65]:
# maldata["Census_ChassisTypeName"]=maldata["Census_ChassisTypeName"].apply(replace_ChasisType)

In [66]:
# maldata["Census_OSEdition"]=maldata["Census_OSEdition"].apply(replace_OSEdition)

In [67]:
# maldata["Census_OSEdition"].value_counts()

In [68]:
# j=filter(fun,(maldata["AvSigVersion"].value_counts().tolist()))
# sum=0
# for i in j:
#     sum+=1
# sum

### Separating the categorical columns into two lists, `yo` and `cat`, for label encoding and one-hot encoding respectively

#### One Hot encoding was resulting in exceeded RAM, so we label encoded columns with high dimensionality

In [69]:
# Columns that are label encoded (one integer code per distinct value) instead of one-hot encoded.
yo=["AvSigVersion","OsBuildLab","Census_OSVersion","AppVersion","Census_OSSkuName","ProductName","EngineVersion","Census_ChassisTypeName","Census_OSEdition"]

In [70]:
# Keep in `cat` only the columns destined for one-hot encoding, i.e. everything not in `yo`.
# Deriving the result from `yo` keeps the two lists in sync, and the cell can be re-run
# without failing on a name that has already been removed.
cat = [column for column in cat if column not in yo]

### Applying encoding and obtaining preprocessed dataset

In [71]:
# Base frame: every column that is not going to be encoded (plus "ind" and the identifier).
l = maldata.drop(columns=cat + yo)

In [72]:
l

Unnamed: 0,MachineIdentifier,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,ind
0,6810c5d22b0973b53a89ef881656e192,7.0,0,7945.0,2.0,1.0,1,159,77866.0,27.0,...,9321.0,1,1.0,0.0,0,0,0.0,0.0,11.0,test
1,4d810281c41ae85517e447146ec15b0a,7.0,0,7945.0,2.0,1.0,1,41,32657.0,27.0,...,63654.0,1,0.0,0.0,0,0,0.0,1.0,10.0,test
2,d0d7e4da90f95d04cdecc0143b690e0a,7.0,0,53447.0,1.0,1.0,1,180,103309.0,27.0,...,9491.0,0,1.0,0.0,0,0,0.0,1.0,3.0,test
3,718b06bd3089b5a37c63ad6af86ee0cd,7.0,0,53447.0,1.0,1.0,1,101,1873.0,27.0,...,2827.0,1,0.0,0.0,0,0,0.0,0.0,9.0,test
4,4ab7e3633628cccb65e055d91979c31b,7.0,0,53447.0,1.0,1.0,1,43,143810.0,18.0,...,33105.0,0,1.0,0.0,0,0,0.0,0.0,7.0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811038,5d5545419eb59bb641e1ee6ed20e90ba,7.0,0,47238.0,2.0,1.0,1,160,159247.0,27.0,...,8789.0,1,0.0,0.0,0,0,0.0,0.0,3.0,train
811039,689744369139991906b4e5f7dc214861,7.0,0,53447.0,1.0,1.0,1,35,119475.0,27.0,...,63122.0,1,1.0,0.0,0,0,0.0,0.0,4.0,train
811040,d0ee1552dbff407eabc699d4bb66f524,7.0,0,49480.0,2.0,1.0,1,160,67457.0,18.0,...,63317.0,1,1.0,0.0,0,0,0.0,1.0,3.0,train
811041,d13206048d326844b5928cf237a1ca64,7.0,0,53447.0,1.0,1.0,1,97,93324.0,27.0,...,6613.0,1,1.0,0.0,1,0,1.0,0.0,15.0,train


In [73]:
len(cat)

17

In [74]:
# cat

In [75]:
# len(cat)

In [76]:
# h=maldata[cat]

In [77]:
# h

In [78]:
# h["ProductName"]

In [79]:
# h["ProductName"].nunique()

In [80]:
# def tp(h):
    
#     for i in h.columns:
#         index=h[h.groupby(i)[i].transform('count').lt(2)].index
#         index
# # h["ProductName"].nunique()
#         h.loc[index,i]=np.NaN
#         h[i]=h[i].fillna(h[i].mode()[0])
# #         maldata['ProductName'].nunique()
# # h.loc[h.groupby('ProductName').ProductName.transform('count').lt(2), 'ProductName'] = np.nan  
# # h['ProductName']
# # h["ProductName"].nunique()
#     return h
# # h["ProductName"].nunique()

In [81]:
# h=tp(h)
# for i in h.columns:
#     print (i)
#     print (h[i].nunique())

In [82]:
# h.nunique()

In [83]:
# len(yo)

### Applying encoding and obtaining preprocessed dataset

In [84]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Shared encoder instances; each is re-fitted on one column at a time in the encoding loops.
enc = LabelEncoder()
# Sparse output is the encoder's default, so the keyword is not passed: it was renamed from
# `sparse` to `sparse_output` in newer scikit-learn releases, and relying on the default
# keeps this line valid on either side of that rename.
enc1 = OneHotEncoder(handle_unknown='ignore')

In [85]:
cat_tp=pd.DataFrame()
label_tp=pd.DataFrame()

In [86]:
# One-hot encode each column in `cat` on its own and collect the resulting frames.
# Concatenating once at the end avoids re-copying the growing frame on every iteration and
# rebuilds cat_tp from scratch, so re-running the cell does not append duplicate columns.
encoded_frames = []
for i in cat:
    # .toarray() densifies the encoder output so it can back a regular DataFrame.
    enc_df = enc1.fit_transform(maldata[[i]]).toarray()
    # Newer scikit-learn exposes get_feature_names_out in place of get_feature_names;
    # use whichever this installation provides (both yield "<column>_<value>" names).
    feature_namer = getattr(enc1, "get_feature_names_out", None) or enc1.get_feature_names
    names = feature_namer([i])
    encoded_frames.append(pd.DataFrame(enc_df, columns=names))
cat_tp = pd.concat(encoded_frames, axis=1)

In [87]:
# Label encode every column in `yo` (re-fitting the shared encoder per column) and build the
# frame in one step. Unlike appending to label_tp inside a loop, this does not re-copy the
# frame per column and gives the same result when the cell is re-run.
label_tp = pd.DataFrame({column: enc.fit_transform(maldata[column]) for column in yo})

In [88]:
l

Unnamed: 0,MachineIdentifier,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,ind
0,6810c5d22b0973b53a89ef881656e192,7.0,0,7945.0,2.0,1.0,1,159,77866.0,27.0,...,9321.0,1,1.0,0.0,0,0,0.0,0.0,11.0,test
1,4d810281c41ae85517e447146ec15b0a,7.0,0,7945.0,2.0,1.0,1,41,32657.0,27.0,...,63654.0,1,0.0,0.0,0,0,0.0,1.0,10.0,test
2,d0d7e4da90f95d04cdecc0143b690e0a,7.0,0,53447.0,1.0,1.0,1,180,103309.0,27.0,...,9491.0,0,1.0,0.0,0,0,0.0,1.0,3.0,test
3,718b06bd3089b5a37c63ad6af86ee0cd,7.0,0,53447.0,1.0,1.0,1,101,1873.0,27.0,...,2827.0,1,0.0,0.0,0,0,0.0,0.0,9.0,test
4,4ab7e3633628cccb65e055d91979c31b,7.0,0,53447.0,1.0,1.0,1,43,143810.0,18.0,...,33105.0,0,1.0,0.0,0,0,0.0,0.0,7.0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811038,5d5545419eb59bb641e1ee6ed20e90ba,7.0,0,47238.0,2.0,1.0,1,160,159247.0,27.0,...,8789.0,1,0.0,0.0,0,0,0.0,0.0,3.0,train
811039,689744369139991906b4e5f7dc214861,7.0,0,53447.0,1.0,1.0,1,35,119475.0,27.0,...,63122.0,1,1.0,0.0,0,0,0.0,0.0,4.0,train
811040,d0ee1552dbff407eabc699d4bb66f524,7.0,0,49480.0,2.0,1.0,1,160,67457.0,18.0,...,63317.0,1,1.0,0.0,0,0,0.0,1.0,3.0,train
811041,d13206048d326844b5928cf237a1ca64,7.0,0,53447.0,1.0,1.0,1,97,93324.0,27.0,...,6613.0,1,1.0,0.0,1,0,1.0,0.0,15.0,train


In [89]:
label_tp

Unnamed: 0,AvSigVersion,OsBuildLab,Census_OSVersion,AppVersion,Census_OSSkuName,ProductName,EngineVersion,Census_ChassisTypeName,Census_OSEdition
0,4324,219,177,16,1,2,38,20,1
1,6380,231,256,53,14,2,51,27,14
2,6749,226,236,53,14,2,51,15,13
3,6114,222,229,47,1,2,50,15,1
4,7166,231,259,53,14,2,52,15,13
...,...,...,...,...,...,...,...,...,...
811038,6381,226,231,53,1,2,51,27,1
811039,6341,231,258,53,1,2,51,27,1
811040,5854,226,231,41,1,2,48,27,1
811041,6796,161,123,23,1,2,52,38,1


In [90]:
cat_tp

Unnamed: 0,Platform_windows10,Platform_windows2016,Platform_windows7,Platform_windows8,Processor_arm64,Processor_x64,Processor_x86,OsVer_10.0.0.0,OsVer_10.0.0.112,OsVer_10.0.1.0,...,Census_ActivationChannel_Volume:GVLK,Census_ActivationChannel_Volume:MAK,Census_FlightRing_Disabled,Census_FlightRing_NOT_SET,Census_FlightRing_OSG,Census_FlightRing_RP,Census_FlightRing_Retail,Census_FlightRing_Unknown,Census_FlightRing_WIF,Census_FlightRing_WIS
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811038,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
811039,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
811040,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
811041,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [91]:
# for i in cat:
#     print (i)
#     print(maldata[i].nunique())

In [92]:
# Column-wise join of the non-encoded columns with the one-hot and label-encoded frames;
# concat(axis=1) aligns rows on the index.
# NOTE(review): the result is stored back into `l`, so re-running this cell would append
# the encoded columns a second time.
l=pd.concat([l,cat_tp,label_tp],axis=1)

In [93]:
l

Unnamed: 0,MachineIdentifier,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_FlightRing_WIS,AvSigVersion,OsBuildLab,Census_OSVersion,AppVersion,Census_OSSkuName,ProductName,EngineVersion,Census_ChassisTypeName,Census_OSEdition
0,6810c5d22b0973b53a89ef881656e192,7.0,0,7945.0,2.0,1.0,1,159,77866.0,27.0,...,0.0,4324,219,177,16,1,2,38,20,1
1,4d810281c41ae85517e447146ec15b0a,7.0,0,7945.0,2.0,1.0,1,41,32657.0,27.0,...,0.0,6380,231,256,53,14,2,51,27,14
2,d0d7e4da90f95d04cdecc0143b690e0a,7.0,0,53447.0,1.0,1.0,1,180,103309.0,27.0,...,0.0,6749,226,236,53,14,2,51,15,13
3,718b06bd3089b5a37c63ad6af86ee0cd,7.0,0,53447.0,1.0,1.0,1,101,1873.0,27.0,...,0.0,6114,222,229,47,1,2,50,15,1
4,4ab7e3633628cccb65e055d91979c31b,7.0,0,53447.0,1.0,1.0,1,43,143810.0,18.0,...,0.0,7166,231,259,53,14,2,52,15,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811038,5d5545419eb59bb641e1ee6ed20e90ba,7.0,0,47238.0,2.0,1.0,1,160,159247.0,27.0,...,0.0,6381,226,231,53,1,2,51,27,1
811039,689744369139991906b4e5f7dc214861,7.0,0,53447.0,1.0,1.0,1,35,119475.0,27.0,...,0.0,6341,231,258,53,1,2,51,27,1
811040,d0ee1552dbff407eabc699d4bb66f524,7.0,0,49480.0,2.0,1.0,1,160,67457.0,18.0,...,0.0,5854,226,231,41,1,2,48,27,1
811041,d13206048d326844b5928cf237a1ca64,7.0,0,53447.0,1.0,1.0,1,97,93324.0,27.0,...,0.0,6796,161,123,23,1,2,52,38,1


### Separating back into submission test data and train data

In [94]:
# Use the "ind" tag to split the combined frame back into its two sources; .copy() gives
# each part its own data so later edits do not touch the combined frame.
origin = l["ind"]
test = l.loc[origin == "test"].copy()
maldata = l.loc[origin == "train"].copy()

In [95]:
test["ind"]

0         test
1         test
2         test
3         test
4         test
          ... 
243308    test
243309    test
243310    test
243311    test
243312    test
Name: ind, Length: 243313, dtype: object

In [96]:
# maldata.reset_index(drop=True)

In [97]:
maldata["ind"]

243313    train
243314    train
243315    train
243316    train
243317    train
          ...  
811038    train
811039    train
811040    train
811041    train
811042    train
Name: ind, Length: 567730, dtype: object

In [98]:
# The origin tag has served its purpose; remove it from both parts.
test = test.drop(columns="ind")
maldata = maldata.drop(columns="ind")

In [99]:
test

Unnamed: 0,MachineIdentifier,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_FlightRing_WIS,AvSigVersion,OsBuildLab,Census_OSVersion,AppVersion,Census_OSSkuName,ProductName,EngineVersion,Census_ChassisTypeName,Census_OSEdition
0,6810c5d22b0973b53a89ef881656e192,7.0,0,7945.0,2.0,1.0,1,159,77866.0,27.0,...,0.0,4324,219,177,16,1,2,38,20,1
1,4d810281c41ae85517e447146ec15b0a,7.0,0,7945.0,2.0,1.0,1,41,32657.0,27.0,...,0.0,6380,231,256,53,14,2,51,27,14
2,d0d7e4da90f95d04cdecc0143b690e0a,7.0,0,53447.0,1.0,1.0,1,180,103309.0,27.0,...,0.0,6749,226,236,53,14,2,51,15,13
3,718b06bd3089b5a37c63ad6af86ee0cd,7.0,0,53447.0,1.0,1.0,1,101,1873.0,27.0,...,0.0,6114,222,229,47,1,2,50,15,1
4,4ab7e3633628cccb65e055d91979c31b,7.0,0,53447.0,1.0,1.0,1,43,143810.0,18.0,...,0.0,7166,231,259,53,14,2,52,15,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243308,5426568056581385156f6c57eb1c16b7,7.0,0,53447.0,1.0,1.0,1,50,115291.0,27.0,...,0.0,6879,222,226,53,1,2,52,27,1
243309,4d05c57f5fa2177e4463b1d93adb9282,7.0,0,61168.0,2.0,1.0,1,57,56452.0,18.0,...,0.0,6037,170,128,47,4,2,50,27,4
243310,6aa49e8161838cca6b0584a18c45f016,7.0,0,53447.0,1.0,1.0,1,141,96055.0,48.0,...,0.0,7160,222,226,23,4,2,52,27,4
243311,690d6524746bb857e27876471a649176,7.0,0,53447.0,1.0,1.0,1,204,130775.0,27.0,...,0.0,6766,226,231,53,1,2,52,15,1


In [100]:
# Renumber the training rows from 0 so their index lines up with the label Series `y`.
# reset_index returns a new frame rather than modifying in place, so the result must be
# assigned back for the renumbering to take effect.
maldata = maldata.reset_index(drop=True)
maldata

Unnamed: 0,MachineIdentifier,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_FlightRing_WIS,AvSigVersion,OsBuildLab,Census_OSVersion,AppVersion,Census_OSSkuName,ProductName,EngineVersion,Census_ChassisTypeName,Census_OSEdition
0,6ca9b92a49d4d23fc4754d464155cbb1,7.0,0,53447.0,1.0,1.0,1,154,108573.0,27.0,...,0.0,6466,233,259,53,14,2,51,15,13
1,640b82f0a7bd3701818b9442d1432937,7.0,0,53447.0,1.0,1.0,1,39,64466.0,27.0,...,0.0,7160,231,259,53,14,2,52,15,13
2,cf34b73e0b66f6dd249c3b7dff1fb7dd,7.0,0,53447.0,1.0,1.0,1,81,56441.0,18.0,...,0.0,6485,222,220,53,1,2,51,27,1
3,6b59109284f680df09295d51e72e7abf,7.0,0,53447.0,1.0,1.0,1,220,24282.0,27.0,...,0.0,7051,231,259,53,4,2,52,30,4
4,d1e4f6460d095127de936210ab587e3c,7.0,0,53447.0,1.0,1.0,1,51,8750.0,27.0,...,0.0,6443,231,254,53,14,2,51,15,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567725,5d5545419eb59bb641e1ee6ed20e90ba,7.0,0,47238.0,2.0,1.0,1,160,159247.0,27.0,...,0.0,6381,226,231,53,1,2,51,27,1
567726,689744369139991906b4e5f7dc214861,7.0,0,53447.0,1.0,1.0,1,35,119475.0,27.0,...,0.0,6341,231,258,53,1,2,51,27,1
567727,d0ee1552dbff407eabc699d4bb66f524,7.0,0,49480.0,2.0,1.0,1,160,67457.0,18.0,...,0.0,5854,226,231,41,1,2,48,27,1
567728,d13206048d326844b5928cf237a1ca64,7.0,0,53447.0,1.0,1.0,1,97,93324.0,27.0,...,0.0,6796,161,123,23,1,2,52,38,1


In [101]:
# for i in l.columns:
#     print(i)

In [102]:
l=maldata.copy()

In [103]:
# l.reset_index(drop=True)

In [104]:
y

0         0
1         0
2         1
3         0
4         1
         ..
567725    0
567726    0
567727    1
567728    1
567729    1
Name: HasDetections, Length: 567730, dtype: int64

In [105]:
# for i in cat:
#     dumm=pd.get_dummies(maldata[i],prefix=i)
# #     print(l)
#     l=pd.concat([l,dumm],axis=1)

In [None]:
# y=l['HasDetections']

### Dropping MachineIdentifier as it is a unique value for each data entry

In [106]:
# The identifier column is not used as a model feature.
l = l.drop(columns='MachineIdentifier')

In [107]:
# l

In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

In [109]:
# l.reset_index(drop=True)

In [110]:
# import re
# l= l.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

### Splitting train data into train and test sets

In [111]:
# Hold out 20% of the labelled rows for validation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    l,
    y,
    test_size=0.2,
    random_state=2019,
    stratify=y,
)

In [112]:
# X_train

### We also tried Random Forest.

In [113]:
# model=RandomForestClassifier(n_estimators=1200, min_samples_leaf=1000, max_leaf_nodes=150, n_jobs=-1,class_weight='balanced_subsample')

In [114]:
import xgboost as xgb

In [115]:
# xgb_params = {}
# xgb_params['learning_rate'] = 0.1
# xgb_params['n_estimators'] = 230
# xgb_params['max_depth'] = 12
# xgb_params['num_leaves']=150,
# xgb_params['min_data_in_leaf']:300,

# # xgb_params['subsample'] = 0.9
# xgb_params['colsample_bytree'] = 0.176
# # xgb_params['scale_pos_weight']=4

In [116]:
from lightgbm import LGBMClassifier

In [117]:
# clf = LGBMClassifier(objective='binary',n_jobs=-1,n_estimators=230)
# clf.fit(X_train,Y_train)

In [118]:
# model.fit(X_train,Y_train)

In [119]:
# output=model.predict_proba(X_test)[:,1]

In [120]:
# from sklearn.metrics import roc_auc_score

In [121]:
# print(2*roc_auc_score(Y_test,output)-1)

In [122]:
# model=xgb.XGBClassifier(**xgb_params)

In [123]:
# scaler=StandardScaler()
# X_train=scaler.fit_transform(X_train)
# X_test=scaler.fit_transform(X_test)

In [124]:
# clf.fit(X_train,Y_train)

In [125]:
# output=clf.predict_proba(X_test)[:,1]

In [126]:
# print(2*roc_auc_score(Y_test,output)-1)

In [127]:
# model.fit(X_train,Y_train)

### We tuned hyperparameters using GridSearchCV and RandomizedSearchCV

In [128]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Hyperparameters for LightGBM Classifier

In [129]:
# Keyword arguments unpacked straight into LGBMClassifier(**parameters).
# Every value is a plain scalar: 'objective' and 'n_jobs' were list-wrapped in the form a
# search grid expects, but the estimator constructor takes single values.
parameters = {
    'learning_rate': 0.1,
    'n_estimators': 230,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'colsample_bytree': 0.176,
    'num_leaves': 150,
    'min_data_in_leaf': 300,
    'max_depth': 12,
    'n_jobs': -1,
    # Regularisation settings that were tried and left disabled:
    # 'reg_lambda': 0.6,
    # 'reg_alpha': 3,
}

In [130]:
# clf = GridSearchCV(model,parameters)

In [131]:
# Base LightGBM classifier configured from the `parameters` dict above.
model= LGBMClassifier(**parameters)

In [132]:
# model1=LGBMClassifier(**parameters1)

In [133]:
# model2=LGBMClassifier(**parameters2)

### We also tried XGBoost, Random Forest and logistic regression, but LightGBM + EasyEnsemble outperformed them.

In [134]:
# import xgboost as xgb

In [135]:
# clf=xgb.XGBClassifier(learning_rate=0.1, 
#                             n_estimators=230, 
#                             max_depth=12,
# #                            
#                             colsample_bytree=0.176,
#                             objective= 'binary:logistic',
#                             nthread=-1,
#                             scale_pos_weight=4,
#                             booster='gbtree',
# #                            
#                             tree_method = 'hist',
#                             seed=42)

In [136]:
# pd.DataFrame(Y_test).value_counts()

In [137]:
# Y_test.value_counts()

In [138]:
# 17032/96514

### Using EasyEnsembleClassifier for a better score and to avoid overfitting.

In [139]:
from imblearn.ensemble import EasyEnsembleClassifier
# Ensemble of 30 members built from the LightGBM `model`, with a fixed seed and all cores.
# sampling_strategy='majority' presumably restricts resampling to the majority class —
# TODO confirm against the imbalanced-learn documentation.
# NOTE(review): later imbalanced-learn releases appear to rename `base_estimator` to
# `estimator`; confirm against the installed version before upgrading.
clf=EasyEnsembleClassifier(n_estimators=30,base_estimator=model,random_state=42,n_jobs=-1,sampling_strategy='majority',verbose=True)

In [140]:
# model.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10)

In [141]:
# clf.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10,early_stopping_rounds=10)

### Training the model

In [142]:
# Fit the ensemble on the 80% training part of the split.
clf.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.5min remaining:  2.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


EasyEnsembleClassifier(base_estimator=LGBMClassifier(colsample_bytree=0.176,
                                                     max_depth=12,
                                                     min_data_in_leaf=300,
                                                     n_estimators=230,
                                                     n_jobs=[-1],
                                                     num_leaves=150,
                                                     objective=['binary']),
                       n_estimators=30, n_jobs=-1, random_state=42,
                       sampling_strategy='majority', verbose=True)

### Making Predictions

In [143]:
# Score the held-out rows. Column 1 of predict_proba is taken as the probability of the
# positive class (HasDetections == 1) — assumes classes are ordered [0, 1]; TODO confirm
# via clf.classes_.
output=clf.predict_proba(X_test)[:,1]

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   59.0s remaining:   59.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.0min finished


### Final roc_auc score on the held-out validation split (20% of the training data)

In [144]:
from sklearn import metrics
# Build the ROC curve from the held-out labels and predicted scores, then integrate it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=Y_test, y_score=output, pos_label=1)
auc_score = metrics.auc(x=fpr, y=tpr)
print(auc_score)

0.7170152210618284


### This gave us a final score of 0.71820 on the  private leaderboard

### Scores we got during training

In [147]:
# 0.7168935719666678

In [148]:
# 0.7167725823085187

In [149]:
# 0.7165580523619851

In [150]:
# 0.716907921413368 n_est=500, lr=0.09

In [151]:
# 0.716430307732827 -

In [152]:
# for i in test.columns:
#     if i not in l.columns:
#         print(i)

In [153]:
# 0.7057052859268755 - lightgbm, n_est=230,col=0.176,'gbdt'

In [154]:
# 0.704918546547196 - loghtgbm, n_est=230

In [155]:
# 0.7048313925777644 - n_est=250

In [156]:
# 0.704784706079872 - n_est=200

In [157]:
# 0.7041714241843127- nestimators=150

In [158]:
# 0.7028641891640863 - n_est=100

In [159]:
# 0.7025317936726614

**Test**

### Dropping MachineIdentifier from test also and saving it to use later in submission

In [160]:
# Keep the machine identifiers aside before the column is dropped; they are joined back
# onto the predictions when the submission frame is built.
# NOTE(review): the name `id` shadows Python's built-in id() for the rest of the notebook.
id=test["MachineIdentifier"]

In [161]:


# Remove the identifier so the test frame has the same feature columns as the training frame.
test = test.drop(columns="MachineIdentifier")

In [162]:
test

Unnamed: 0,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,...,Census_FlightRing_WIS,AvSigVersion,OsBuildLab,Census_OSVersion,AppVersion,Census_OSSkuName,ProductName,EngineVersion,Census_ChassisTypeName,Census_OSEdition
0,7.0,0,7945.0,2.0,1.0,1,159,77866.0,27.0,194.0,...,0.0,4324,219,177,16,1,2,38,20,1
1,7.0,0,7945.0,2.0,1.0,1,41,32657.0,27.0,52.0,...,0.0,6380,231,256,53,14,2,51,27,14
2,7.0,0,53447.0,1.0,1.0,1,180,103309.0,27.0,233.0,...,0.0,6749,226,236,53,14,2,51,15,13
3,7.0,0,53447.0,1.0,1.0,1,101,1873.0,27.0,129.0,...,0.0,6114,222,229,47,1,2,50,15,1
4,7.0,0,53447.0,1.0,1.0,1,43,143810.0,18.0,53.0,...,0.0,7166,231,259,53,14,2,52,15,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243308,7.0,0,53447.0,1.0,1.0,1,50,115291.0,27.0,68.0,...,0.0,6879,222,226,53,1,2,52,27,1
243309,7.0,0,61168.0,2.0,1.0,1,57,56452.0,18.0,76.0,...,0.0,6037,170,128,47,4,2,50,27,4
243310,7.0,0,53447.0,1.0,1.0,1,141,96055.0,48.0,167.0,...,0.0,7160,222,226,23,4,2,52,27,4
243311,7.0,0,53447.0,1.0,1.0,1,204,130775.0,27.0,257.0,...,0.0,6766,226,231,53,1,2,52,15,1


In [163]:
# def replace_ChasisType1(val):
#     val=str(val)
#     if val.isnumeric():
#         return "Notebook"
#     elif val=="nan":
#         return "UNKNOWN"
#     else:
#         return val

In [164]:
# test["Census_ChassisTypeName"]=test["Census_ChassisTypeName"].apply(replace_ChasisType1)

In [165]:
# test['Census_OSEdition']=test['Census_OSEdition'].apply(replace_OSEdition)

In [166]:
# test=scaler.fit_transform(test)

### Predicting on test data and making submission file.

In [167]:
# Score the submission rows; column 1 is taken as the positive-class probability
# (same class-order assumption as for the validation scores — TODO confirm via clf.classes_).
output_1=clf.predict_proba(test)[:,1]

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.0min remaining:  2.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.1min finished


In [168]:
# Wrap the score array in a frame; the single column is named 0 and rows are numbered from 0.
submit1=pd.DataFrame(output_1)

In [169]:
submit1.value_counts()

0.324073    4
0.605319    3
0.489545    3
0.510779    3
0.551003    2
           ..
0.521505    1
0.521505    1
0.521499    1
0.521494    1
0.027629    1
Length: 243289, dtype: int64

In [170]:
# Put identifiers and scores side by side. concat(axis=1) aligns rows on the index, so this
# relies on `id` and `submit1` both being indexed 0..len(test)-1.
submit1=pd.concat([id,submit1],axis=1)

In [171]:
submit1

Unnamed: 0,MachineIdentifier,0
0,6810c5d22b0973b53a89ef881656e192,0.307397
1,4d810281c41ae85517e447146ec15b0a,0.325236
2,d0d7e4da90f95d04cdecc0143b690e0a,0.609203
3,718b06bd3089b5a37c63ad6af86ee0cd,0.358510
4,4ab7e3633628cccb65e055d91979c31b,0.454381
...,...,...
243308,5426568056581385156f6c57eb1c16b7,0.473864
243309,4d05c57f5fa2177e4463b1d93adb9282,0.254508
243310,6aa49e8161838cca6b0584a18c45f016,0.547567
243311,690d6524746bb857e27876471a649176,0.510053


In [172]:
# Give the score column (named 0 by default) the target column's name, HasDetections.
submit1=submit1.rename(columns={0:'HasDetections'})

In [173]:
submit1

Unnamed: 0,MachineIdentifier,HasDetections
0,6810c5d22b0973b53a89ef881656e192,0.307397
1,4d810281c41ae85517e447146ec15b0a,0.325236
2,d0d7e4da90f95d04cdecc0143b690e0a,0.609203
3,718b06bd3089b5a37c63ad6af86ee0cd,0.358510
4,4ab7e3633628cccb65e055d91979c31b,0.454381
...,...,...
243308,5426568056581385156f6c57eb1c16b7,0.473864
243309,4d05c57f5fa2177e4463b1d93adb9282,0.254508
243310,6aa49e8161838cca6b0584a18c45f016,0.547567
243311,690d6524746bb857e27876471a649176,0.510053


In [174]:
id

0         6810c5d22b0973b53a89ef881656e192
1         4d810281c41ae85517e447146ec15b0a
2         d0d7e4da90f95d04cdecc0143b690e0a
3         718b06bd3089b5a37c63ad6af86ee0cd
4         4ab7e3633628cccb65e055d91979c31b
                        ...               
243308    5426568056581385156f6c57eb1c16b7
243309    4d05c57f5fa2177e4463b1d93adb9282
243310    6aa49e8161838cca6b0584a18c45f016
243311    690d6524746bb857e27876471a649176
243312    5a5b683158867d8173257efdacb74982
Name: MachineIdentifier, Length: 243313, dtype: object

In [175]:
# Write the submission file; index=False leaves the row numbers out of the CSV.
submit1.to_csv("./assignment_output36.csv",index=False)