In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold



In [2]:
# Per-column dtypes handed to pd.read_csv for train.csv, listed in the same
# order as the original mapping.
dtypes = {
    "MachineIdentifier": "str",
    "ProductName": "category",
    "EngineVersion": "category",
    "AppVersion": "category",
    "AvSigVersion": "category",
    "IsBeta": "int8",
    "RtpStateBitfield": "float16",
    "IsSxsPassiveMode": "int8",
    "DefaultBrowsersIdentifier": "float16",
    "AVProductStatesIdentifier": "float32",
    "AVProductsInstalled": "float16",
    "AVProductsEnabled": "float16",
    "HasTpm": "int8",
    "CountryIdentifier": "int32",
    "CityIdentifier": "float32",
    "OrganizationIdentifier": "float16",
    "GeoNameIdentifier": "float32",
    "LocaleEnglishNameIdentifier": "int32",
    "Platform": "category",
    "Processor": "category",
    "OsVer": "category",
    "OsBuild": "int16",
    "OsSuite": "int16",
    "OsPlatformSubRelease": "category",
    "OsBuildLab": "category",
    "SkuEdition": "category",
    "IsProtected": "float16",
    "AutoSampleOptIn": "int8",
    "PuaMode": "category",
    "SMode": "float16",
    "IeVerIdentifier": "float32",
    "SmartScreen": "category",
    "Firewall": "float16",
    "UacLuaenable": "float64",
    "Census_MDC2FormFactor": "category",
    "Census_DeviceFamily": "category",
    "Census_OEMNameIdentifier": "float32",
    "Census_OEMModelIdentifier": "float32",
    "Census_ProcessorCoreCount": "float16",
    "Census_ProcessorManufacturerIdentifier": "float16",
    "Census_ProcessorModelIdentifier": "float32",
    "Census_ProcessorClass": "category",
    "Census_PrimaryDiskTotalCapacity": "float32",
    "Census_PrimaryDiskTypeName": "category",
    "Census_SystemVolumeTotalCapacity": "float32",
    "Census_HasOpticalDiskDrive": "int8",
    "Census_TotalPhysicalRAM": "float32",
    "Census_ChassisTypeName": "category",
    "Census_InternalPrimaryDiagonalDisplaySizeInInches": "float32",
    "Census_InternalPrimaryDisplayResolutionHorizontal": "float32",
    "Census_InternalPrimaryDisplayResolutionVertical": "float32",
    "Census_PowerPlatformRoleName": "category",
    "Census_InternalBatteryType": "category",
    "Census_InternalBatteryNumberOfCharges": "float32",
    "Census_OSVersion": "category",
    "Census_OSArchitecture": "category",
    "Census_OSBranch": "category",
    "Census_OSBuildNumber": "int32",
    "Census_OSBuildRevision": "int32",
    "Census_OSEdition": "category",
    "Census_OSSkuName": "category",
    "Census_OSInstallTypeName": "category",
    "Census_OSInstallLanguageIdentifier": "float16",
    "Census_OSUILocaleIdentifier": "int32",
    "Census_OSWUAutoUpdateOptionsName": "category",
    "Census_IsPortableOperatingSystem": "int8",
    "Census_GenuineStateName": "category",
    "Census_ActivationChannel": "category",
    "Census_IsFlightingInternal": "float16",
    "Census_IsFlightsDisabled": "float16",
    "Census_FlightRing": "category",
    "Census_ThresholdOptIn": "float16",
    "Census_FirmwareManufacturerIdentifier": "float32",
    "Census_FirmwareVersionIdentifier": "float32",
    "Census_IsSecureBootEnabled": "int8",
    "Census_IsWIMBootEnabled": "float16",
    "Census_IsVirtualDevice": "float16",
    "Census_IsTouchEnabled": "int8",
    "Census_IsPenCapable": "int8",
    "Census_IsAlwaysOnAlwaysConnectedCapable": "float16",
    "Wdft_IsGamer": "float16",
    "Wdft_RegionIdentifier": "float32",
    "HasDetections": "int8",
}

# Load the training CSV using the explicit per-column dtypes defined in `dtypes`.
train = pd.read_csv("/kaggle/input/microsoft-malware-prediction/train.csv", dtype=dtypes)

# Split the dotted OsBuildLab string into five positional parts.
build_lab_parts = ['OsBuildLab0', 'OsBuildLab1', 'OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']
train[build_lab_parts] = train['OsBuildLab'].str.split('.', expand=True)

# The first part is parsed as an integer; rows with a missing OsBuildLab
# (stringified as 'nan') are mapped to 0.
build_lab0 = train['OsBuildLab0'].astype(str)
train['OsBuildLab0'] = build_lab0.mask(build_lab0 == 'nan', '0').astype(int)

# Replace each column outright. `train.loc[:, cols] = converted` writes the
# values into the existing (object) columns on recent pandas, which silently
# discards the categorical dtype that the later feature-typing step relies on.
for col in build_lab_parts[1:]:
    train[col] = train[col].astype('category')

# AvSigVersion contains a record with a literal '&#x17;' sequence; normalise it
# before splitting so every value has four dot-separated parts.
train['AvSigVersion'] = train['AvSigVersion'].astype(str)
train.loc[train['AvSigVersion'].str.contains('2&#x17;3'), 'AvSigVersion'] = '1.2173.1144.0'

# Split each four-part version string into <name>0 .. <name>3, in the same
# order the columns were originally appended.
version_part_cols = []
for base in ['Census_OSVersion', 'AvSigVersion', 'AppVersion', 'EngineVersion']:
    parts = [f'{base}{part}' for part in range(4)]
    train[parts] = train[base].str.split('.', expand=True)
    version_part_cols.extend(parts)

# NOTE(review): hard-coded row label whose AvSigVersion parts are forced to 0;
# the reason is not visible here (presumably a malformed record) - confirm.
# The membership guard prevents `.loc` from appending a new, mostly-NaN row
# when the label is absent (e.g. when working on a sample of the data).
if 5244810 in train.index:
    train.loc[5244810, ['AvSigVersion0', 'AvSigVersion1', 'AvSigVersion2', 'AvSigVersion3']] = ['0', '0', '0', '0']

# Convert the 16 version-part columns to integers by name rather than by
# position (`iloc[:, -16:]`), again assigning whole columns so the new dtype
# actually sticks.
for col in version_part_cols:
    train[col] = train[col].astype(int)

def remove_columns(data, threshold=70):
    """Drop unwanted and mostly-empty columns from ``data`` in place.

    Two groups of columns are removed:

    * a fixed list of identifier / raw-string / known-sparse columns, and
    * every column whose percentage of null values is at least ``threshold``.

    Columns from the fixed list that are not present in ``data`` are skipped,
    so the function can be applied to frames that only hold a subset of the
    original columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float, default 70
        Minimum percentage (0-100) of null values at which a column is dropped.

    Returns
    -------
    None
    """
    columns_to_be_removed = ['MachineIdentifier', 'Census_OSVersion0','OsBuildLab','EngineVersion', 'AppVersion', 'AvSigVersion','PuaMode','Census_ProcessorClass','DefaultBrowsersIdentifier','Census_IsFlightingInternal','Census_InternalBatteryType']

    # Percentage of nulls per column; an empty frame yields NaN, which never
    # satisfies the >= comparison, so nothing extra is dropped in that case.
    null_percent = data.isnull().mean() * 100
    mostly_null = set(null_percent.index[null_percent >= threshold])

    always_removed = set(columns_to_be_removed)
    # Only keep names that actually exist so drop() cannot raise KeyError.
    present = [col for col in data.columns if col in always_removed or col in mostly_null]

    data.drop(columns=present, inplace=True)

# Prune `train` in place: remove_columns drops its fixed list of columns plus
# any column that is at least 70% null, and returns nothing.
remove_columns(train)
# Bare expression so the notebook renders the resulting frame as cell output.
train

  train.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']] = train.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']].astype('category')
  train.iloc[:, -16:] = train.iloc[:, -16:].astype(int)


Unnamed: 0,ProductName,IsBeta,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,...,AvSigVersion2,AvSigVersion3,AppVersion0,AppVersion1,AppVersion2,AppVersion3,EngineVersion0,EngineVersion1,EngineVersion2,EngineVersion3
0,win8defender,0,7.0,0,53447.0,1.0,1.0,1,29,128035.0,...,1735,0,4,18,1807,18075,1,1,15100,1
1,win8defender,0,7.0,0,53447.0,1.0,1.0,1,93,1482.0,...,48,0,4,13,17134,1,1,1,14600,4
2,win8defender,0,7.0,0,53447.0,1.0,1.0,1,86,153579.0,...,1341,0,4,18,1807,18075,1,1,15100,1
3,win8defender,0,7.0,0,53447.0,1.0,1.0,1,88,20710.0,...,1527,0,4,18,1807,18075,1,1,15100,1
4,win8defender,0,7.0,0,53447.0,1.0,1.0,1,18,37376.0,...,1379,0,4,18,1807,18075,1,1,15100,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8921478,win8defender,0,7.0,0,53447.0,1.0,1.0,1,66,84963.0,...,1555,0,4,18,1807,18075,1,1,15100,1
8921479,win8defender,0,7.0,0,59914.0,2.0,1.0,1,66,82414.0,...,4218,0,4,9,10586,672,1,1,13303,0
8921480,win8defender,0,7.0,0,53447.0,1.0,1.0,1,43,134580.0,...,1242,0,4,18,1807,18075,1,1,15200,1
8921481,win8defender,0,7.0,0,6630.0,3.0,1.0,1,207,159430.0,...,1834,0,4,16,17656,18052,1,1,14901,4


In [3]:
# Snapshot the dtypes before any column is rewritten; the loop below decides
# each column's role from this snapshot.
# NOTE: this cell is not idempotent. It overwrites categorical columns with
# their integer codes, so re-running it without reloading `train` would
# classify every column as numeric.
dtypes = train.dtypes.to_dict()


CATEGORICAL_FEATURES = []
NUMERIC_FEATURES = []
NUM_CATEGORIES = {}
LABEL = "HasDetections"

for col, val in dtypes.items():
    # The target column is neither a feature nor re-encoded.
    if col == LABEL:
        continue
    # Public dtype check instead of comparing against the name of pandas'
    # internal scalar type.
    if isinstance(val, pd.CategoricalDtype):
        CATEGORICAL_FEATURES.append(col)
        NUM_CATEGORIES[col] = len(train[col].cat.categories)
        # `.cat.codes` encodes missing values as -1, which is NOT counted in
        # NUM_CATEGORIES - consumers of the codes must handle -1 themselves.
        train[col] = train[col].cat.codes

    else:
        NUMERIC_FEATURES.append(col)
        train[col] = train[col].astype(float)

# Numeric features first, then the categorical code columns.
FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES

print(CATEGORICAL_FEATURES, NUMERIC_FEATURES, NUM_CATEGORIES, sep="\n\n\n")

['ProductName', 'Platform', 'Processor', 'OsVer', 'OsPlatformSubRelease', 'SkuEdition', 'SmartScreen', 'Census_MDC2FormFactor', 'Census_DeviceFamily', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_OSVersion', 'Census_OSArchitecture', 'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel', 'Census_FlightRing', 'OsBuildLab1', 'OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']


['IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 'AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled', 'HasTpm', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite', 'IsProtected', 'AutoSampleOptIn', 'SMode', 'IeVerIdentifier', 'Firewall', 'UacLuaenable', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount', 'Censu

In [4]:
# Keep only the model inputs followed by the target (label goes last), then
# write the whole prepared dataset to a single CSV without the index.
train = train[[*FEATURES, LABEL]]
train.to_csv("data.csv", index=False)

In [5]:
# Five shuffled folds, stratified on the label, with a fixed seed so the
# partition is reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Write each fold's train and test partitions to their own CSV files.
for fold, (train_rows, test_rows) in enumerate(skf.split(train[FEATURES], train[LABEL])):
    fold_train = train.iloc[train_rows]
    fold_train.to_csv(f"fold{fold}_train.csv", index=False)

    fold_test = train.iloc[test_rows]
    fold_test.to_csv(f"fold{fold}_test.csv", index=False)