In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

dtypes = {
        'MachineIdentifier':                                    'str',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int32',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float32',
        'LocaleEnglishNameIdentifier':                          'int32',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float32',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int32',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int32',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float32',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float32',
        'HasDetections':                                        'int8'
    }

malware = pd.read_csv('train.csv',dtype=dtypes)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [2]:
malware[['OsBuildLab0', 'OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']] = malware.loc[:, 'OsBuildLab'].str.split('.', expand=True)
malware.loc[:, 'OsBuildLab0'] = malware['OsBuildLab0'].astype(str)
malware.loc[malware['OsBuildLab0'] == 'nan', 'OsBuildLab0']  = 0
malware['OsBuildLab0'] = malware['OsBuildLab0'].astype(int)
malware.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']] = malware.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']].astype('category')
 

  malware.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']] = malware.loc[:, ['OsBuildLab1','OsBuildLab2', 'OsBuildLab3', 'OsBuildLab4']].astype('category')


In [3]:
malware['AvSigVersion'] = malware['AvSigVersion'].astype(str)
malware.loc[malware.AvSigVersion.str.contains('2&#x17;3'),'AvSigVersion'] = '1.2173.1144.0'

In [4]:
malware[['Census_OSVersion0', 'Census_OSVersion1','Census_OSVersion2', 'Census_OSVersion3']] = malware.loc[:, 'Census_OSVersion'].str.split('.', expand=True)

In [5]:
malware[['AvSigVersion0', 'AvSigVersion1','AvSigVersion2', 'AvSigVersion3']] = malware.loc[:, 'AvSigVersion'].str.split('.', expand=True)
malware[['AppVersion0', 'AppVersion1','AppVersion2', 'AppVersion3']] = malware.loc[:, 'AppVersion'].str.split('.', expand=True)
malware[['EngineVersion0', 'EngineVersion1','EngineVersion2', 'EngineVersion3']] = malware.loc[:, 'EngineVersion'].str.split('.', expand=True)

In [6]:
malware.loc[5244810, ['AvSigVersion0', 'AvSigVersion1' ,'AvSigVersion2', 'AvSigVersion3']] = [0,0,0,0]

In [7]:
malware.iloc[:, -16:] = malware.iloc[:, -16:].astype(int)

  malware.iloc[:, -16:] = malware.iloc[:, -16:].astype(int)


In [8]:
def remove_columns(data):
    """
    Computing percent of null values in a dataset based on the feature and removing those features having 70 or more than 70 percent of null values
    """
    columns_to_be_removed = []
    percent = (data.isnull().sum()/data.shape[0]) * 100

    for col in data.columns:
        if percent.loc[col] >= 70:
            columns_to_be_removed.append(col)
    new_data = data.drop(columns=columns_to_be_removed)
  
    return new_data

In [9]:
malware = remove_columns(malware)

In [10]:
from sklearn.model_selection import train_test_split
_, malware = train_test_split(malware,
                              stratify=malware['HasDetections'], 
                              test_size=0.1)

In [11]:
import re
malware = malware.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [12]:
from pycaret.classification import *
clf1 = setup(data = malware, 
             target = 'HasDetections',
             ignore_features=['MachineIdentifier', 'Census_OSVersion0','OsBuildLab','EngineVersion', 'AppVersion', 'AvSigVersion','PuaMode','Census_ProcessorClass','DefaultBrowsersIdentifier','Census_IsFlightingInternal','Census_InternalBatteryType'],
             remove_multicollinearity=True, 
             multicollinearity_threshold=0.8 ,
             feature_selection=True,
             use_gpu=True,
             n_features_to_select=40,
             normalize = True,
             low_variance_threshold = 0.1, 
             fold_strategy=skf,
             log_experiment = True, 
             log_plots = True, 
             log_data = True,
             memory=False,
            )

[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device 

Unnamed: 0,Description,Value
0,Session id,6540
1,Target,HasDetections
2,Target type,Binary
3,Original data shape,"(892149, 99)"
4,Transformed data shape,"(892149, 41)"
5,Transformed train set shape,"(624504, 41)"
6,Transformed test set shape,"(267645, 41)"
7,Ignore features,11
8,Numeric features,66
9,Categorical features,26


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] Unknown device type cuda
[LightGBM] [Fatal] Unknown device type cuda


In [13]:
best = compare_models(include = ['xgboost'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.6524,0.715,0.6483,0.6535,0.6509,0.3048,0.3048,28.576


In [16]:
print(best)

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)


In [None]:
evaluate_model(best)

In [None]:
plot_model(best, plot = 'auc')

In [None]:
tuned_dt = tune_model(
    best, n_iter = 10,optimize = 'auc', search_library = 'tune-sklearn' ,choose_better = True, 
    search_algorithm = 'bohb',early_stopping = True
)

print(tuned_dt)

0,1
Current time:,2023-06-18 01:33:55
Running for:,00:00:25.26
Memory:,29.2/125.8 GiB

Trial name,status,loc,actual_estimator__co lsample_bytree,actual_estimator__le arning_rate,actual_estimator__ma x_depth,actual_estimator__mi n_child_weight,actual_estimator__n_ estimators,actual_estimator__re g_alpha,actual_estimator__re g_lambda,actual_estimator__sc ale_pos_weight,actual_estimator__su bsample
_PipelineTrainable_ee956d74,RUNNING,172.17.0.3:38875,0.738177,1.02624e-06,4,2,19,1.59406e-09,1.29462e-05,7.15155,0.238958


In [None]:
tuned_dt.get_all_params()

In [None]:
second_tuned = tune_model(best, n_iter = 250,optimize = 'auc',search_library='optuna',choose_better = True)
print(second_tuned)

In [None]:
best = ensemble_model(best, n_estimators = 60,choose_better = True)

In [None]:
best = ensemble_model(best, n_estimators = 60, method = 'Boosting',choose_better = True)

In [None]:
third_tuned = tune_model(best, n_iter = 250,optimize = 'auc',search_library='tune-sklearn',search_algorithm='bayesian',choose_better = True)
print(third_tuned)