# Group project: Microsoft Malware Prediction

####  Group 10:  Alejandra Zambrano, Chenxin Xie, Manoj Kumar Purushothaman

### 1. Load libraries

In [4]:
Sys.setenv(LANG = "en")

# Data processing library

library(data.table)       # Data manipulation
library(plyr)             # Data manipulation
library(dplyr)            # Data manipulation
library(tidyr)            # Data manipulation
library(stringr)          # String, text processing
library(vita)             # Quickly check variable importance
library(dataPreparation)  # Data preparation library
library(rlist)            # Data manipulation
library(regclass)
library(tibble)


# Machine learning library
library(mlr)           # Machine learning framework
library(caret)         # Data processing and machine learning framework
library(MASS)          # LDA
library(randomForest)  # RF
library(gbm)           # Boosting Tree
library(xgboost)       # XGboost
library(parallel)      # mlr framework
library(parallelMap)   # mlr framwork
library(mlrMBO)        # mlr controller

In [5]:
# install.packages("regclass")

In [6]:
memory.limit(size=1000000000)

### 2. Data summary and Processing

### 2.1 Data summary

#### Read in data
#### Here we read a subset (20%) of the full dataset and working only with 447.000 observations (almost 5% of the dataset in Kaggle)

In [7]:
# Read data
train_full <- read.csv('./sub_train.csv')

In [8]:
set.seed(123)
train_ind <- sample(seq_len(nrow(train_full)), size = 447000)
train <- train_full[train_ind,]
train <- remove_rownames(train)

In [9]:
head(train)

MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
b17311335a5a2b3b709597a2bd3b0e13,win8defender,1.1.14901.4,4.16.17656.18052,1.269.1000.0,0,7,0,,53447,...,42130,0,,0,0,0,0,0,3,0
134b2d2ad43e5a2b8572447863033bcb,win8defender,1.1.15200.1,4.16.17656.18052,1.275.687.0,0,7,0,,53447,...,9296,0,,0,0,0,0,0,15,1
a8225b2bd182f400f517e959b1221e99,win8defender,1.1.15100.1,4.12.16299.15,1.273.309.0,0,7,0,,47238,...,33103,1,,0,0,0,0,0,1,1
624c5478a184517f786dba122df164eb,win8defender,1.1.15200.1,4.18.1807.18075,1.275.230.0,0,7,0,,47238,...,50302,1,,0,0,0,0,0,3,1
b6cb201455818a249f05f1bc59240d79,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1620.0,0,7,0,,53447,...,56711,1,,0,0,0,0,1,12,0
cab997f1b99144e150926d60a2c08ee6,win8defender,1.1.13504.0,4.11.15063.447,1.237.0.0,0,7,0,3195.0,61100,...,33054,1,,0,0,0,0,0,11,1


In [10]:
dim(train)

In [11]:
str(train)

'data.frame':	447000 obs. of  83 variables:
 $ MachineIdentifier                                : Factor w/ 1784297 levels "0000028988387b115f69f31a3bf04f09",..: 1237518 134058 1172598 685285 1274894 1413785 1697371 1242203 1618268 1638025 ...
 $ ProductName                                      : Factor w/ 5 levels "fep","mse","mseprerelease",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ EngineVersion                                    : Factor w/ 59 levels "1.1.11104.0",..: 52 56 55 56 55 22 55 55 55 55 ...
 $ AppVersion                                       : Factor w/ 102 levels "4.10.14393.0",..: 44 44 21 57 57 18 57 57 27 57 ...
 $ AvSigVersion                                     : Factor w/ 7864 levels "0.0.0.0","1.155.266.0",..: 6284 7731 7177 7566 7085 2175 7296 7069 7081 7044 ...
 $ IsBeta                                           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ RtpStateBitfield                                 : int  7 7 7 7 7 7 7 7 7 7 ...
 $ IsSxsPassiveMode                              

In [12]:
# We can observe that the target variable is balanced
table(train$HasDetections)


     0      1 
223924 223076 

### 2.2 Preprocessing data

#### Check missing values

In [13]:
stats<-c("unique_values","perc_missing_values", "perc_biggest_category", "type")
df<-data.frame(matrix(ncol=length(stats), nrow=length(colnames(train))))
names(df)<-stats
rownames(df) <- colnames(train)
df["type"] <- sapply(train, class)
for (i in colnames(train)){
    df[i,1] <- length(unique(train[[i]]))
    df[i,2] <- round(sum(is.na(train[i]))/nrow(train),2)
    df[i,3] <- round(max(table(train[i]))/nrow(train),2)

}                          

In [14]:
df <- df[order(-df["perc_missing_values"],-df["perc_biggest_category"]),]

In [15]:
df

Unnamed: 0,unique_values,perc_missing_values,perc_biggest_category,type
DefaultBrowsersIdentifier,516,0.95,0.01,integer
Census_IsFlightingInternal,2,0.83,0.17,integer
Census_ThresholdOptIn,3,0.63,0.37,integer
Census_IsWIMBootEnabled,2,0.63,0.37,integer
OrganizationIdentifier,46,0.31,0.47,integer
SMode,3,0.06,0.94,integer
CityIdentifier,35053,0.04,0.01,integer
Wdft_IsGamer,3,0.03,0.69,integer
Census_InternalBatteryNumberOfCharges,4797,0.03,0.57,numeric
Wdft_RegionIdentifier,16,0.03,0.20,integer


The variable DefaultBrowsersIdentifier have +90% missing values, which means that these column is useless and should be dropped. Also, other variables have +50% missing values, we will remove them.We can observe in some variables that one category contains more than +90% of the total values, we will remove these imbalanced columns, take into the account that our target variable is balanced.

In [16]:
newdf <- df[(df["perc_biggest_category"]<=0.9)&(df["perc_missing_values"]<=0.5),]

In [17]:
variables<-rownames(newdf)
train_red<-train[,variables]

Most of the variables are type numeric or integers however, looking at their unique values it seems that they could be treated as categorical variables. The variables with missing values will be filling as "no_info"

In [18]:
train_red[is.na(train_red)] <- 'no_info'

In [19]:
head(train_red)

OrganizationIdentifier,CityIdentifier,Wdft_IsGamer,Census_InternalBatteryNumberOfCharges,Wdft_RegionIdentifier,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_InternalPrimaryDisplayResolutionVertical,Census_InternalPrimaryDisplayResolutionHorizontal,Census_TotalPhysicalRAM,...,Census_OSUILocaleIdentifier,Census_OSInstallTypeName,LocaleEnglishNameIdentifier,GeoNameIdentifier,Census_OSVersion,Census_OSBuildRevision,CountryIdentifier,Census_ProcessorModelIdentifier,AvSigVersion,MachineIdentifier
no_info,153669,0,0,3,142,42130,768,1366,3072,...,11,UUPUpgrade,34,39,10.0.17134.48,48,21,2055,1.269.1000.0,b17311335a5a2b3b709597a2bd3b0e13
27,5113,0,0,15,897,9296,945,1680,6144,...,109,UUPUpgrade,55,24,10.0.16299.125,125,19,2457,1.275.687.0,134b2d2ad43e5a2b8572447863033bcb
no_info,87570,0,0,1,554,33103,768,1366,4096,...,31,Other,75,158,10.0.16299.371,371,139,1998,1.273.309.0,a8225b2bd182f400f517e959b1221e99
27,3137,0,4294967295,3,628,50302,900,1440,4096,...,125,Upgrade,182,211,10.0.17134.228,228,110,2094,1.275.230.0,624c5478a184517f786dba122df164eb
27,100788,1,4294967295,12,142,56711,1080,1920,12288,...,30,Clean,74,276,10.0.17134.228,228,68,2673,1.273.1620.0,b6cb201455818a249f05f1bc59240d79
no_info,75425,0,0,11,554,33054,1080,1920,12288,...,30,Other,74,224,10.0.15063.483,483,108,3063,1.237.0.0,cab997f1b99144e150926d60a2c08ee6


In [20]:
# Converting as factor all variables except MachineIdentifier
train_red["MachineIdentifier"]<-as.character(train_red$MachineIdentifier)

In [21]:
for (i in colnames(train_red)){
    if (i!= "MachineIdentifier"){
        train_red[i] <- as.factor(train_red[[i]])
    }
    
}

In [22]:
str(train_red)

'data.frame':	447000 obs. of  54 variables:
 $ OrganizationIdentifier                           : Factor w/ 46 levels "1","10","11",..: 46 16 46 16 16 46 16 16 8 46 ...
 $ CityIdentifier                                   : Factor w/ 35053 levels "100000","100004",..: 12598 23662 32116 19411 165 29460 31005 31559 10122 35053 ...
 $ Wdft_IsGamer                                     : Factor w/ 3 levels "0","1","no_info": 1 1 1 1 2 1 1 2 2 1 ...
 $ Census_InternalBatteryNumberOfCharges            : Factor w/ 4797 levels "0","1","10","100",..: 1 1 1 2727 2727 1 2727 2727 2154 2727 ...
 $ Wdft_RegionIdentifier                            : Factor w/ 16 levels "1","10","11",..: 9 7 1 9 4 3 7 9 1 7 ...
 $ Census_FirmwareManufacturerIdentifier            : Factor w/ 306 levels "1005","1006",..: 39 289 167 194 39 167 39 39 39 39 ...
 $ Census_FirmwareVersionIdentifier                 : Factor w/ 22622 levels "10003","10011",..: 10944 22315 7510 13844 16093 7488 5158 18142 20311 309 ...
 $ Census_

We will exclude the categorical variables with more than 400 levels. Computing dummy variables in these variables or performance a grouped process is computer expensive and R does not support it. The variable Census_SystemVolumeTotalCapacity could be convert into numerical, it has a lot of levels and there is not missing values.

In [23]:
v_list<-list()
for (i in colnames(train_red)){
    if (length(levels(train_red[[i]])) <400)
     v_list=list.append(v_list,i)   
}
v_list<-unlist(v_list)

In [24]:
# Get the IV and DV list name
# Dependent variable (DV)
dv_list <- c('HasDetections')
# Independent variable (IV)
iv_list <- setdiff(v_list, dv_list)  # Exclude the target variable
iv_list <- setdiff(iv_list, 'MachineIdentifier')  # Exclude the MachineIdentifier

In [25]:
trainRed <- train_red[,c('MachineIdentifier','HasDetections',iv_list)]

In [26]:
trainRed$Census_SystemVolumeTotalCapacity <- as.numeric(train_red$Census_SystemVolumeTotalCapacity)

In [27]:
head(trainRed)

MachineIdentifier,HasDetections,OrganizationIdentifier,Wdft_IsGamer,Wdft_RegionIdentifier,Census_FirmwareManufacturerIdentifier,IeVerIdentifier,Census_OSInstallLanguageIdentifier,Census_ProcessorManufacturerIdentifier,Census_GenuineStateName,...,Census_OSEdition,Census_OSSkuName,Census_OSUILocaleIdentifier,Census_OSInstallTypeName,LocaleEnglishNameIdentifier,GeoNameIdentifier,Census_OSVersion,Census_OSBuildRevision,CountryIdentifier,Census_SystemVolumeTotalCapacity
b17311335a5a2b3b709597a2bd3b0e13,0,no_info,0,3,142,137,2,5,INVALID_LICENSE,...,Professional,PROFESSIONAL,11,UUPUpgrade,34,39,10.0.17134.48,48,21,911
134b2d2ad43e5a2b8572447863033bcb,1,27,0,15,897,111,24,5,IS_GENUINE,...,Core,CORE,109,UUPUpgrade,55,24,10.0.16299.125,125,19,41037
a8225b2bd182f400f517e959b1221e99,1,no_info,0,1,554,117,8,5,IS_GENUINE,...,Core,CORE,31,Other,75,158,10.0.16299.371,371,139,51514
624c5478a184517f786dba122df164eb,1,27,0,3,628,137,29,5,IS_GENUINE,...,CoreSingleLanguage,CORE_SINGLELANGUAGE,125,Upgrade,182,211,10.0.17134.228,228,110,1735
b6cb201455818a249f05f1bc59240d79,0,27,1,12,142,137,7,5,IS_GENUINE,...,Core,CORE,30,Clean,74,276,10.0.17134.228,228,68,15682
cab997f1b99144e150926d60a2c08ee6,1,no_info,0,11,554,105,7,5,IS_GENUINE,...,Core,CORE,30,Other,74,224,10.0.15063.483,483,108,44104


#### We will review the level values in case we have to make some data processing:

In [28]:
# In Census_InternalBatteryType, we will put all levels that contains "li" together (lithium), 'n/a', #, Ã¿Ã¿Ã¿Ã¿ and unkn 
# together because they refer to missing values, keep nimh as a group (Nickel), the other one will keep as 'other'
levels(trainRed$Census_InternalBatteryType) <- c('unkn','lithm','lithm','unkn','other','unkn','unkn','other','lithm','other',
'unkn','other','other','other','other','other','other','other','lithm','other','lithm','lithm','lithm','lithm','lithm','lithm',
'lithm','lithm','lithm','lithm','lithm','lithm','other','unkn','nimh','other','other','other','other','other','unkn','other',
'other')

In [29]:
# In Census_PowerPlatformRoleName, we will put togheter UNKNOWN and Unspecified levels
levels(trainRed$Census_PowerPlatformRoleName) <- c('unkn','AppliancePC','Desktop','EnterpriseServer','Mobile',
                                                    'PerformanceServer','Slate','SOHOServer','unkn','unkn','Workstation')

In [30]:
# In Census_PrimaryDiskTypeName, we will put togheter UNKNOWN and Unspecified levels
levels(trainRed$Census_PrimaryDiskTypeName) <- c('unkn','HDD','SSD','unkn','unkn')

In [31]:
# In Census_ChassisTypeName, we will put togheter UNKNOWN and Unknown levels
levels(trainRed$Census_ChassisTypeName) <- c('unkn','0','127','30','31','35','36','88','AllinOne','Blade','BladeEnclosure',
'BusExpansionChassis','CompactPCI','Convertible','Desktop','Detachable','ExpansionChassis','HandHeld','Laptop',
'LowProfileDesktop','LunchBox','MainServerChassis','MiniPC','MiniTower','MultisystemChassis','Notebook','Other','PizzaBox',
'Portable','RackMountChassis','SealedCasePC','SpaceSaving','StickPC','SubChassis','SubNotebook','Tablet','Tower','unkn','unkn')

In [32]:
# In Census_ActivationChannel, we will change the name of the levels for proper manipulation
levels(trainRed$Census_ActivationChannel) <- c('OEM_DM','OEM_NONSLP','Retail','Retail_Eval','Volume_GVLK','Volume_MAK')

In [33]:
# In SmartScreen, we will put togheter some repeted variables
levels(trainRed$SmartScreen) <- c('unkn','x01','x02','x03','unkn','Block','ExistsNotSet','off','off','off','on','on',
                                   'Prompt','RequireAdmin','RequireAdmin','Warn','Warn')

In [34]:
str(trainRed)

'data.frame':	447000 obs. of  40 variables:
 $ MachineIdentifier                     : chr  "b17311335a5a2b3b709597a2bd3b0e13" "134b2d2ad43e5a2b8572447863033bcb" "a8225b2bd182f400f517e959b1221e99" "624c5478a184517f786dba122df164eb" ...
 $ HasDetections                         : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 1 1 ...
 $ OrganizationIdentifier                : Factor w/ 46 levels "1","10","11",..: 46 16 46 16 16 46 16 16 8 46 ...
 $ Wdft_IsGamer                          : Factor w/ 3 levels "0","1","no_info": 1 1 1 1 2 1 1 2 2 1 ...
 $ Wdft_RegionIdentifier                 : Factor w/ 16 levels "1","10","11",..: 9 7 1 9 4 3 7 9 1 7 ...
 $ Census_FirmwareManufacturerIdentifier : Factor w/ 306 levels "1005","1006",..: 39 289 167 194 39 167 39 39 39 39 ...
 $ IeVerIdentifier                       : Factor w/ 183 levels "1","10","102",..: 14 8 10 14 14 5 8 10 14 14 ...
 $ Census_OSInstallLanguageIdentifier    : Factor w/ 40 levels "1","10","11",..: 12 17 38 22 37 37 35 18 38 17 

#### Create Dummy Variables

In [35]:
# Getting variables to convert into dummy, Census_IsTouchEnabled and Census_IsSecureBootEnabled are dummy already
v_dummy<-list()
for (i in colnames(trainRed)){
    if (length(levels(trainRed[[i]]))>2)
     v_dummy=list.append(v_dummy,i)   
}
v_dummy <- unlist(v_dummy)
v_dummy

In [36]:
dummy<-data.frame(trainRed[c("MachineIdentifier","HasDetections")])
for (i in v_dummy){
    temp <- trainRed[,c('MachineIdentifier',i)]
    temp <- spread(temp,i,i,convert=TRUE)
    temp <- temp[, c(3:(ncol(temp)))]
    names <- colnames(temp)
    colnames(temp) <- paste(i, names, sep='_')
    temp[!is.na(temp)]<- 1
    temp[is.na(temp)]<- 0
    dummy <- cbind(dummy,temp)    
}

In [37]:
# Joining dummy dataset with the rest of variables
other<-trainRed[c('Census_IsTouchEnabled','Census_IsSecureBootEnabled','Census_SystemVolumeTotalCapacity')]
train_dummy <- cbind(dummy,other)

In [38]:
head(train_dummy)

MachineIdentifier,HasDetections,OrganizationIdentifier_10,OrganizationIdentifier_11,OrganizationIdentifier_12,OrganizationIdentifier_14,OrganizationIdentifier_15,OrganizationIdentifier_16,OrganizationIdentifier_18,OrganizationIdentifier_19,...,CountryIdentifier_216,CountryIdentifier_217,CountryIdentifier_218,CountryIdentifier_219,CountryIdentifier_220,CountryIdentifier_221,CountryIdentifier_222,Census_IsTouchEnabled,Census_IsSecureBootEnabled,Census_SystemVolumeTotalCapacity
b17311335a5a2b3b709597a2bd3b0e13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,911
134b2d2ad43e5a2b8572447863033bcb,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,41037
a8225b2bd182f400f517e959b1221e99,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,51514
624c5478a184517f786dba122df164eb,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1735
b6cb201455818a249f05f1bc59240d79,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,15682
cab997f1b99144e150926d60a2c08ee6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,44104


In [39]:
# Dummy variables as factor
dum <- which(!names(train_dummy) %in% c('MachineIdentifier', 'HasDetections','Census_SystemVolumeTotalCapacity'))
train_dummy[,dum]  <- lapply(train_dummy[,dum], as.factor)

In [40]:
# Dropping dummy variables with less than 1000 in level 1
drop<-c()
for (i in dum){
    if (sum(train_dummy[,i]==1)<1000)
    drop<-c(drop,i)
}

train_dummy<-train_dummy[,-drop]

### 2.3 Variable Selection

#### Split train data into train_fit, valid, test (60:20:20)

In [41]:
set.seed(123)

train_idx <- caret::createDataPartition(y=train_dummy[, 'HasDetections'], p=.6, list=F)
train_fit <- train_dummy[train_idx, ]  # Train 60%
valid_test <- train_dummy[-train_idx, ]  # Valid + Test 40%

valid_idx <- caret::createDataPartition(y=valid_test[, 'HasDetections'], p=.5, list=F)
valid <- valid_test[valid_idx, ]  # Valid 20%
test <- valid_test[-valid_idx, ]  # Test 20%

#### Check the target variable class distribution

In [42]:
# Train_fit
ddply(train_fit, "HasDetections", summarise, count = length(HasDetections), 
    percentage = round(length(HasDetections)/nrow(train_fit), 2))

HasDetections,count,percentage
0,134355,0.5
1,133846,0.5


In [43]:
# Vaild
ddply(valid, "HasDetections", summarise, count = length(HasDetections), 
    percentage = round(length(HasDetections)/nrow(valid), 2))

HasDetections,count,percentage
0,44785,0.5
1,44615,0.5


In [44]:
# Test
ddply(test, "HasDetections", summarise, count = length(HasDetections), 
    percentage = round(length(HasDetections)/nrow(test), 2))

HasDetections,count,percentage
0,44784,0.5
1,44615,0.5


In [45]:
# Rename the data columns
for (v in colnames(train_fit)) {
    
    # Fix the column name
    fix_name <- str_replace_all(v, "[^[:alnum:] ]", "_")
    fix_name <- gsub(' +', '', fix_name) 
    
    # Train, valid,test
    colnames(train_fit)[colnames(train_fit) == v] <- fix_name
    colnames(valid)[colnames(valid) == v] <- fix_name
    colnames(test)[colnames(test) == v] <- fix_name
}

In [46]:
# Convert variables as numeric in training dataset to calculate correlation
no_convert <- c('MachineIdentifier','Census_SystemVolumeTotalCapacity')
for (i in names(train_fit)){
    if (!(i %in% no_convert)){
        train_fit[i]<-as.numeric(levels(train_fit[[i]])[train_fit[[i]]])
    }
  
}

In [47]:
# Looking for correlated features
no_cor<-c(1,2)
cor <- findCorrelation(train_fit[,-no_cor], cutoff=0.75, names = TRUE)

In [48]:
# Removing redundant reatures
keep <- setdiff(colnames(train_fit),cor)
train_fit<-train_fit[,keep]

#### FisherScore

In [49]:
FisherScore <- function(basetable, depvar, IV_list) {
  "
  This function calculate the Fisher score of a variable.
  
  Ref:
  ---
  Verbeke, W., Dejaeger, K., Martens, D., Hur, J., & Baesens, B. (2012). New insights into churn prediction in the telecommunication sector: A profit driven data mining approach. European Journal of Operational Research, 218(1), 211-229.
  "
  
  # Get the unique values of dependent variable
  DV <- unique(basetable[, depvar])
  
  IV_FisherScore <- c()
  
  for (v in IV_list) {
    fs <- abs((mean(basetable[which(basetable[, depvar]==DV[1]), v]) - mean(basetable[which(basetable[, depvar]==DV[2]), v]))) /
      sqrt((var(basetable[which(basetable[, depvar]==DV[1]), v]) + var(basetable[which(basetable[, depvar]==DV[2]), v])))
    IV_FisherScore <- c(IV_FisherScore, fs)
  }
  
  return(data.frame(IV=IV_list, fisher_score=IV_FisherScore))
}

varSelectionFisher <- function(basetable, depvar, IV_list, num_select) {
  "
  This function will calculate the Fisher score for all IVs and select the best
  top IVs.

  Assumption: all variables of input dataset are converted into numeric type.
  "
  
  fs <- FisherScore(basetable, depvar, IV_list)  # Calculate Fisher Score for all IVs
  num_select <- min(num_select, ncol(basetable))  # Top N IVs to be selected
  return(as.vector(fs[order(fs$fisher_score, decreasing=T), ][1:num_select, 'IV']))
}

In [50]:
# Calculate Fisher Score for all variable
# Get the IV and DV list
dv_list <- c('HasDetections')  # DV list
iv_list <- setdiff(names(train_fit), dv_list)  # IV list excluded DV
iv_list <- setdiff(iv_list, 'MachineIdentifier')  # Excluded the MachineIdentifier

fs <- FisherScore(train_fit, dv_list, iv_list)
fs <- fs[order(-fs$fisher_score),]
head(fs)

Unnamed: 0,IV,fisher_score
13,IeVerIdentifier_114,0.007216679
92,LocaleEnglishNameIdentifier_262,0.007209489
111,GeoNameIdentifier_29,0.006595798
132,Census_OSVersion_10_0_15063_540,0.006568268
146,Census_OSBuildRevision_540,0.006568268
90,LocaleEnglishNameIdentifier_246,0.006225686


In [51]:
# Select top 50 variables according to the Fisher Score
best_fs_var <- varSelectionFisher(train_fit, dv_list, iv_list, num_select=50)
head(best_fs_var, 10)

In [52]:
# Apply variable selection to the data
# Train
var_select <- names(train_fit)[names(train_fit) %in% best_fs_var]
train_sel <- train_fit[, c('MachineIdentifier', var_select, 'HasDetections')]
# Valid
var_select <- names(valid)[names(valid) %in% best_fs_var]
valid_sel <- valid[, c('MachineIdentifier', var_select, 'HasDetections')]
# Test
var_select <- names(test)[names(test) %in% best_fs_var]
test_sel <- test[, c('MachineIdentifier', var_select, 'HasDetections')]

In [53]:
# Target Variable as factor for training the models
train_sel$HasDetections<- as.factor(train_sel$HasDetections)

# Dummy variables as factor
dum <- which(!names(train_sel) %in% c('MachineIdentifier', 'HasDetections','Census_SystemVolumeTotalCapacity'))
train_sel[,dum]  <- lapply(train_sel[,dum], as.factor)

### 3. Methodology

### 3.1 Logistic Regression

In [4]:
# Set up cross-validation
rdesc = makeResampleDesc("CV", iters=5, predict="both")

# Define the model
lg_lrn <- makeLearner("classif.logreg", predict.type="prob")

# Define the task
lg_task <- makeClassifTask(id="maleware_train", data=train_sel[, -1], target="HasDetections")

# Set hyper parameter tuning
tune_params <- makeParamSet(
    
           makeLogicalLearnerParam("model", default = TRUE, tunable = TRUE)
)

ctrl = makeTuneControlGrid(resolution = 10L)

parallelStartSocket(cpus = detectCores())
 
lgPars <- tuneParams(lg_lrn, task = lg_task,
                     resampling = rdesc,
                     par.set = tune_params,
                     control = ctrl)
 
parallelStop()

Starting parallelization in mode=socket with cpus=8.
[Tune] Started tuning learner classif.logreg for parameter set:
         Type len  Def Constr Req Tunable Trafo
model logical   - TRUE      -   -    TRUE     -
With control class: TuneControlGrid
Imputation value: 1
Exporting objects to slaves for mode socket: .mlr.slave.options
Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 2.
[Tune] Result: model=FALSE : mmce.test.mean=0.4865754
Stopped parallelization. All cleaned up.


In [5]:
# check the tuned parameters
lgPars$x

In [6]:
# set learner with tuned parameters
tunedLG <- setHyperPars(lg_lrn, par.vals = lgPars$x)
# retain the model 
lgModel <- mlr::train(tunedLG, lg_task)

In [7]:
# Make prediction on valid data
pred <- predict(lgModel, newdata=valid_sel[, -1])
performance(pred, measures=mlr::auc)

"prediction from a rank-deficient fit may be misleading"

In [8]:
# Make prediction on valid data
pred <- predict(lgModel, newdata=test_sel[, -1])
performance(pred, measures=mlr::auc)

"prediction from a rank-deficient fit may be misleading"

### 3.2 Random Forest

In [4]:
# Set up cross-validation
rdesc = makeResampleDesc("CV", iters=5)

# Define the model
rf_lrn <- makeLearner("classif.randomForest", predict.type="prob")

# Define the task
rf_task <- makeClassifTask(id="maleware_train", data=train_sel[, -1], target="HasDetections")

# Set hyper parameter tuning
tune_params <- makeParamSet(
  makeIntegerParam("ntree",lower = 50, upper = 150),
  makeIntegerParam("mtry", lower = 1, upper = 5),
  makeIntegerParam("nodesize", lower = 10, upper = 50)
)
ctrl = makeTuneControlRandom(maxit=20L)

parallelStartSocket(cpus = detectCores())
 
rfPars <- tuneParams(rf_lrn, task = rf_task,
                     resampling = rdesc,
                     par.set = tune_params,
                     control = ctrl)
 
parallelStop()

Starting parallelization in mode=socket with cpus=8.
[Tune] Started tuning learner classif.randomForest for parameter set:
            Type len Def    Constr Req Tunable Trafo
ntree    integer   -   - 50 to 150   -    TRUE     -
mtry     integer   -   -    1 to 5   -    TRUE     -
nodesize integer   -   -  10 to 50   -    TRUE     -
With control class: TuneControlRandom
Imputation value: 1
Exporting objects to slaves for mode socket: .mlr.slave.options
Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 20.
[Tune] Result: ntree=57; mtry=5; nodesize=25 : mmce.test.mean=0.4781563
Stopped parallelization. All cleaned up.


In [5]:
# check the tuned parameters
rfPars$x

In [6]:
# set learner with tuned parameters
tunedRF <- setHyperPars(rf_lrn, par.vals = rfPars$x)
# retain the model 
rfModel <- mlr::train(tunedRF, rf_task)

In [7]:
# Make prediction on valid data
pred <- predict(rfModel, newdata=valid_sel[, -1])
performance(pred, measures=mlr::auc)

In [8]:
# Make prediction on test data
pred <- predict(rfModel, newdata=test_sel[, -1])
performance(pred, measures=mlr::auc)

### 3.3 XGBoost (Extreme Gradient Boosting)

In [9]:
# Define the model
gb_lrn <- makeLearner("classif.xgboost", predict.type="prob")

# Define the task
gb_task <- makeClassifTask(id="maleware_train", data=train_sel[, -1], target="HasDetections")

# Set up cross-validation
rdesc <- makeResampleDesc("CV", iters=10, predict="both")
cv_inst <- makeResampleInstance(rdesc, task = gb_task)

# Set hyper parameter tuning
tune_params <- makeParamSet(
   makeIntegerParam("nrounds", lower = 100, upper = 1000),
  makeIntegerParam("max_depth", lower = 1, upper = 15),
  makeNumericParam("eta", lower = .001, upper = .5),
  makeNumericParam("lambda", lower = -1, upper = 3, trafo = function(x) 10^x))
                   
# set tune control                   
mbo.ctrl <- makeMBOControl()
mbo.ctrl <- setMBOControlTermination(mbo.ctrl, iters = 50)
ctrl <- mlr:::makeTuneControlMBO(mbo.control = mbo.ctrl)

parallelStartSocket(cpus = detectCores())
 
xgbstPars <- tuneParams(gb_lrn, task = gb_task,
                     resampling = cv_inst,
                     par.set = tune_params,
                     control = ctrl)
 
parallelStop()

Starting parallelization in mode=socket with cpus=8.
[Tune] Started tuning learner classif.xgboost for parameter set:
             Type len Def       Constr Req Tunable Trafo
nrounds   integer   -   - 100 to 1e+03   -    TRUE     -
max_depth integer   -   -      1 to 15   -    TRUE     -
eta       numeric   -   - 0.001 to 0.5   -    TRUE     -
lambda    numeric   -   -      -1 to 3   -    TRUE     Y
With control class: TuneControlMBO
Imputation value: 1
Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; elements = 16.
"Stopped because hard maximum generation limit was hit."Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; elements = 1.
Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; elements = 1.
Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; elements = 1.
Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; elements = 1.
Mapping in parallel: mode = socket; level = mlrMBO.feval; cpus = 8; element

In [10]:
# check the tuned parameters
xgbstPars$x

In [11]:
# set learner with tuned parameters
tunedXGB <- setHyperPars(gb_lrn, par.vals = xgbstPars$x)
# retain the model 
xgbModel <- mlr::train(tunedXGB, gb_task)

In [12]:
# Make prediction on valid data
pred <- predict(xgbModel, newdata=valid_sel[, -1])
performance(pred, measures=mlr::auc)

In [13]:
# Make prediction on test data
pred <- predict(xgbModel, newdata=test_sel[, -1])
performance(pred, measures=mlr::auc)