# Group project: Microsoft Malware Prediction

####  Group 10:  Alejandra Zambrano, Chenxin Xie, Manoj Kumar Purushothaman

### 1. Load libraries

In [None]:
Sys.setenv(LANG = "en")

# Data processing library

library(data.table)       # Data manipulation
library(plyr)             # Data manipulation
library(dplyr)            # Data manipulation
library(tidyr)            # Data manipulation
library(stringr)          # String, text processing
library(vita)             # Quickly check variable importance
library(dataPreparation)  # Data preparation library
library(rlist)            # Data manipulation
library(regclass)
library(tibble)
library(parallel)
library(mlrMBO)

# Machine learning library
library(mlr)           # Machine learning framework
library(caret)         # Data processing and machine learning framework
library(MASS)          # LDA
library(randomForest)  # RF
library(gbm)           # Boosting Tree
library(xgboost)       # XGboost


In [None]:
#install.packages("regclass")

In [None]:
# Raise the memory ceiling for the session. memory.limit() is only functional
# on Windows builds of R older than 4.2; elsewhere it is a stub that just warns,
# so the call is restricted to the platforms where it has an effect.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 1000000000)
}

### 2. Data summary and Processing

### 2.1 Data summary

#### Read in data
#### Here we read a 20% subset of the full dataset and then work with only 447,000 observations (almost 5% of the Kaggle dataset)

In [None]:
# Load the pre-extracted subset of the training data from the working directory
train_full <- read.csv(file = './sub_train.csv')

In [None]:
# Draw the row indices of the working sample (fixed seed for reproducibility).
# The requested size is capped at the number of available rows so that a
# smaller input file does not make sample() fail.
set.seed(123)
sample_size <- min(447000, nrow(train_full))
train_ind <- sample(seq_len(nrow(train_full)), size = sample_size)

In [None]:
# Keep only the sampled rows
train <- train_full[train_ind, ]

In [None]:
# Discard the row names inherited from the full data set
train <- tibble::remove_rownames(train)

In [None]:
# Preview the first rows of the sampled data
head(train)

In [None]:
# Number of rows and columns in the sample
dim(train)

In [None]:
# Column types and example values of the sample
str(train)

In [None]:
# Frequency of each value of the target variable HasDetections; the two
# classes are expected to be roughly balanced
table(train$HasDetections)

### 2.2 Preprocessing data

#### Check missing values

In [None]:
# Per-column profile of `train` (one row per column of `train`):
#   unique_values         - number of distinct values (NA counts as a value)
#   perc_missing_values   - share of rows that are NA
#   perc_biggest_category - share of rows held by the most frequent non-NA value
#   type                  - first class of the column
stats <- c("unique_values", "perc_missing_values", "perc_biggest_category", "type")
n_rows <- nrow(train)

# Share of the most frequent non-NA value; NA when the column has no non-NA
# values (table() ignores NA, so max() would otherwise return -Inf).
top_share <- function(col) {
  counts <- table(col)
  if (length(counts) == 0) {
    return(NA_real_)
  }
  round(max(counts) / n_rows, 2)
}

df <- data.frame(
  unique_values = vapply(train, function(col) length(unique(col)), integer(1)),
  perc_missing_values = vapply(
    train, function(col) round(sum(is.na(col)) / n_rows, 2), numeric(1)
  ),
  perc_biggest_category = vapply(train, top_share, numeric(1)),
  # class() can return several classes; keep the first so the result is a vector
  type = vapply(train, function(col) class(col)[1], character(1)),
  row.names = colnames(train),
  stringsAsFactors = FALSE
)
df <- df[stats]

In [None]:
# Sort columns by share of missing values, then by dominance of the biggest
# category (both descending). The sort keys are extracted as plain vectors
# with [[ ]] so order() is not handed one-column data frames.
df <- df[order(-df[["perc_missing_values"]], -df[["perc_biggest_category"]]), ]

In [None]:
# Display the sorted per-column summary
df

The variable DefaultBrowsersIdentifier has more than 90% missing values, which means this column is useless and should be dropped. Other variables have more than 50% missing values, and we will remove them as well. In some variables a single category accounts for more than 90% of the values; we will remove these imbalanced columns too, taking into account that our target variable is balanced.

In [None]:
# Keep the columns whose biggest category covers at most 90% of the rows and
# whose share of missing values is at most 50%
not_dominated <- df[["perc_biggest_category"]] <= 0.9
mostly_present <- df[["perc_missing_values"]] <= 0.5
newdf <- df[not_dominated & mostly_present, ]

In [None]:
# Names of the retained columns (the summary table is indexed by column name)
variables <- row.names(newdf)

In [None]:
# Reduced training data restricted to the retained columns
train_red <- train[, variables]

Most of the variables are of type numeric or integer; however, judging by their unique values, they can be treated as categorical variables. Missing values will be filled with "no_info".

In [None]:
# Replace every missing cell with the label 'no_info' so that missingness
# becomes its own category.
# NOTE(review): writing a string into a numeric column presumably turns that
# column into character; the factor conversion that follows tolerates this,
# but confirm before reusing such a column as a number.
train_red[is.na(train_red)] <- 'no_info'

In [None]:
# Preview the reduced data after filling missing values
head(train_red)

In [None]:
# MachineIdentifier is a row identifier, not a feature: store it as plain
# character so it is left out of the factor conversion
train_red[["MachineIdentifier"]] <- as.character(train_red[["MachineIdentifier"]])

In [None]:
# Turn every column except MachineIdentifier into a factor
factor_cols <- setdiff(colnames(train_red), "MachineIdentifier")
train_red[factor_cols] <- lapply(train_red[factor_cols], as.factor)

In [None]:
# Check the column types after the factor conversion
str(train_red)

We will exclude the categorical variables with 400 or more levels. Computing dummy variables for these variables, or grouping their levels, is computationally expensive and R does not support it. The variable Census_SystemVolumeTotalCapacity can be converted to numeric: it has many levels and no missing values.

In [None]:
# Names of the columns with fewer than 400 factor levels (non-factor columns
# have no levels and therefore always qualify)
few_levels <- vapply(train_red, function(col) length(levels(col)) < 400, logical(1))
v_list <- unlist(as.list(colnames(train_red)[few_levels]))

In [None]:
# Split the retained column names into target and predictors
# Dependent variable (DV)
dv_list <- c('HasDetections')
# Independent variables (IV): everything except the target and the identifier
iv_list <- setdiff(v_list, c(dv_list, 'MachineIdentifier'))

In [None]:
# Working table: identifier first, target second, then the predictors
keep_cols <- c('MachineIdentifier', 'HasDetections', iv_list)
trainRed <- train_red[, keep_cols]

In [None]:
# Bring the volume capacity back as a numeric predictor. The column in
# train_red is a factor at this point, and as.numeric() on a factor returns
# the internal level codes rather than the values, so convert through
# character to recover the actual capacities.
trainRed$Census_SystemVolumeTotalCapacity <- as.numeric(
  as.character(train_red$Census_SystemVolumeTotalCapacity)
)

In [None]:
# Preview the working table
head(trainRed)

We will review the level values in case we have to make some data processing:

In [None]:
# Census_InternalBatteryType: group the levels containing "li" as lithium
# ('lithm'); group 'n/a', '#', the garbled 'ÿÿÿÿ' entry and 'unkn' as unknown
# ('unkn') because they stand for missing values; keep 'nimh' (nickel) as its
# own group; everything else becomes 'other'.
# The relabelling is positional: the i-th string below replaces the i-th
# existing level, and levels given the same new label are merged.
# NOTE(review): the original level names are not visible here — check
# levels(trainRed$Census_InternalBatteryType) against this vector before
# running on a different sample.
levels(trainRed$Census_InternalBatteryType) <- c('unkn','lithm','lithm','unkn','other','unkn','unkn','other','lithm','other',
'unkn','other','other','other','other','other','other','other','lithm','other','lithm','lithm','lithm','lithm','lithm','lithm',
'lithm','lithm','lithm','lithm','lithm','lithm','other','unkn','nimh','other','other','other','other','other','unkn','other',
'other')

In [None]:
# Census_PowerPlatformRoleName: merge the UNKNOWN and Unspecified levels (and
# the first level) into 'unkn'.
# Positional relabelling: the i-th string replaces the i-th existing level.
# NOTE(review): assumes the current level order matches this vector — confirm.
levels(trainRed$Census_PowerPlatformRoleName) <- c('unkn','AppliancePC','Desktop','EnterpriseServer','Mobile',
                                                    'PerformanceServer','Slate','SOHOServer','unkn','unkn','Workstation')

In [None]:
# Census_PrimaryDiskTypeName: merge the UNKNOWN and Unspecified levels (and
# the first level) into 'unkn', keeping HDD and SSD.
# Positional relabelling: the i-th string replaces the i-th existing level.
# NOTE(review): assumes the current level order matches this vector — confirm.
levels(trainRed$Census_PrimaryDiskTypeName) <- c('unkn','HDD','SSD','unkn','unkn')

In [None]:
# Census_ChassisTypeName: merge the UNKNOWN and Unknown levels (and the first
# level) into 'unkn'; all other levels keep a name of their own.
# Positional relabelling: the i-th string replaces the i-th existing level.
# NOTE(review): assumes the current level order matches this vector — confirm.
levels(trainRed$Census_ChassisTypeName) <- c('unkn','0','127','30','31','35','36','88','AllinOne','Blade','BladeEnclosure',
'BusExpansionChassis','CompactPCI','Convertible','Desktop','Detachable','ExpansionChassis','HandHeld','Laptop',
'LowProfileDesktop','LunchBox','MainServerChassis','MiniPC','MiniTower','MultisystemChassis','Notebook','Other','PizzaBox',
'Portable','RackMountChassis','SealedCasePC','SpaceSaving','StickPC','SubChassis','SubNotebook','Tablet','Tower','unkn','unkn')

In [None]:
# Census_ActivationChannel: rename the levels to identifier-friendly labels so
# they are easier to handle later on.
# Positional relabelling: the i-th string replaces the i-th existing level.
# NOTE(review): assumes the current level order matches this vector — confirm.
levels(trainRed$Census_ActivationChannel) <- c('OEM_DM','OEM_NONSLP','Retail','Retail_Eval','Volume_GVLK','Volume_MAK')

In [None]:
# SmartScreen: merge levels that repeat the same value (several levels map to
# each of 'off', 'on', 'RequireAdmin', 'Warn' and 'unkn').
# Positional relabelling: the i-th string replaces the i-th existing level.
# NOTE(review): assumes the current level order matches this vector — confirm.
levels(trainRed$SmartScreen) <- c('unkn','x01','x02','x03','unkn','Block','ExistsNotSet','off','off','off','on','on',
                                   'Prompt','RequireAdmin','RequireAdmin','Warn','Warn')

In [None]:
# Check the structure after relabelling the factor levels
str(trainRed)

#### Create Dummy Variables

In [None]:
# Columns that need dummy encoding: every factor with more than two levels.
# Census_IsTouchEnabled and Census_IsSecureBootEnabled are already binary.
multi_level <- vapply(trainRed, function(col) length(levels(col)) > 2, logical(1))
v_dummy <- unlist(as.list(colnames(trainRed)[multi_level]))
v_dummy

In [None]:
# Build 0/1 indicator columns for every variable in v_dummy and attach them to
# the identifier and target columns.
#
# The indicators are computed row by row from trainRed, so they stay aligned
# with MachineIdentifier and HasDetections. The earlier spread()-based version
# returned its rows ordered by identifier, which did not match the row order
# of the frame it was column-bound to.
dummy <- data.frame(trainRed[c("MachineIdentifier", "HasDetections")])

# One data frame of indicator columns named "<variable>_<level>".
make_indicators <- function(var) {
  values <- droplevels(as.factor(trainRed[[var]]))
  # The first level gets no column (it acts as the reference category); this
  # is the same set of columns the previous implementation kept.
  kept_levels <- levels(values)[-1]
  cols <- lapply(
    kept_levels,
    function(lv) as.integer(!is.na(values) & values == lv)
  )
  names(cols) <- paste(var, kept_levels, sep = "_")
  # optional = TRUE keeps the column names exactly as given
  as.data.frame(cols, optional = TRUE, stringsAsFactors = FALSE)
}

indicator_sets <- lapply(v_dummy, make_indicators)
indicator_sets <- Filter(function(piece) ncol(piece) > 0, indicator_sets)
# Bind once instead of growing `dummy` inside a loop
dummy <- do.call(cbind, c(list(dummy), indicator_sets))

In [None]:
# Append the columns that did not need dummy encoding: the two binary flags
# and the numeric volume capacity
other_cols <- c('Census_IsTouchEnabled','Census_IsSecureBootEnabled','Census_SystemVolumeTotalCapacity')
other <- trainRed[other_cols]
train_dummy <- cbind(dummy, other)

In [None]:
# Preview the dummy-encoded table
head(train_dummy)

In [None]:
# Store every indicator column as a factor; the identifier, the target and
# the numeric capacity column are left untouched
not_dummy <- c('MachineIdentifier', 'HasDetections','Census_SystemVolumeTotalCapacity')
dum <- which(!(names(train_dummy) %in% not_dummy))
train_dummy[, dum] <- lapply(train_dummy[, dum], as.factor)

In [None]:
# Positions of the indicator columns in which fewer than 1000 rows equal 1
drop <- unlist(lapply(dum, function(pos) {
  if (sum(train_dummy[, pos] == 1) < 1000) pos
}))


In [None]:
# Remove the rare indicator columns. Negating an empty index selects zero
# columns instead of all of them, so only subset when there is something to
# drop.
if (length(drop) > 0) {
  train_dummy <- train_dummy[, -drop, drop = FALSE]
}

### 2.3 Variable Selection

#### Split train data into train_fit, valid, test (60:20:20)

In [None]:
set.seed(123)

# First cut: 60% of the rows for fitting, the remaining 40% held back
train_idx <- caret::createDataPartition(
  y = train_dummy[["HasDetections"]], p = 0.6, list = FALSE
)
train_fit <- train_dummy[train_idx, ]    # Train 60%
valid_test <- train_dummy[-train_idx, ]  # Valid + Test 40%

# Second cut: halve the held-back rows into validation and test
valid_idx <- caret::createDataPartition(
  y = valid_test[["HasDetections"]], p = 0.5, list = FALSE
)
valid <- valid_test[valid_idx, ]   # Valid 20%
test <- valid_test[-valid_idx, ]   # Test 20%

#### Check the target variable class distribution

In [None]:
# Training split: row count and share of each target class
ddply(
  .data = train_fit,
  .variables = "HasDetections",
  .fun = summarise,
  count = length(HasDetections),
  percentage = round(length(HasDetections) / nrow(train_fit), 2)
)

In [None]:
# Validation split: row count and share of each target class
ddply(
  .data = valid,
  .variables = "HasDetections",
  .fun = summarise,
  count = length(HasDetections),
  percentage = round(length(HasDetections) / nrow(valid), 2)
)

In [None]:
# Test split: row count and share of each target class
ddply(
  .data = test,
  .variables = "HasDetections",
  .fun = summarise,
  count = length(HasDetections),
  percentage = round(length(HasDetections) / nrow(test), 2)
)

In [None]:
# Make the column names syntactically safe in all three splits: every
# character that is neither alphanumeric nor a space becomes "_", then
# spaces are removed.
sanitize_names <- function(nms) {
  underscored <- str_replace_all(nms, "[^[:alnum:] ]", "_")
  gsub(' +', '', underscored)
}

colnames(train_fit) <- sanitize_names(colnames(train_fit))
colnames(valid) <- sanitize_names(colnames(valid))
colnames(test) <- sanitize_names(colnames(test))

In [None]:
# Correlations need numbers: turn every factor column of the training split
# back into the numeric value of its labels. The identifier and the already
# numeric capacity column are skipped.
no_convert <- c('MachineIdentifier','Census_SystemVolumeTotalCapacity')
to_convert <- setdiff(names(train_fit), no_convert)
train_fit[to_convert] <- lapply(
  train_fit[to_convert],
  function(f) as.numeric(levels(f)[f])
)

In [None]:
# Looking for correlated features.
# findCorrelation() works on a correlation matrix, not on the raw data, so the
# pairwise correlations are computed first. Columns 1 and 2 (identifier and
# target) are left out.
no_cor <- c(1, 2)
cor <- findCorrelation(stats::cor(train_fit[, -no_cor]), cutoff = 0.75, names = TRUE)

In [None]:
# Drop the features flagged as redundant by the correlation filter
keep <- colnames(train_fit)[!(colnames(train_fit) %in% cor)]
train_fit <- train_fit[, keep]

### FisherScore

In [None]:
#' Fisher score of each predictor with respect to a two-class target
#'
#' For every variable the score is
#' |mean(class 1) - mean(class 2)| / sqrt(var(class 1) + var(class 2)),
#' where the two classes are the first two distinct values of the target.
#'
#' Ref: Verbeke, W., Dejaeger, K., Martens, D., Hur, J., & Baesens, B. (2012).
#' New insights into churn prediction in the telecommunication sector: A
#' profit driven data mining approach. European Journal of Operational
#' Research, 218(1), 211-229.
#'
#' @param basetable Data frame holding the target and the predictors.
#' @param depvar Name of the target column.
#' @param IV_list Character vector with the names of the numeric predictors.
#' @return Data frame with columns `IV` (variable name) and `fisher_score`,
#'   one row per element of `IV_list`, in the same order.
FisherScore <- function(basetable, depvar, IV_list) {
  target <- basetable[, depvar]

  # Get the unique values of the dependent variable
  DV <- unique(target)
  if (length(DV) != 2) {
    # Only the first two values are compared; make that visible to the caller
    warning(
      "FisherScore expects exactly two classes in '", depvar,
      "'; found ", length(DV),
      call. = FALSE
    )
  }

  # The class membership does not depend on the predictor: compute it once
  first_rows <- which(target == DV[1])
  second_rows <- which(target == DV[2])

  IV_FisherScore <- vapply(
    IV_list,
    function(v) {
      first <- basetable[first_rows, v]
      second <- basetable[second_rows, v]
      abs(mean(first) - mean(second)) / sqrt(var(first) + var(second))
    },
    numeric(1),
    USE.NAMES = FALSE  # keep default row names in the result
  )

  data.frame(IV = IV_list, fisher_score = IV_FisherScore)
}

#' Select the predictors with the highest Fisher score
#'
#' Assumption: all variables of the input dataset are converted into numeric
#' type.
#'
#' @param basetable Data frame holding the target and the predictors.
#' @param depvar Name of the target column.
#' @param IV_list Character vector with the names of the candidate predictors.
#' @param num_select Number of predictors to keep; values larger than the
#'   number of candidates return all candidates.
#' @return Character vector of predictor names ordered by decreasing score.
varSelectionFisher <- function(basetable, depvar, IV_list, num_select) {
  fs <- FisherScore(basetable, depvar, IV_list)  # Fisher score for all IVs

  # Cap by the number of scored variables (not by the number of columns of the
  # table) so the selection never runs past the end and returns NA entries
  num_select <- max(0, min(num_select, nrow(fs)))

  ranked <- fs[order(fs$fisher_score, decreasing = TRUE), ]
  as.vector(ranked[seq_len(num_select), "IV"])
}

In [None]:
# Score every predictor of the training split
dv_list <- c('HasDetections')  # target
# Predictors: all columns except the target and the identifier
iv_list <- setdiff(names(train_fit), c(dv_list, 'MachineIdentifier'))

fs <- FisherScore(basetable = train_fit, depvar = dv_list, IV_list = iv_list)
# Highest score first
fs <- fs[order(-fs$fisher_score), ]
head(fs)

In [None]:
# Names of the 50 predictors with the highest Fisher score
best_fs_var <- varSelectionFisher(
  basetable = train_fit,
  depvar = dv_list,
  IV_list = iv_list,
  num_select = 50
)
head(best_fs_var, 10)

In [None]:
# Restrict each split to identifier + selected predictors + target, keeping
# the predictors in the column order of the respective split
# Train
var_select <- names(train_fit)[is.element(names(train_fit), best_fs_var)]
train_sel <- train_fit[, c('MachineIdentifier', var_select, 'HasDetections')]
# Valid
var_select <- names(valid)[is.element(names(valid), best_fs_var)]
valid_sel <- valid[, c('MachineIdentifier', var_select, 'HasDetections')]
# Test
var_select <- names(test)[is.element(names(test), best_fs_var)]
test_sel <- test[, c('MachineIdentifier', var_select, 'HasDetections')]

In [None]:
# The learners expect a factor target
train_sel[["HasDetections"]] <- as.factor(train_sel[["HasDetections"]])

# Indicator columns go back to factors; identifier, target and the numeric
# capacity column are excluded
not_dummy <- c('MachineIdentifier', 'HasDetections','Census_SystemVolumeTotalCapacity')
dum <- which(!(names(train_sel) %in% not_dummy))
train_sel[, dum] <- lapply(train_sel[, dum], as.factor)

### 3. Methodology

### 3.1 Logistic Regression

In [None]:
# Set up cross-validation: 5 folds, predictions kept for both the training and
# the test part of each fold
rdesc <- makeResampleDesc("CV", iters = 5, predict = "both")

# Define the model
lg_lrn <- makeLearner("classif.logreg", predict.type = "prob")

# Define the task (first column, the identifier, is not a feature)
lg_task <- makeClassifTask(id = "maleware_train", data = train_sel[, -1], target = "HasDetections")

# Set hyper parameter tuning
# NOTE(review): "model" looks like a storage flag rather than a setting that
# changes the fit — confirm that tuning it is intended.
tune_params <- makeParamSet(
  makeLogicalLearnerParam("model", default = TRUE, tunable = TRUE)
)

ctrl <- makeTuneControlGrid(resolution = 10L)

# detectCores() may return NA; fall back to a single worker in that case
n_cpus <- detectCores()
if (is.na(n_cpus)) {
  n_cpus <- 1L
}

# The parallelMap functions are namespace-qualified because the package is
# not attached by any library() call in this file.
parallelMap::parallelStartSocket(cpus = n_cpus)

# Always shut the workers down, even when tuning stops with an error
lgPars <- tryCatch(
  tuneParams(lg_lrn, task = lg_task,
             resampling = rdesc,
             par.set = tune_params,
             control = ctrl),
  finally = parallelMap::parallelStop()
)

In [None]:
# Apply the tuned settings to the learner
best_lg_params <- lgPars[["x"]]
tunedLG <- setHyperPars(lg_lrn, par.vals = best_lg_params)
# Refit on the full training task
lgModel <- mlr::train(tunedLG, lg_task)

In [None]:
# Score the validation split (identifier column removed) and report the AUC
valid_features <- valid_sel[, -1]
pred <- predict(lgModel, newdata = valid_features)
performance(pred, measures = mlr::auc)

In [None]:
# Score the test split (identifier column removed) and report the AUC
test_features <- test_sel[, -1]
pred <- predict(lgModel, newdata = test_features)
performance(pred, measures = mlr::auc)

### 3.2 Random Forest

In [None]:
# Set up cross-validation (5 folds)
rdesc <- makeResampleDesc("CV", iters = 5)

# Define the model
rf_lrn <- makeLearner("classif.randomForest", predict.type = "prob")

# Define the task (first column, the identifier, is not a feature)
rf_task <- makeClassifTask(id = "maleware_train", data = train_sel[, -1], target = "HasDetections")

# Set hyper parameter tuning: number of trees, variables tried per split and
# minimum node size
tune_params <- makeParamSet(
  makeIntegerParam("ntree", lower = 50, upper = 150),
  makeIntegerParam("mtry", lower = 1, upper = 5),
  makeIntegerParam("nodesize", lower = 10, upper = 50)
)
# Random search with 20 candidate settings
ctrl <- makeTuneControlRandom(maxit = 20L)

# detectCores() may return NA; fall back to a single worker in that case
n_cpus <- detectCores()
if (is.na(n_cpus)) {
  n_cpus <- 1L
}

# The parallelMap functions are namespace-qualified because the package is
# not attached by any library() call in this file.
parallelMap::parallelStartSocket(cpus = n_cpus)

# Always shut the workers down, even when tuning stops with an error
rfPars <- tryCatch(
  tuneParams(rf_lrn, task = rf_task,
             resampling = rdesc,
             par.set = tune_params,
             control = ctrl),
  finally = parallelMap::parallelStop()
)

In [None]:
# Apply the tuned settings to the learner
best_rf_params <- rfPars[["x"]]
tunedRF <- setHyperPars(rf_lrn, par.vals = best_rf_params)
# Refit on the full training task
rfModel <- mlr::train(tunedRF, rf_task)

In [None]:
# Score the validation split (identifier column removed) and report the AUC
valid_features <- valid_sel[, -1]
pred <- predict(rfModel, newdata = valid_features)
performance(pred, measures = mlr::auc)

In [None]:
# Score the test split (identifier column removed) and report the AUC
test_features <- test_sel[, -1]
pred <- predict(rfModel, newdata = test_features)
performance(pred, measures = mlr::auc)

### 3.3 XGBoost (Extreme Gradient Boosting)

In [None]:
# Define the model.
# The indicator columns of train_sel are stored as factors, while the xgboost
# learner takes numeric features only; the dummy-feature wrapper re-encodes
# factor inputs as numeric columns both when training and when predicting.
# NOTE(review): based on mlr's documented properties of classif.xgboost —
# confirm against the installed mlr version.
gb_lrn <- makeDummyFeaturesWrapper(
  makeLearner("classif.xgboost", predict.type = "prob"),
  method = "reference"
)

# Define the task (first column, the identifier, is not a feature)
gb_task <- makeClassifTask(id = "maleware_train", data = train_sel[, -1], target = "HasDetections")

# Set up cross-validation: 10 folds, fixed once so every candidate setting is
# evaluated on the same splits
rdesc <- makeResampleDesc("CV", iters = 10, predict = "both")
cv_inst <- makeResampleInstance(rdesc, task = gb_task)

# Set hyper parameter tuning; lambda is searched on a log10 scale
tune_params <- makeParamSet(
  makeIntegerParam("nrounds", lower = 100, upper = 1000),
  makeIntegerParam("max_depth", lower = 1, upper = 15),
  makeNumericParam("eta", lower = .001, upper = .5),
  makeNumericParam("lambda", lower = -1, upper = 3, trafo = function(x) 10^x)
)

# Set tune control: model-based optimisation, 50 iterations
mbo.ctrl <- makeMBOControl()
mbo.ctrl <- setMBOControlTermination(mbo.ctrl, iters = 50)
# makeTuneControlMBO is exported, so `::` is sufficient
ctrl <- mlr::makeTuneControlMBO(mbo.control = mbo.ctrl)

# detectCores() may return NA; fall back to a single worker in that case
n_cpus <- detectCores()
if (is.na(n_cpus)) {
  n_cpus <- 1L
}

# The parallelMap functions are namespace-qualified because the package is
# not attached by any library() call in this file.
parallelMap::parallelStartSocket(cpus = n_cpus)

# Always shut the workers down, even when tuning stops with an error
xgbstPars <- tryCatch(
  tuneParams(gb_lrn, task = gb_task,
             resampling = cv_inst,
             par.set = tune_params,
             control = ctrl),
  finally = parallelMap::parallelStop()
)

In [None]:
# Apply the tuned settings to the learner
best_xgb_params <- xgbstPars[["x"]]
tunedXGB <- setHyperPars(gb_lrn, par.vals = best_xgb_params)
# Refit on the full training task
xgbModel <- mlr::train(tunedXGB, gb_task)

In [None]:
# Score the validation split (identifier column removed) and report the AUC
valid_features <- valid_sel[, -1]
pred <- predict(xgbModel, newdata = valid_features)
performance(pred, measures = mlr::auc)

In [None]:
# Score the test split (identifier column removed) and report the AUC
test_features <- test_sel[, -1]
pred <- predict(xgbModel, newdata = test_features)
performance(pred, measures = mlr::auc)