# Logistic Regression

### Problem Statement :- 
        - Whether a candidate can win an election or not

### Data Cleaning & Processing

Loading the dataset

In [21]:
election <- read.csv("~/desktop/Digi 360/Module 9/election_data.csv")

In [22]:
head(election)

Election.id,Result,Year,Amount.Spent,Popularity.Rank
,,,,
122.0,0.0,32.0,3.81,3.0
315.0,1.0,48.0,6.32,2.0
201.0,1.0,51.0,3.67,1.0
965.0,0.0,40.0,2.93,4.0
410.0,1.0,52.0,3.6,1.0


In [24]:
## Finding the null values
sum(is.na(election))

In [25]:
## Let's remove the first row since it has NA values

election <- election[-1,]

In [26]:
head(election)

Unnamed: 0,Election.id,Result,Year,Amount.Spent,Popularity.Rank
2,122,0,32,3.81,3
3,315,1,48,6.32,2
4,201,1,51,3.67,1
5,965,0,40,2.93,4
6,410,1,52,3.6,1
7,150,0,35,4.2,4


In [27]:
## Let's remove election ID since it is not useful for building the model

election <- election[,-1]
head(election)

Unnamed: 0,Result,Year,Amount.Spent,Popularity.Rank
2,0,32,3.81,3
3,1,48,6.32,2
4,1,51,3.67,1
5,0,40,2.93,4
6,1,52,3.6,1
7,0,35,4.2,4


In [28]:
#Let's see the data structure
str(election)

'data.frame':	10 obs. of  4 variables:
 $ Result         : int  0 1 1 0 1 0 1 1 1 0
 $ Year           : int  32 48 51 40 52 35 39 42 44 50
 $ Amount.Spent   : num  3.81 6.32 3.67 2.93 3.6 4.2 5.66 4.32 3.26 4.52
 $ Popularity.Rank: int  3 2 1 4 1 4 2 3 3 4


In [29]:
#Let's see the summary stats of the data frame
summary(election)

     Result         Year        Amount.Spent   Popularity.Rank
 Min.   :0.0   Min.   :32.00   Min.   :2.930   Min.   :1.00   
 1st Qu.:0.0   1st Qu.:39.25   1st Qu.:3.618   1st Qu.:2.00   
 Median :1.0   Median :43.00   Median :4.005   Median :3.00   
 Mean   :0.6   Mean   :43.30   Mean   :4.229   Mean   :2.70   
 3rd Qu.:1.0   3rd Qu.:49.50   3rd Qu.:4.470   3rd Qu.:3.75   
 Max.   :1.0   Max.   :52.00   Max.   :6.320   Max.   :4.00   

In [31]:
## Lets check if any outliers present by checking the percentiles till 99.5%
quants <- c(0,0.25,0.50,0.75,0.90,0.95,0.99,0.995,1)
apply( election[1:4] , 2 , quantile , probs = quants , na.rm = TRUE )

Unnamed: 0,Result,Year,Amount.Spent,Popularity.Rank
0%,0,32.0,2.93,1.0
25%,0,39.25,3.6175,2.0
50%,1,43.0,4.005,3.0
75%,1,49.5,4.47,3.75
90%,1,51.1,5.726,4.0
95%,1,51.55,6.023,4.0
99%,1,51.91,6.2606,4.0
99.5%,1,51.955,6.2903,4.0
100%,1,52.0,6.32,4.0


From above summary we cleary see that there are no outliers.

### Checking collinearity

In [32]:
round(cor(election),2)

Unnamed: 0,Result,Year,Amount.Spent,Popularity.Rank
Result,1.0,0.5,0.3,-0.78
Year,0.5,1.0,0.07,-0.53
Amount.Spent,0.3,0.07,1.0,-0.2
Popularity.Rank,-0.78,-0.53,-0.2,1.0


### Scaling - Normalizing

In [33]:
library('caret')

Loading required package: lattice
Loading required package: ggplot2


### Train and Test Split

In [34]:
library('caTools') 
set.seed(123) 
split = sample.split(election, SplitRatio = 0.7) 
train = subset(election, split == TRUE)
test = subset(election, split == FALSE) 

In [35]:
head(train)

Unnamed: 0,Result,Year,Amount.Spent,Popularity.Rank
2,0,32,3.81,3
4,1,51,3.67,1
6,1,52,3.6,1
8,1,39,5.66,2
10,1,44,3.26,3


In [36]:
library('caret')
preproc = preProcess(train, method = c("range"))

train_norm <- predict(preproc, train)
 
summary(train_norm)

     Result         Year       Amount.Spent    Popularity.Rank
 Min.   :0.0   Min.   :0.00   Min.   :0.0000   Min.   :0.0    
 1st Qu.:1.0   1st Qu.:0.35   1st Qu.:0.1417   1st Qu.:0.0    
 Median :1.0   Median :0.60   Median :0.1708   Median :0.5    
 Mean   :0.8   Mean   :0.58   Mean   :0.3083   Mean   :0.5    
 3rd Qu.:1.0   3rd Qu.:0.95   3rd Qu.:0.2292   3rd Qu.:1.0    
 Max.   :1.0   Max.   :1.00   Max.   :1.0000   Max.   :1.0    

### Building the First Training Model

In [45]:
logm1 <- glm(Result~.,data=train_norm, family='binomial')
summary(logm1)


Call:
glm(formula = Result ~ ., family = "binomial", data = train_norm)

Deviance Residuals: 
         2           4           6           8          10  
-6.786e-06   9.338e-06   1.534e-06   6.503e-06   6.807e-06  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)
(Intercept)        -78.60  375921.91       0        1
Year                99.46  347245.88       0        1
Amount.Spent        46.66  223171.57       0        1
Popularity.Rank     43.42  266862.76       0        1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5.0040e+00  on 4  degrees of freedom
Residual deviance: 2.2422e-10  on 1  degrees of freedom
AIC: 8

Number of Fisher Scoring iterations: 23


Here we can see p value is high for all three features. Let's see VIF for them.

### Checking VIF

In [39]:
library(fmsb)

Registered S3 methods overwritten by 'fmsb':
  method    from
  print.roc pROC
  plot.roc  pROC


In [40]:
vif_func<-function(in_frame,thresh=10,trace=T,...){
  
  require(fmsb)
  
  if(class(in_frame) != 'data.frame') in_frame<-data.frame(in_frame)
  
  #get initial vif value for all comparisons of variables
  vif_init<-NULL
  var_names <- names(in_frame)
  for(val in var_names){
    regressors <- var_names[-which(var_names == val)]
    form <- paste(regressors, collapse = '+')
    form_in <- formula(paste(val, '~', form))
    vif_init<-rbind(vif_init, c(val, VIF(lm(form_in, data = in_frame, ...))))
  }
  vif_max<-max(as.numeric(vif_init[,2]), na.rm = TRUE)
  
  if(vif_max < thresh){
    if(trace==T){ #print output of each iteration
      prmatrix(vif_init,collab=c('var','vif'),rowlab=rep('',nrow(vif_init)),quote=F)
      cat('\n')
      cat(paste('All variables have VIF < ', thresh,', max VIF ',round(vif_max,2), sep=''),'\n\n')
    }
    return(var_names)
  }
  else{
    
    in_dat<-in_frame
    
    #backwards selection of explanatory variables, stops when all VIF values are below 'thresh'
    while(vif_max >= thresh){
      
      vif_vals<-NULL
      var_names <- names(in_dat)
      
      for(val in var_names){
        regressors <- var_names[-which(var_names == val)]
        form <- paste(regressors, collapse = '+')
        form_in <- formula(paste(val, '~', form))
        vif_add<-VIF(lm(form_in, data = in_dat, ...))
        vif_vals<-rbind(vif_vals,c(val,vif_add))
      }
      max_row<-which(vif_vals[,2] == max(as.numeric(vif_vals[,2]), na.rm = TRUE))[1]
      
      vif_max<-as.numeric(vif_vals[max_row,2])
      
      if(vif_max<thresh) break
      
      if(trace==T){ #print output of each iteration
        prmatrix(vif_vals,collab=c('var','vif'),rowlab=rep('',nrow(vif_vals)),quote=F)
        cat('\n')
        cat('removed: ',vif_vals[max_row,1],vif_max,'\n\n')
        flush.console()
      }
      
      in_dat<-in_dat[,!names(in_dat) %in% vif_vals[max_row,1]]
      
    }
    
    return(names(in_dat))
    
  }
  
}

In [41]:
col<- vif_func(in_frame=train_norm,thresh=5,trace=T)

 var             vif             
 Result          297.105453124988
 Year            1076.1447109376 
 Amount.Spent    217.760507812496
 Popularity.Rank 310.259765625   

removed:  Year 1076.145 



### Conclusion

Here p vaue is insignificant for all three features and also VIF is too high among them. So, the model we build is not good. The reason may be the dataset that we are using is too small. If we have better dataset with more features, we may build good model.  