# Clinton-Obama Case Study
<div class="alert alert-block alert-warning">
The following code uses the `Obama.csv` voting data until February 19th, 2008.

The following few lines re-use some lines from the starter code

In [None]:
.libPaths("/usr/local/lib/R/site-library")  # only needed for our Jupyter server

library(ellipse)
library(RColorBrewer)
library(PerformanceAnalytics)
library(RCurl) 
library(rpart)
library(rpart.plot)
library(cvTools)
library(glmnet)
library(forecast) 
library(Metrics)
library(corrplot)

In [None]:
# Load the election data; Obama.csv is expected in the current working directory.
elect.df <- read.csv(file = "Obama.csv")

In [None]:
# Per-column summary of the freshly loaded data.
summary(object = elect.df)

## Inspecting the data
<div class="alert alert-block alert-warning">



# Cleaning our data
## Impute missing ethnic data as zero values

In [None]:
# ImputeData: return a copy of `vec` in which every NA entry is replaced by
# `replaceValue`; all other entries are kept as they are.
# It is called on individual data columns to impute missing values.
#
# vec          - data vector that may contain NAs
# replaceValue - value substituted for each NA
ImputeData <- function(vec, replaceValue) {
  missing_mask <- is.na(vec)
  ifelse(missing_mask, replaceValue, vec)
}

In [None]:
# Attributes whose missing values are treated as zero.
zero_fill_columns <- c("Black", "Asian", "AmericanIndian", "FarmArea")

# Run ImputeData over each of those columns with a replacement value of 0.
elect.df[zero_fill_columns] <- lapply(
  elect.df[zero_fill_columns],
  ImputeData,
  replaceValue = 0
)

## Impute other missing data (columns 10 to 41) by their mean values

<div class="alert alert-block alert-warning">
The following code is used to replace missing values with the mean values for all the remaining attribute columns from column 10 onwards. (The attributes before this are not appropriate to be treated in this way.)

In [None]:
# Mean of each attribute column 10 through 41, ignoring NA entries
# (na.rm = TRUE). The means are stored in the named vector data.mean, in the
# same order as the columns.
# vapply() is used instead of sapply() so the result is always a numeric
# vector with exactly one value per column.
data.mean <- vapply(elect.df[, 10:41], mean, numeric(1), na.rm = TRUE)

In [None]:
# Replace the remaining NAs in columns 10 to 41 with that column's mean.
# data.mean holds one mean per column, in column order, so position k in
# data.mean belongs to the k-th column of the range.
mean_fill_columns <- 10:41
for (k in seq_along(mean_fill_columns)) {
  column_index <- mean_fill_columns[k]
  elect.df[, column_index] <- ImputeData(elect.df[, column_index], data.mean[k])
}

## Deal with potentially misleading White and Hispanic ethnic data
<div class="alert alert-block alert-warning">


In [None]:
# Add an EthnicTotal column: the row-wise sum of the ethnic grouping
# percentages (White, Black, Asian, AmericanIndian, Hawaiian and Hispanic),
# which sit in columns 14 to 19. NA entries are skipped in the sum.
ethnic_columns <- 14:19
elect.df$EthnicTotal <- rowSums(elect.df[, ethnic_columns], na.rm = TRUE)

In [None]:
# Move half of the Hispanic percentage out of both White and Hispanic.
# NOTE(review): EthnicTotal is not consulted here, so the adjustment is applied
# to every row - confirm this is intended.
half_hispanic <- elect.df$Hispanic / 2
elect.df$White <- elect.df$White - half_hispanic
elect.df$Hispanic <- elect.df$Hispanic - half_hispanic

# Prepare training and test datasets

<div class="alert alert-block alert-warning">


In [None]:
# Convert ElectionDate to the "Date" type (month/day/year format) so that the
# rows can be compared against a cut-off date.
elect.df$ElectionDate <- as.Date(elect.df$ElectionDate, format = "%m/%d/%Y")

# Rows dated before 19 February 2008 form the training set; rows dated on or
# after it form the test set.
split_date <- as.Date("2/19/2008", format = "%m/%d/%Y")
is_before_split <- elect.df$ElectionDate < split_date
elect.df.train <- elect.df[is_before_split, ]
elect.df.test <- elect.df[!is_before_split, ]

In [None]:
# Optional export: write the training and test sets out as CSV files in the
# current working directory.
write.csv(x = elect.df.train, file = "electionDataTrain.csv")
write.csv(x = elect.df.test, file = "electionDataTest.csv")

# Create some possible independent variables 

<div class="alert alert-block alert-warning">
These variables directly become a part of our data set `elect.df.train`.   

THIS REPLICATES THE ObamaMarginPercent CALCULATED FIELD USED IN TABLEAU

In [None]:
# Vote margin: Obama's votes minus Clinton's votes.
elect.df.train$Obama_margin <- with(elect.df.train, Obama - Clinton)

In [None]:
# Margin expressed as a share of the total vote.
elect.df.train$Obama_margin_percent <- with(
  elect.df.train,
  Obama_margin / TotalVote
)

In [None]:
# Win indicator: 1 where Obama's margin is positive, 0 otherwise.
elect.df.train$Obama_wins <- with(
  elect.df.train,
  ifelse(Obama_margin > 0, 1, 0)
)

## R-based analysis and visualisations  

<div class="alert alert-block alert-warning">
Based on findings of the prediction models

In [None]:
# Keep the rows (with all their columns) where Obama won, i.e. Obama_wins == 1.
# The previous form indexed the data frame with the 0/1 vector and no comma,
# which selects columns by position instead of filtering rows.
# which() also drops rows whose Obama_wins is NA.
obama_win_rows <- which(elect.df.train$Obama_wins == 1)
filtered_Obama_wins <- elect.df.train[obama_win_rows, ]
summary(filtered_Obama_wins)

In [None]:
# Training rows in the South region, and the distribution of their margin.
filtered_south <- elect.df.train[elect.df.train[["Region"]] == "South", ]
summary(filtered_south[["Obama_margin_percent"]])

In [None]:
# Training rows in the West region, and the distribution of their margin.
filtered_west <- elect.df.train[elect.df.train[["Region"]] == "West", ]
summary(filtered_west[["Obama_margin_percent"]])

In [None]:
# Training rows in the Northeast region, and the distribution of their margin.
filtered_n_east <- elect.df.train[elect.df.train[["Region"]] == "Northeast", ]
summary(filtered_n_east[["Obama_margin_percent"]])

In [None]:
# Training rows in the Midwest region, and the distribution of their margin.
filtered_midwest <- elect.df.train[elect.df.train[["Region"]] == "Midwest", ]
summary(filtered_midwest[["Obama_margin_percent"]])

In [None]:
# Training rows with State "MO", and the distribution of their margin.
filtered_Missouri <- elect.df.train[elect.df.train[["State"]] == "MO", ]
summary(filtered_Missouri[["Obama_margin_percent"]])

In [None]:
# Training rows with State "IL", and the distribution of their margin.
filtered_Illinois <- elect.df.train[elect.df.train[["State"]] == "IL", ]
summary(filtered_Illinois[["Obama_margin_percent"]])

In [None]:
# Training rows with State "MN", and the distribution of their margin.
filtered_Minnesota <- elect.df.train[elect.df.train[["State"]] == "MN", ]
summary(filtered_Minnesota[["Obama_margin_percent"]])

In [None]:
# Training rows with State "IA", and the distribution of their margin.
filtered_Iowa <- elect.df.train[elect.df.train[["State"]] == "IA", ]
summary(filtered_Iowa[["Obama_margin_percent"]])

In [None]:
# Training rows with State "NE", and the distribution of their margin.
filtered_Nebraska <- elect.df.train[elect.df.train[["State"]] == "NE", ]
summary(filtered_Nebraska[["Obama_margin_percent"]])

In [None]:
# Ggplot2 library
library(ggplot2)

# Graph: boxplot of Obama_margin_percent per Region with the individual rows
# overlaid as jittered points, filled by Region.
# qplot() is deprecated in current ggplot2, so the same plot is built with
# ggplot() and explicit geoms.
ggplot(
  elect.df.train,
  aes(x = Region, y = Obama_margin_percent, fill = Region)
) +
  geom_boxplot() +
  geom_jitter()

In [None]:
# Boxplot of Obama_margin_percent per State for the Midwest rows, with the
# individual rows overlaid as jittered points, filled by State.
# Built with ggplot() because qplot() is deprecated in current ggplot2.
ggplot(
  filtered_midwest,
  aes(x = State, y = Obama_margin_percent, fill = State)
) +
  geom_boxplot() +
  geom_jitter()

## A best practice in supervised learning is to further split up the training set into a smaller training set and a validation set. 

<div class="alert alert-block alert-warning">
You can compare the performance of candidate models (each trained on the smaller training set) on the validation set. The following code randomly splits your training set into a smaller training set (75% of the training data) and a validation set (25% of the training data).

In [None]:
# Size of the training set and of the smaller training set (75% of it,
# rounded).
nTrain <- nrow(elect.df.train)
nSmallTrain <- round(0.75 * nTrain)

# Fix the seed so that the random split is reproducible, then draw the row
# indices of the smaller training set without replacement.
set.seed(201)
rowIndicesSmallerTrain <- sample(
  seq_len(nTrain),
  size = nSmallTrain,
  replace = FALSE
)

# The sampled rows form the smaller training set; every row that was not
# sampled goes to the validation set.
in_smaller_train <- seq_len(nTrain) %in% rowIndicesSmallerTrain
elect.df.smaller.train <- elect.df.train[rowIndicesSmallerTrain, ]
elect.df.validation <- elect.df.train[!in_smaller_train, ]

# Linear Regression prediction model

<div class="alert alert-block alert-warning">
Here we use the `lm` function to build several **linear regression** models predicting `Obama_margin_percent`, ranging from a model that uses all the attributes to a simple one that uses 7 arbitrarily selected attributes.

In [None]:
# Linear model using all the attributes plus Region.
# It is fitted on the smaller training set: the model is scored on
# elect.df.validation, which is drawn from elect.df.train, so fitting on
# elect.df.train would let the model see the validation rows.
lmAll <- lm(
  Obama_margin_percent ~ Region + MalesPer100Females + AgeBelow35 +
    Age35to65 + Age65andAbove + White + Black + Asian + AmericanIndian +
    Hawaiian + Hispanic + HighSchool + Bachelors + Poverty + IncomeAbove75K +
    MedianIncome + AverageIncome + UnemployRate + ManfEmploy +
    SpeakingNonEnglish + Medicare + MedicareRate + SocialSecurity +
    SocialSecurityRate + RetiredWorkers + Disabilities + DisabilitiesRate +
    Homeowner + SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)


In [None]:
# Heuristically chosen subset of the attributes.
# Fitted on the smaller training set so that the rows of elect.df.validation
# (a part of elect.df.train) stay unseen until the model is scored on them.
lm2 <- lm(
  Obama_margin_percent ~ Region + Black + HighSchool + Bachelors + Poverty +
    IncomeAbove75K + MedianIncome + UnemployRate + MedicareRate + Hawaiian +
    SocialSecurityRate + DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)


In [None]:
# Heuristically chosen subset of the attributes; same terms as lm2 plus
# AgeBelow35.
# Fitted on the smaller training set so that the rows of elect.df.validation
# (a part of elect.df.train) stay unseen until the model is scored on them.
lm3 <- lm(
  Obama_margin_percent ~ Region + Black + HighSchool + Bachelors + Poverty +
    IncomeAbove75K + MedianIncome + UnemployRate + MedicareRate + Hawaiian +
    AgeBelow35 + SocialSecurityRate + DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)


In [None]:
# Simple model on 7 arbitrarily picked attributes, fitted on the smaller
# training set.
lm7 <- lm(
  formula = Obama_margin_percent ~ Region + Black + HighSchool + Poverty +
    PopDensity + SpeakingNonEnglish + LandArea,
  data = elect.df.smaller.train
)


In [None]:
# Final models reached by trial and error: lm_best, and lm_best_withH which
# has the same terms plus Homeowner.
# Both are fitted on the smaller training set so that the rows of
# elect.df.validation (a part of elect.df.train) stay unseen until the models
# are scored on them.
lm_best <- lm(
  Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + Poverty +
    IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy + MedicareRate +
    Hispanic + SocialSecurity + SocialSecurityRate + RetiredWorkers +
    DisabilitiesRate + SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)
lm_best_withH <- lm(
  Obama_margin_percent ~ Region + Age65andAbove + Homeowner +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + Poverty +
    IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy + MedicareRate +
    Hispanic + SocialSecurity + SocialSecurityRate + RetiredWorkers +
    DisabilitiesRate + SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)
# Here I gradually took off some attributes according to: whether they were
# important in the decision tree, and whether they were kept in the backward
# or forward stepwise model selection. Then I did trial and error.

#Black + HighSchool + Region + Poverty + RetiredWorkers + Bachelors + IncomeAbove75K + DisabilitiesRate + 
   # Homeowner + Disabilities + AgeBelow35 + SameHouse1995and2000 + 
    #Asian + AmericanIndian + Hawaiian + FarmArea + LandArea + 
    #PopDensity + SocialSecurity

#Region + Age65andAbove + 
 #   White + Black + AmericanIndian + Hispanic + Bachelors + Poverty + 
  #  IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy + 
   # MedicareRate + SocialSecurity + SocialSecurityRate + Disabilities + 
    #DisabilitiesRate + Homeowner + SameHouse1995and2000 + PopDensity + 
    #LandArea + FarmArea


# Predict and Test Accuracy: for our  linear models 

<div class="alert alert-block alert-warning">
First define the following custom-defined **`accuracy`** function, which simply calls the **`mae`** and **`rmse`** functions from the **Metrics** package.

In [None]:
# accuracy: print the mean absolute error (MAE) and the root mean squared
# error (RMSE) of `prediction` against `actual` on a single line.
# mae() and rmse() are the functions from the Metrics package.
#
# prediction - predicted values
# actual     - observed values
accuracy <- function(prediction, actual) {
  mae_value <- mae(actual, prediction)
  rmse_value <- rmse(actual, prediction)
  cat('MAE =', mae_value, ' RMSE =', rmse_value, "\n")
}

In [None]:
# Predict Obama_margin_percent for the validation rows with each linear model.
lm2.pred <- predict(object = lm2, newdata = elect.df.validation)
lm3.pred <- predict(object = lm3, newdata = elect.df.validation)
lm7.pred <- predict(object = lm7, newdata = elect.df.validation)
lmAll.pred <- predict(object = lmAll, newdata = elect.df.validation)
lm_best.pred <- predict(object = lm_best, newdata = elect.df.validation)
lm_best_withH.pred <- predict(
  object = lm_best_withH,
  newdata = elect.df.validation
)

In [None]:
# Print MAE and RMSE on the validation set for each linear model, one labelled
# line per model.
validation_actual <- elect.df.validation$Obama_margin_percent
report_labels <- c(
  'lm2:   ',
  'lm3:   ',
  'lm7:   ',
  'lmALL:   ',
  'lm_best:   ',
  'lm_best with Homeowner:   '
)
report_predictions <- list(
  lm2.pred,
  lm3.pred,
  lm7.pred,
  lmAll.pred,
  lm_best.pred,
  lm_best_withH.pred
)
for (k in seq_along(report_labels)) {
  cat(report_labels[k])
  accuracy(report_predictions[[k]], validation_actual)
}

## Methods for selecting subsets of predictors: Stepwise Model Selection

<div class="alert alert-block alert-warning">
The **`step`** function in R automatically removes insignificant variables from a regression.   

The argument **`direction = "backward"`** tells the function to remove one variable at a time to find the best-fitting model based on the **AIC** score.   

For more information on this procedure, see **Introduction to Statistical Learning (ISL) pp. 203-210** (http://www-bcf.usc.edu/~gareth/ISL/).

In [None]:
# Backward stepwise selection starting from the full model lmAll.
lm.step <- step(object = lmAll, direction = "backward")

In [None]:
# Inspect the model kept by backward selection: which variables did it drop?
summary(object = lm.step)

In [None]:
# Forward stepwise selection: start from the intercept-only model and let
# step() add terms taken from the scope formula.
lm.min <- lm(
  formula = Obama_margin_percent ~ 1,
  data = elect.df.smaller.train
)

lm.step.forward <- step(
  lm.min,
  direction = "forward",
  scope = Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + HighSchool +
    Poverty + IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy +
    MedicareRate + Hispanic + SocialSecurity + SocialSecurityRate +
    RetiredWorkers + DisabilitiesRate + SameHouse1995and2000 + PopDensity +
    LandArea + FarmArea
)

In [None]:
# Inspect the model built by forward selection: which variables did it add?
summary(object = lm.step.forward)

### Using these linear regression models to make forecasts in the validation set.

In [None]:
# Validation-set predictions for the full, hand-tuned and stepwise models.
lmAll.pred <- predict(object = lmAll, newdata = elect.df.validation)
lm_best.pred <- predict(object = lm_best, newdata = elect.df.validation)
lm.step.pred <- predict(object = lm.step, newdata = elect.df.validation)
lm.step.forward.pred <- predict(
  object = lm.step.forward,
  newdata = elect.df.validation
)
lm_best_withH.pred <- predict(
  object = lm_best_withH,
  newdata = elect.df.validation
)

In [None]:
# Print MAE and RMSE on the validation set for the stepwise models next to
# the full and hand-tuned models, one labelled line per model.
validation_actual <- elect.df.validation$Obama_margin_percent
report_labels <- c(
  'lm step backwards: ',
  'lm step forwards:  ',
  'lmAll:             ',
  'lm best:             ',
  'lm_best with Homeowner:   '
)
report_predictions <- list(
  lm.step.pred,
  lm.step.forward.pred,
  lmAll.pred,
  lm_best.pred,
  lm_best_withH.pred
)
for (k in seq_along(report_labels)) {
  cat(report_labels[k])
  accuracy(report_predictions[[k]], validation_actual)
}

In [None]:
# Distribution of the target in the validation set, for judging the size of
# the errors printed above.
summary(elect.df.validation[["Obama_margin_percent"]])

---

# Fit some regression trees using the rpart function

<div class="alert alert-block alert-warning">
Read ISL pp. 303-311.   

A **regression tree** is a way to fit a non-linear model to your data. It recursively subdivides your data into rectangular partitions and takes the average of the y-values in each partition as its prediction.   

Pay particular attention to Figure 8.3 p308 in *Introduction to Statistical Learning* (ISL) (http://www-bcf.usc.edu/~gareth/ISL/).

In [None]:
# Regression tree with rpart's default control settings, fitted on the
# smaller training set.
rt <- rpart(
  formula = Obama_margin_percent ~ Black + HighSchool + Region +
    Poverty + RetiredWorkers + Bachelors + IncomeAbove75K + DisabilitiesRate +
    Homeowner + Disabilities + AgeBelow35 + SameHouse1995and2000 +
    Asian + AmericanIndian + Hawaiian + FarmArea + LandArea +
    PopDensity + SocialSecurity,
  data = elect.df.smaller.train
)

# To plot the tree, use prp from the rpart.plot package:
#prp(rt, type = 1, extra = 1)

In [None]:
# Same formula as rt, but with the complexity parameter cp set to 0.005.
rt.tuned <- rpart(
  formula = Obama_margin_percent ~ Black + HighSchool + Region +
    Poverty + RetiredWorkers + Bachelors + IncomeAbove75K + DisabilitiesRate +
    Homeowner + Disabilities + AgeBelow35 + SameHouse1995and2000 +
    Asian + AmericanIndian + Hawaiian + FarmArea + LandArea +
    PopDensity + SocialSecurity,
  data = elect.df.smaller.train,
  control = rpart.control(cp = 0.005)
)
#prp(rt.tuned, type = 1, extra = 1)



In [None]:
# Regression tree on the attributes of lm_best (plus HighSchool), with rpart's
# default control settings. The stray trailing comma after the data argument,
# which passed an empty extra argument to rpart(), has been removed.
rt2 <- rpart(
  Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + HighSchool +
    Poverty + IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy +
    MedicareRate + Hispanic + SocialSecurity + SocialSecurityRate +
    RetiredWorkers + DisabilitiesRate + SameHouse1995and2000 + PopDensity +
    LandArea + FarmArea,
  data = elect.df.smaller.train
)
#prp(rt2, type = 1, extra = 1)

In [None]:
# Tree on the attributes of lm_best (plus HighSchool), with cp = 0.005.
rt.tuned2 <- rpart(
  formula = Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + HighSchool +
    Poverty + IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy +
    MedicareRate + Hispanic + SocialSecurity + SocialSecurityRate +
    RetiredWorkers + DisabilitiesRate + SameHouse1995and2000 + PopDensity +
    LandArea + FarmArea,
  data = elect.df.smaller.train,
  control = rpart.control(cp = 0.005)
)

In [None]:
# Variation of cp: same formula as the other tuned trees, with cp = 0.002.
rt.tuned3 <- rpart(
  formula = Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + HighSchool +
    Poverty + IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy +
    MedicareRate + Hispanic + SocialSecurity + SocialSecurityRate +
    RetiredWorkers + DisabilitiesRate + SameHouse1995and2000 + PopDensity +
    LandArea + FarmArea,
  data = elect.df.smaller.train,
  control = rpart.control(cp = 0.002)
)


In [None]:
# Variation of cp: same formula as the other tuned trees, with cp = 0.001.
rt.tuned4 <- rpart(
  formula = Obama_margin_percent ~ Region + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Bachelors + HighSchool +
    Poverty + IncomeAbove75K + MedianIncome + UnemployRate + ManfEmploy +
    MedicareRate + Hispanic + SocialSecurity + SocialSecurityRate +
    RetiredWorkers + DisabilitiesRate + SameHouse1995and2000 + PopDensity +
    LandArea + FarmArea,
  data = elect.df.smaller.train,
  control = rpart.control(cp = 0.001)
)


In [None]:
# Validation-set predictions for every regression tree.
rt.pred <- predict(object = rt, newdata = elect.df.validation)
rt2.pred <- predict(object = rt2, newdata = elect.df.validation)
rt.tuned.pred <- predict(object = rt.tuned, newdata = elect.df.validation)
rt.tuned2.pred <- predict(object = rt.tuned2, newdata = elect.df.validation)
rt.tuned3.pred <- predict(object = rt.tuned3, newdata = elect.df.validation)
rt.tuned4.pred <- predict(object = rt.tuned4, newdata = elect.df.validation)

In [None]:
# Print MAE and RMSE on the validation set for the regression trees next to
# three of the linear models, one labelled line per model.
validation_actual <- elect.df.validation$Obama_margin_percent
report_labels <- c(
  'lmAll:             ',
  'lm step backwards: ',
  'lm best: ',
  'rpart - untuned:   ',
  'rpart2 - untuned:   ',
  'rpart - tuned:     ',
  'rpart - tuned 2:     ',
  'rpart - tuned 3:     ',
  'rpart - tuned 4:     '
)
report_predictions <- list(
  lmAll.pred,
  lm.step.pred,
  lm_best.pred,
  rt.pred,
  rt2.pred,
  rt.tuned.pred,
  rt.tuned2.pred,
  rt.tuned3.pred,
  rt.tuned4.pred
)
for (k in seq_along(report_labels)) {
  cat(report_labels[k])
  accuracy(report_predictions[[k]], validation_actual)
}

In [None]:
# Here is the xval optimisation approach used in Data Analytics I.
# NOTE(review): this assigns to rt.tuned2 again, replacing the tree of that
# name fitted on the lm_best attributes - confirm the reuse is intended.
rt.tuned2 <- rpart(
  formula = Obama_margin_percent ~ Region + Black + HighSchool + Poverty +
    PopDensity + SpeakingNonEnglish + LandArea,
  data = elect.df.smaller.train,
  control = rpart.control(cp = 0.003)
)
# printcp(rt.tuned2)
plotcp(rt.tuned2, upper = "splits")

In [None]:
# Correlation between Medicare and Disabilities in the smaller training set.
with(elect.df.smaller.train, cor(Medicare, Disabilities))

In [None]:
# Correlation between Black and White in the smaller training set.
with(elect.df.smaller.train, cor(Black, White))

In [None]:
# Correlation between Age65andAbove and MedicareRate in the smaller training
# set.
with(elect.df.smaller.train, cor(Age65andAbove, MedicareRate))

In [None]:
# Correlation between Age65andAbove and SocialSecurityRate in the smaller
# training set.
with(elect.df.smaller.train, cor(Age65andAbove, SocialSecurityRate))

In [None]:
# Correlation between MedicareRate and SocialSecurityRate in the smaller
# training set.
with(elect.df.smaller.train, cor(MedicareRate, SocialSecurityRate))

In [None]:
# Correlation between IncomeAbove75K and MedianIncome in the smaller training
# set.
with(elect.df.smaller.train, cor(IncomeAbove75K, MedianIncome))

In [None]:
# Correlation between IncomeAbove75K and AverageIncome in the smaller training
# set.
with(elect.df.smaller.train, cor(IncomeAbove75K, AverageIncome))

In [None]:
# Correlation between Hispanic and SpeakingNonEnglish in the smaller training
# set.
with(elect.df.smaller.train, cor(Hispanic, SpeakingNonEnglish))

In [None]:
# Correlation between Pop and PopDensity in the smaller training set.
with(elect.df.smaller.train, cor(Pop, PopDensity))

In [None]:
# Correlation between HighSchool and Bachelors in the smaller training set.
# The column is spelled out in full: `$Bachelor` only resolved through `$`
# partial matching of the Bachelors column used everywhere else in this file.
cor(elect.df.smaller.train$HighSchool, elect.df.smaller.train$Bachelors)

In [None]:
# All pairwise correlations between the attribute columns 10 to 41 of the
# smaller training set, kept as a data frame and displayed.
# This is computed before the plot because corrplot() needs cor_matrix to
# exist already.
cor_matrix <- as.data.frame(cor(elect.df.smaller.train[, 10:41]))
cor_matrix

In [None]:
# Plot the correlations; corrplot() is given the values as a plain matrix.
corrplot(as.matrix(cor_matrix))