In [None]:
library(ellipse)
library(RColorBrewer)
library(PerformanceAnalytics)
library(RCurl)
library(rpart)
library(rpart.plot)
library(cvTools)
library(glmnet)
library(forecast) 
library(Metrics)

In [None]:
# Load the raw election data set from the working directory.
elect.df <- read.csv(file = "Obama.csv")

In [None]:
# Replace all NAs in a data vector (vec) with some predefined value (replaceValue).
#
# Args:
#   vec:          Vector that may contain NAs.
#   replaceValue: Value written into the NA positions. Normally length 1; a
#                 longer vector is recycled to the length of `vec` and matched
#                 position by position.
#
# Returns:
#   `vec` with its NAs filled in. Subset-assignment is used instead of ifelse()
#   because ifelse() builds its result from the logical test and therefore
#   discards the class of `vec` (e.g. a Date vector would come back as numbers).
ImputeData <- function(vec, replaceValue) {
  missing_pos <- is.na(vec)
  if (length(replaceValue) > 1L) {
    # Keep position-wise pairing between replaceValue and vec.
    replaceValue <- rep_len(replaceValue, length(vec))[missing_pos]
  }
  vec[missing_pos] <- replaceValue
  vec
}

# Attributes whose NAs are replaced with 0:
# ManfEmploy, Black, Asian, AmericanIndian, FarmArea
zero_fill_attributes <- c("ManfEmploy", "Black", "Asian", "AmericanIndian",
                          "FarmArea")
for (attribute in zero_fill_attributes) {
  elect.df[, attribute] <- ImputeData(elect.df[, attribute], 0)
}

# Column positions of the attributes that are mean-imputed below.
# NOTE(review): assumes columns 10 through 41 of Obama.csv are all numeric —
# confirm against the CSV layout.
mean_imputed_cols <- 10:41

# Means for all the numeric columns, ignoring NAs. vapply() (rather than
# sapply()) guarantees a numeric vector holding exactly one mean per column.
data.mean <- vapply(elect.df[, mean_imputed_cols], mean, numeric(1),
                    na.rm = TRUE)

# Replace remaining NAs with the appropriate attribute mean. Each mean is
# looked up by its position within mean_imputed_cols, so the two index ranges
# cannot drift apart through a hard-coded offset.
for (k in seq_along(mean_imputed_cols)) {
  i <- mean_imputed_cols[k]
  elect.df[, i] <- ImputeData(elect.df[, i], data.mean[[k]])
}


# Row-wise total of columns 14 through 19, ignoring NAs.
# NOTE(review): presumably these six columns are the ethnicity attributes
# (White ... Hispanic) — confirm against the CSV column order. The total is
# taken before the White/Hispanic adjustment below.
elect.df$EthnicTotal <- rowSums(elect.df[, 14:19], na.rm = TRUE)

# Subtract half of the Hispanic value from both White and Hispanic.
# The half is computed once, from the unadjusted Hispanic column.
hispanic_half <- elect.df$Hispanic / 2
elect.df$White <- elect.df$White - hispanic_half
elect.df$Hispanic <- elect.df$Hispanic - hispanic_half

# Convert ElectionDate column to the "Date" data type
elect.df$ElectionDate <- as.Date(elect.df$ElectionDate, format = "%m/%d/%Y")

# Create two separate data sets from the data in elect.df: rows with an
# ElectionDate before the cutoff go to the training set, rows on or after the
# cutoff go to the test set.
cutoff_date <- as.Date("2/19/2008", format = "%m/%d/%Y")
held_before_cutoff <- elect.df$ElectionDate < cutoff_date
held_from_cutoff <- elect.df$ElectionDate >= cutoff_date

elect.df.train <- elect.df[held_before_cutoff, ]
elect.df.test <- elect.df[held_from_cutoff, ]

# Write back into spreadsheets
write.csv(elect.df.train, file = "electionDataTrain.csv")
write.csv(elect.df.test, file = "electionDataTest.csv")


# Response variables derived on the training set.
# Margin: Obama value minus Clinton value.
obama_minus_clinton <- elect.df.train$Obama - elect.df.train$Clinton
elect.df.train$Obama_margin <- obama_minus_clinton

# Margin divided by TotalVote (a fraction; it is not multiplied by 100).
elect.df.train$Obama_margin_percent <-
  obama_minus_clinton / elect.df.train$TotalVote

# 1 where the margin is positive, 0 otherwise.
elect.df.train$Obama_wins <- ifelse(obama_minus_clinton > 0, 1, 0)


# Hold out a validation set: 75% of the training rows (rounded) are sampled
# into the smaller training set, the remaining rows become the validation set.
nTrain <- nrow(elect.df.train)
nSmallTrain <- round(nTrain * 0.75)

# Fixed seed so the random draw of row indices is reproducible.
set.seed(201)
rowIndicesSmallerTrain <- sample(
  seq_len(nTrain),
  size = nSmallTrain,
  replace = FALSE
)

# Sampled rows -> smaller training set; all other rows -> validation set.
elect.df.smaller.train <- elect.df.train[rowIndicesSmallerTrain, ]
elect.df.validation <- elect.df.train[-rowIndicesSmallerTrain, ]


# Simple linear regression of Obama_margin_percent on AgeBelow35 alone,
# fitted on the smaller training set.
lm_obamaVunder35 <- lm(
  Obama_margin_percent ~ AgeBelow35,
  data = elect.df.smaller.train
)

summary(lm_obamaVunder35)

In [None]:
### Correlation Matrix ###

# Libraries
library(ellipse)
library(RColorBrewer)

# Correlations among column 44 and columns 10 through 41 of the smaller
# training set.
# NOTE(review): column 44 is presumably Obama_margin_percent — confirm.
data <- cor(elect.df.smaller.train[, c(44, 10:41)])

# Panel of 100 colors interpolated from the 5-color "Spectral" Brewer palette
my_colors <- colorRampPalette(brewer.pal(5, "Spectral"))(100)

# Order the correlation matrix by the values in its first row
ord <- order(data[1, ])
data_ord <- data[ord, ord]

# Each correlation is scaled (x * 50 + 50) to pick its color from the panel
plotcorr(
  data_ord,
  col = my_colors[data_ord * 50 + 50],
  mar = c(0, 0, 0, 0),
  cex.lab = 0.75
)

In [None]:
### Linear Regression ###

# Full model: Region plus every demographic attribute as a predictor.
lmAll <- lm(
  Obama_margin_percent ~ Region + MalesPer100Females +
    AgeBelow35 + Age35to65 + Age65andAbove +
    White + Black + Asian + AmericanIndian + Hawaiian + Hispanic +
    HighSchool + Bachelors + Poverty + IncomeAbove75K +
    MedianIncome + AverageIncome + UnemployRate + ManfEmploy +
    SpeakingNonEnglish + Medicare + MedicareRate +
    SocialSecurity + SocialSecurityRate + RetiredWorkers +
    Disabilities + DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)
summary(lmAll)


# Reduced model: the full model without MalesPer100Females, White, Hispanic,
# IncomeAbove75K, AverageIncome, MedicareRate, SocialSecurityRate,
# Disabilities and Pop.
lm.tuned <- lm(
  Obama_margin_percent ~ Region +
    AgeBelow35 + Age35to65 + Age65andAbove +
    Black + Asian + Hawaiian + AmericanIndian +
    Bachelors + HighSchool + Poverty +
    MedianIncome + UnemployRate + ManfEmploy + Medicare +
    SocialSecurity + SpeakingNonEnglish + RetiredWorkers +
    DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)

summary(lm.tuned)


# Test accuracy

# Print the mean absolute error (MAE) and root mean squared error (RMSE) of
# `prediction` against `actual` on a single line.
#
# Args:
#   prediction: Predicted values.
#   actual:     Observed values, in the same order as `prediction`.
#
# mae() and rmse() are expected to be in scope (the file loads the Metrics
# package). Nothing useful is returned; the function is called for its output.
accuracy <- function(prediction, actual) {
  mean_abs_error <- mae(actual, prediction)
  root_mean_sq_error <- rmse(actual, prediction)
  cat("MAE =", mean_abs_error, " RMSE =", root_mean_sq_error, "\n")
}
    
    

# Validation-set error of the full linear model
lmAll.pred <- predict(lmAll, newdata = elect.df.validation)
cat("lmAll:   ")
accuracy(lmAll.pred, elect.df.validation$Obama_margin_percent)


# Validation-set error of the reduced linear model
lm.tuned.pred <- predict(lm.tuned, newdata = elect.df.validation)
cat("lm.tuned:   ")
accuracy(lm.tuned.pred, elect.df.validation$Obama_margin_percent)

In [None]:
### Linear Regression: Stepwise Method ###

# Starting models for the stepwise search: the same full and reduced formulas
# as in the Linear Regression section, refit on the smaller training set.
lmAll <- lm(
  Obama_margin_percent ~ Region + MalesPer100Females +
    AgeBelow35 + Age35to65 + Age65andAbove +
    White + Black + Asian + AmericanIndian + Hawaiian + Hispanic +
    HighSchool + Bachelors + Poverty + IncomeAbove75K +
    MedianIncome + AverageIncome + UnemployRate + ManfEmploy +
    SpeakingNonEnglish + Medicare + MedicareRate +
    SocialSecurity + SocialSecurityRate + RetiredWorkers +
    Disabilities + DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)
summary(lmAll)



lm.tuned <- lm(
  Obama_margin_percent ~ Region +
    AgeBelow35 + Age35to65 + Age65andAbove +
    Black + Asian + Hawaiian + AmericanIndian +
    Bachelors + HighSchool + Poverty +
    MedianIncome + UnemployRate + ManfEmploy + Medicare +
    SocialSecurity + SpeakingNonEnglish + RetiredWorkers +
    DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)
summary(lm.tuned)


# Backward elimination starting from the full model
lm.step <- step(lmAll, direction = "backward")

summary(lm.step)  # Which variables did it drop?


# Backward elimination starting from the reduced model
lm.step.tuned <- step(lm.tuned, direction = "backward")

summary(lm.step.tuned)  # Which variables did it drop?



# Test accuracy

# Validation-set error of the stepwise model that started from the full model
lm.step.pred <- predict(lm.step, newdata = elect.df.validation)
cat("lm step backwards: ")
accuracy(lm.step.pred, elect.df.validation$Obama_margin_percent)


# Validation-set error of the stepwise model that started from the reduced model
lm.step.tuned.pred <- predict(lm.step.tuned, newdata = elect.df.validation)
cat("lm step backwards tuned: ")
accuracy(lm.step.tuned.pred, elect.df.validation$Obama_margin_percent)

In [None]:
### Regression trees ###

# Regression tree on Region plus every demographic attribute.
rt <- rpart(
  Obama_margin_percent ~ Region + MalesPer100Females +
    AgeBelow35 + Age35to65 + Age65andAbove +
    White + Black + Asian + AmericanIndian + Hawaiian + Hispanic +
    HighSchool + Bachelors + Poverty + IncomeAbove75K +
    MedianIncome + AverageIncome + UnemployRate + ManfEmploy +
    SpeakingNonEnglish + Medicare + MedicareRate +
    SocialSecurity + SocialSecurityRate + RetiredWorkers +
    Disabilities + DisabilitiesRate + Homeowner +
    SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)

prp(rt, type = 1, extra = 1)



# Regression tree on a reduced predictor set: the formula above without
# MalesPer100Females, White, Poverty, UnemployRate, Disabilities and
# DisabilitiesRate.
rt.tuned <- rpart(
  Obama_margin_percent ~ Region +
    AgeBelow35 + Age35to65 + Age65andAbove +
    Black + Asian + AmericanIndian + Hawaiian + Hispanic +
    HighSchool + Bachelors + IncomeAbove75K +
    MedianIncome + AverageIncome + ManfEmploy +
    SpeakingNonEnglish + Medicare + MedicareRate +
    SocialSecurity + SocialSecurityRate + RetiredWorkers + Homeowner +
    SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea,
  data = elect.df.smaller.train
)

# prp from the rpart.plot package to plot the tree
prp(rt.tuned, type = 1, extra = 1)


# Test accuracy

# Validation-set error of the tree grown on all predictors
rt.pred <- predict(rt, newdata = elect.df.validation)
cat("rpart - untuned:   ")
accuracy(rt.pred, elect.df.validation$Obama_margin_percent)


# Validation-set error of the tree grown on the reduced predictor set
rt.tuned.pred <- predict(rt.tuned, newdata = elect.df.validation)
cat("rpart - tuned:   ")
accuracy(rt.tuned.pred, elect.df.validation$Obama_margin_percent)

In [None]:
### XGBOOST ###

library(Matrix)
library(dplyr)
library(magrittr)
library(xgboost)

# Predictors: the same set used by the full linear model and regression tree.
# A `Obama_margin_percent ~ .` formula must not be used here: it would also
# feed Obama, Clinton, TotalVote, Obama_margin and Obama_wins to the booster,
# and the target is computed directly from those columns (target leakage).
xgb_formula <- Obama_margin_percent ~ Region + MalesPer100Females +
  AgeBelow35 + Age35to65 + Age65andAbove +
  White + Black + Asian + AmericanIndian + Hawaiian + Hispanic +
  HighSchool + Bachelors + Poverty + IncomeAbove75K +
  MedianIncome + AverageIncome + UnemployRate + ManfEmploy +
  SpeakingNonEnglish + Medicare + MedicareRate +
  SocialSecurity + SocialSecurityRate + RetiredWorkers +
  Disabilities + DisabilitiesRate + Homeowner +
  SameHouse1995and2000 + Pop + PopDensity + LandArea + FarmArea

# Expand both splits in a single design matrix so categorical predictors get
# identical dummy columns for training and validation, then split it by row.
xgb_rows <- rbind(elect.df.smaller.train, elect.df.validation)
xgb_design <- sparse.model.matrix(xgb_formula, data = xgb_rows)

# Under the default na.action, rows containing NA/NaN are dropped while the
# design matrix is built; fail loudly instead of misaligning rows and labels.
if (nrow(xgb_design) != nrow(xgb_rows)) {
  stop("Rows with missing values were dropped while building the xgboost ",
       "design matrix; impute or remove them first.", call. = FALSE)
}
is_train_row <- seq_len(nrow(xgb_rows)) <= nrow(elect.df.smaller.train)

# Train data into matrix (kept sparse; xgb.DMatrix accepts it directly)
trainm <- xgb_design[is_train_row, , drop = FALSE]
train_label <- elect.df.smaller.train[, "Obama_margin_percent"]
train_matrix <- xgb.DMatrix(data = trainm, label = train_label)

# Test Data into matrix
testm <- xgb_design[!is_train_row, , drop = FALSE]
test_label <- elect.df.validation[, "Obama_margin_percent"]
test_matrix <- xgb.DMatrix(data = testm, label = test_label)

# The target is continuous, so train with a squared-error regression objective
# and report RMSE on the watchlist (no multiclass objective, no num_class).
xbg_params <- list(objective = "reg:squarederror", eval_metric = "rmse")

watchlist <- list(train = train_matrix, test = test_matrix)

bst_model <- xgb.train(
  params = xbg_params,
  data = train_matrix,
  nrounds = 100,
  watchlist = watchlist
)


# Test accuracy

pred.21 <- predict(bst_model, test_matrix)

cat('xgboost:             ')
accuracy(pred.21, test_label)