Permalink
Cannot retrieve contributors at this time
### Code (C) Matthew Theisen 2018. Contact at mtheisen@ucla.edu. | |
library(ISLR) | |
library(glmnet) | |
split_data <- function(df,test_frac) { | |
mask <- runif(nrow(df))<test_frac | |
train_x = scale( df[!mask,colnames(df)!='Purchase'] ) ## Scaling so that regularization works properly. | |
test_x = scale( df[mask,colnames(df)!='Purchase'], | |
center=attr(train_x,"scaled:center"), | |
scale=attr(train_x,"scaled:scale")) ## Should be scaled separately, since otherwise we are using information from the test set for the train set! A form of data leakage. | |
ret <- list(train_x=train_x, | |
train_y=df[!mask,c('Purchase')], | |
test_x=test_x, | |
test_y=df[mask,c('Purchase')]) | |
return (ret) | |
} | |
expit <- function(x) { | |
return ( 1/(1+exp(-x)) ) | |
} | |
logit <- function(x) { | |
return( log( x/(1-x) ) ) | |
} | |
matrix_from_df <- function(x) { | |
return (Matrix(as.matrix(x), sparse = FALSE)) | |
} | |
test_splits <- function(splits) { | |
print("Training Rows") | |
print(nrow(splits$train_x)) | |
print("Testing Rows") | |
print(splits$test_x) | |
print("Training Means") | |
print(colMeans(splits$train_x) ) | |
print("Test Means") | |
print(colMeans(splits$test_x) ) | |
print("Test SDs") | |
print(apply(splits$test_x,2,sd)) | |
print("Train SDs") | |
print(apply(splits$train_x,2,sd)) | |
} | |
get_confusion_matrix <- function(test,pred) { | |
df=data.frame(test=test,pred=pred) | |
df$pred <- as.factor(df$pred) | |
levels(df$pred)<-c("FALSE","TRUE") | |
return( table(df) ) | |
} | |
get_conf_stats <- function(cm) { | |
df <- data.frame(cm) | |
precision <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$pred==TRUE) ),]$Freq ) | |
recall <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$test=="Yes") ),]$Freq ) | |
f1 <- 2/(1/precision + 1/recall) | |
accuracy <- sum(df[( (df$test=="Yes") == (df$pred==TRUE) ),]$Freq) / sum(df$Freq) | |
cost_of_one <- 1 | |
profit_of_one <- 10 | |
overall_profit <- profit_of_one*df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq - cost_of_one*sum( df[( (df$pred==TRUE) ),]$Freq ) | |
return (list(precision=precision, recall=recall, f1=f1, accuracy=accuracy, profit=overall_profit)) | |
} | |
lin_reg_fit <- function(splits) { | |
#### Fit & Predict | |
fit <- glmnet( matrix_from_df(splits$train_x), splits$train_y, family='binomial') | |
train_predict <- predict(fit,newx=matrix_from_df(splits$train_x) ) | |
test_predict <- predict(fit,newx=matrix_from_df(splits$test_x) ) | |
#### Do quick first tests | |
get_confusion_matrix(splits$test_y,test_predict[,100]>0) | |
colSums( test_predict > 0 ) | |
#### Find good lambdas at decision boundary = 0.1 | |
num_pts <- 91 | |
graphing_df <- data.frame(lambda=rep(0,num_pts), train_f1=rep(0,num_pts), test_f1=rep(0,num_pts) ) | |
multiplier <- as.integer(100/num_pts) | |
i=0 | |
for ( row in 10:100 ) { | |
# row <- multiplier*i | |
i=i+1 | |
test_pred_y <- test_predict[,row] > logit(0.1) | |
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y ) | |
test_conf_stats <- get_conf_stats(test_conf_mat) | |
train_pred_y <- train_predict[,row] > logit(0.1) | |
train_conf_mat <- get_confusion_matrix( splits$train_y,train_pred_y ) | |
train_conf_stats <- get_conf_stats(train_conf_mat) | |
graphing_df$lambda[i] <- fit$lambda[row] | |
graphing_df$test_f1[i] <- test_conf_stats[['f1']] | |
graphing_df$train_f1[i] <- train_conf_stats[['f1']] | |
} | |
plot(graphing_df$lambda,graphing_df$test_f1,xlim=c(0,0.015),ylim=c(0.15,0.3),xlab='Lambda', ylab='Test F1', | |
type="l") | |
#### MOST SIGNIFICANT FACTORS | |
row <- 10 | |
sig_names <- rownames( coef(fit) ) [ coef(fit)[,row]!=0 ] | |
coefs <- coef(fit)[,row][sig_names] | |
print(coefs) | |
#### Find Most Profitable Lambdas By Cutoff | |
fit_rows <- c(100,50,10) | |
cutoffs <- (1:20)/50 | |
data_storage <- list(low=data.frame(cut=cutoffs,profit=rep(0,20)), | |
middle=data.frame(cut=cutoffs,profit=rep(0,20)), | |
high=data.frame(cut=cutoffs,profit=rep(0,20)) ) | |
list_labels <- c('low','middle','high') | |
for (i in 1:20) { | |
cutoff <- cutoffs[i] | |
for (k in 1:3) { | |
row <- fit_rows[k] | |
test_pred_y <- test_predict[,row] > logit(cutoff) | |
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y ) | |
test_conf_stats <- get_conf_stats(test_conf_mat) | |
data_storage[[list_labels[k]]]$profit[i] <- test_conf_stats[['profit']] | |
} | |
} | |
colors = list(low='red',middle='blue',high='green') | |
plot(1,type='n',xlim=c(0,0.4),ylim=c(-200,200),xlab='Probability Cutoff','Profit ($)') | |
for (label in list_labels) { | |
lines( data_storage[[label]]$cut, data_storage[[label]]$profit, col=colors[[label]] ) | |
} | |
legend(0.3,y=200,legend=c('low','medium','high'),fill=c('red','blue','green')) | |
} | |
splits <- split_data(Caravan,0.2) | |
test_splits(splits) | |
lin_reg_fit(splits) |