Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
138 lines (115 sloc) 4.68 KB
### Code (C) Matthew Theisen 2018. Contact at mtheisen@ucla.edu.
library(ISLR)
library(glmnet)
split_data <- function(df,test_frac) {
mask <- runif(nrow(df))<test_frac
train_x = scale( df[!mask,colnames(df)!='Purchase'] ) ## Scaling so that regularization works properly.
test_x = scale( df[mask,colnames(df)!='Purchase'],
center=attr(train_x,"scaled:center"),
scale=attr(train_x,"scaled:scale")) ## Should be scaled separately, since otherwise we are using information from the test set for the train set! A form of data leakage.
ret <- list(train_x=train_x,
train_y=df[!mask,c('Purchase')],
test_x=test_x,
test_y=df[mask,c('Purchase')])
return (ret)
}
expit <- function(x) {
return ( 1/(1+exp(-x)) )
}
logit <- function(x) {
return( log( x/(1-x) ) )
}
matrix_from_df <- function(x) {
return (Matrix(as.matrix(x), sparse = FALSE))
}
test_splits <- function(splits) {
print("Training Rows")
print(nrow(splits$train_x))
print("Testing Rows")
print(splits$test_x)
print("Training Means")
print(colMeans(splits$train_x) )
print("Test Means")
print(colMeans(splits$test_x) )
print("Test SDs")
print(apply(splits$test_x,2,sd))
print("Train SDs")
print(apply(splits$train_x,2,sd))
}
get_confusion_matrix <- function(test,pred) {
df=data.frame(test=test,pred=pred)
df$pred <- as.factor(df$pred)
levels(df$pred)<-c("FALSE","TRUE")
return( table(df) )
}
get_conf_stats <- function(cm) {
df <- data.frame(cm)
precision <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$pred==TRUE) ),]$Freq )
recall <- df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq / sum( df[( (df$test=="Yes") ),]$Freq )
f1 <- 2/(1/precision + 1/recall)
accuracy <- sum(df[( (df$test=="Yes") == (df$pred==TRUE) ),]$Freq) / sum(df$Freq)
cost_of_one <- 1
profit_of_one <- 10
overall_profit <- profit_of_one*df[( (df$test=="Yes") & (df$pred==TRUE) ),]$Freq - cost_of_one*sum( df[( (df$pred==TRUE) ),]$Freq )
return (list(precision=precision, recall=recall, f1=f1, accuracy=accuracy, profit=overall_profit))
}
lin_reg_fit <- function(splits) {
#### Fit & Predict
fit <- glmnet( matrix_from_df(splits$train_x), splits$train_y, family='binomial')
train_predict <- predict(fit,newx=matrix_from_df(splits$train_x) )
test_predict <- predict(fit,newx=matrix_from_df(splits$test_x) )
#### Do quick first tests
get_confusion_matrix(splits$test_y,test_predict[,100]>0)
colSums( test_predict > 0 )
#### Find good lambdas at decision boundary = 0.1
num_pts <- 91
graphing_df <- data.frame(lambda=rep(0,num_pts), train_f1=rep(0,num_pts), test_f1=rep(0,num_pts) )
multiplier <- as.integer(100/num_pts)
i=0
for ( row in 10:100 ) {
# row <- multiplier*i
i=i+1
test_pred_y <- test_predict[,row] > logit(0.1)
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y )
test_conf_stats <- get_conf_stats(test_conf_mat)
train_pred_y <- train_predict[,row] > logit(0.1)
train_conf_mat <- get_confusion_matrix( splits$train_y,train_pred_y )
train_conf_stats <- get_conf_stats(train_conf_mat)
graphing_df$lambda[i] <- fit$lambda[row]
graphing_df$test_f1[i] <- test_conf_stats[['f1']]
graphing_df$train_f1[i] <- train_conf_stats[['f1']]
}
plot(graphing_df$lambda,graphing_df$test_f1,xlim=c(0,0.015),ylim=c(0.15,0.3),xlab='Lambda', ylab='Test F1',
type="l")
#### MOST SIGNIFICANT FACTORS
row <- 10
sig_names <- rownames( coef(fit) ) [ coef(fit)[,row]!=0 ]
coefs <- coef(fit)[,row][sig_names]
print(coefs)
#### Find Most Profitable Lambdas By Cutoff
fit_rows <- c(100,50,10)
cutoffs <- (1:20)/50
data_storage <- list(low=data.frame(cut=cutoffs,profit=rep(0,20)),
middle=data.frame(cut=cutoffs,profit=rep(0,20)),
high=data.frame(cut=cutoffs,profit=rep(0,20)) )
list_labels <- c('low','middle','high')
for (i in 1:20) {
cutoff <- cutoffs[i]
for (k in 1:3) {
row <- fit_rows[k]
test_pred_y <- test_predict[,row] > logit(cutoff)
test_conf_mat <- get_confusion_matrix( splits$test_y,test_pred_y )
test_conf_stats <- get_conf_stats(test_conf_mat)
data_storage[[list_labels[k]]]$profit[i] <- test_conf_stats[['profit']]
}
}
colors = list(low='red',middle='blue',high='green')
plot(1,type='n',xlim=c(0,0.4),ylim=c(-200,200),xlab='Probability Cutoff','Profit ($)')
for (label in list_labels) {
lines( data_storage[[label]]$cut, data_storage[[label]]$profit, col=colors[[label]] )
}
legend(0.3,y=200,legend=c('low','medium','high'),fill=c('red','blue','green'))
}
splits <- split_data(Caravan,0.2)
test_splits(splits)
lin_reg_fit(splits)