## Model-tRacker

In [1]:
options(repr.matrix.max.cols = 1000)
options(repr.matrix.max.rows = 50)
options(warn=-1)

In [7]:
options(scipen=999)

In [6]:
# library(lme4)
library(plyr)
library(dplyr)
# library(car)
# library(caret)
library(data.table)

In [8]:
data <- read.csv("/Users/tdobbins/Documents/TheGeneral/Projects/PriceElasticity/total_premium_data.csv", nrows=2000)

In [10]:
columns <- c('QUOTENUM','DriversOnQuote','DistributionMethod', 'DsStarLevel', 'NamedInsuredAge',
             'NamedInsuredDriverPoints','NamedInsuredMaritalStatus','QuotedPremium',
             'RateBookCode','RateState','TermLength','VehiclesOnQuote','VehiclesWithBI',
             'VehiclesWithFullCoverage','TerritoryCode', 'TransferNormIndicator','COVERAGE',
             'WRITTEN','DEDUCTIBLE','LIMIT','AUYEAR','FULL_COV','ISO_COV_SYMBOL','DRGEN',
             'YEARS_EXP','CREDIT_SC','DRNUM','DsDwelling')
names(data) <- columns

In [11]:
set.seed(321)
data <- data[sample(nrow(data)),]

In [12]:
data$AnnualizedQuotedPremium <- ifelse(data$TermLength==12, data$QuotedPremium*1, data$QuotedPremium*2);
data$NamedInsuredAgeGroup <- ifelse(
    data$NamedInsuredAge <= 25, 0, ifelse(
        between(data$NamedInsuredAge, 25, 39, incbound=TRUE), 1, ifelse(
            between(data$NamedInsuredAge, 40, 59, incbound=TRUE), 2, 3
        )
    )
);
data$AUYEAR <- ifelse(data$AUYEAR <= 1980, 1980, data$AUYEAR);
data$twentyOrUnder <- ifelse(data$NamedInsuredAge <=20, TRUE, FALSE);
data$ISO_CODE_GROUP <- substr(data$ISO_COV_SYMBOL, 1, 1)
data$maritalStatusFlag <- ifelse(data$NamedInsuredMaritalStatus=='Married', 'Married', 'Other');
data$multiCarFlag <- ifelse(data$VehiclesOnQuote > 1, TRUE, FALSE);
data$distMethodBin <- ifelse(data$DistributionMethod %in% c('I', 'G', 'R'), data$DistributionMethod, 'O');
data$LIMIT <- trimws(data$LIMIT)

lessThan25 <- c('25', '25/50', '15/30', '20', '10',
                '5', '10000', '15', '20000', '20/40',
                '3000', '2500', '10/20', '15000', '4500', '5000');
between26_50 <- c('50', '30/60', '50000', '30000', '25/65', '50/100');
greaterThan50 <- c('100', '100/300');

data$LIMIT[data$LIMIT %in% lessThan25] <- '<=25';
data$LIMIT[data$LIMIT %in% between26_50] <- '26-50';
data$LIMIT[data$LIMIT %in% greaterThan50] <- '51+';
data$LIMIT[data$LIMIT == '0'] <- '0';

data$HomeOwner <- ifelse(is.na(data$DsDwelling), FALSE, TRUE)

data$starLevelStr <- ifelse(!is.na(data$DsStarLevel), as.character(data$DsStarLevel), 'N/A');
data$DsStarLevel[is.na(data$DsStarLevel)] <- 0;
data$starLevelNotNA <- ifelse(!is.na(data$DsStarLevel), data$DsStarLevel, 0);

data$DRGEN <- ifelse(
    is.na(data$DRGEN), 'Unknown', data$DRGEN
)


In [13]:
kfolds <- createFolds(data, k=10, list=T, returnTrain=F);
names(kfolds)[1] <- "train";

ERROR: Error in eval(expr, envir, enclos): could not find function "createFolds"


In [14]:
sample <- sample.int(n=nrow(data), size=floor(.3*nrow(data)), replace=F)
train <- data[sample,]
test <- data[-sample,]

In [341]:
getr2 <- function(model){
    return(summary(model)$r.squared)
}

getpr2 <- function(model){
    return(model.glm$rank);
}


In [382]:
if (TRUE == TRUE){
    print('it')
    }

[1] "it"


In [535]:
pvc <- function(model, metrics=NULL, customMetrics=NULL, ...){
    
    kwargs <- list(...);
    
    environmentVars <- list(
        datetime = format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
        username = Sys.info()[["user"]],
        className = class(model)[[1]]
    )
    
    form <- data.frame(formula=paste(deparse(formula(model)), collapse=''));
    
    for (env in names(environmentVars)){
        form[[env]] <-  environmentVars[[env]];
    }

    if (!is.null(metrics)){
        for (metric in metrics){
            form[[metric]]  <- summary(model)[[metric]];
        }
    }

    if (!is.null(customMetrics)){
        for (cmetric in customMetrics){
            form[[cmetric]] <- get(cmetric)(model);
        }
    }

    for (arg in names(kwargs)){
        form[[arg]] <- kwargs[[arg]];
    }
    
    DIRTY_METRICS <- FALSE;
    
    if (exists('METRICS_LIST')){
        
        DIRTY_METRICS <- FALSE %in% 
            unique(
                append(
                    names(kwargs), unlist(
                        append(
                            metrics, customMetrics
                        )
                    )
                ) == METRICS_LIST
            )
        
        if (DIRTY_METRICS){
            MMODELS <<- form;
            print('false')
        }
    }
    
    METRICS_LIST <<- append(
        names(kwargs), unlist(
            append(
                metrics, customMetrics
            )
        )
    )

    if (exists('MODELS')){
        COUNTER <<- COUNTER + 1;
    } else {
        MODELS <<- form;
        COUNTER <<- 1;
    }
    
    if (exists('METRICS_LIST') & DIRTY_METRICS){

        for (env in names(environmentVars)){
            MMODELS[[env]] <<-  environmentVars[[env]];
        }

        if (!is.null(metrics)){
            for (metric in metrics){
                MMODELS[[metric]]  <<- summary(model)[[metric]];
            }
        }

        if (!is.null(customMetrics)){
            for (cmetric in customMetrics){
                MMODELS[[cmetric]] <<- get(cmetric)(model);
            }
        }

        for (arg in names(kwargs)){
            MMODELS[[arg]] <<- kwargs[[arg]];
        }
    }
    
    if (COUNTER > 1){
        if (DIRTY_METRICS){
            combinedDf <- rbind.fill(MODELS, MMODELS);
            MODELS <<- combinedDf[with(combinedDf, order(datetime, decreasing=TRUE)),];
        } else {
            combinedDf <- rbind.fill(MODELS, form);
            MODELS <<- combinedDf[with(combinedDf, order(datetime, decreasing=TRUE)),];
        }
    }
    return(form);
}

In [545]:
MODELS

Unnamed: 0,formula,datetime,username,className,getpr2,this,foo,baz,d
4,"AnnualizedQuotedPremium ~ (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + poly(DriversOnQuote, 2) + poly(VehiclesOnQuote, 2) + HomeOwner + DsStarLevel + factor(FULL_COV)",2017-06-15 22:52:26,tdobbins,glm,32,this,bar,ththth,
1,"AnnualizedQuotedPremium ~ (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + poly(DriversOnQuote, 2) + poly(VehiclesOnQuote, 2) + (VehiclesWithFullCoverage * VehiclesWithBI) + factor(TransferNormIndicator) + NamedInsuredDriverPoints + factor(distMethodBin) + factor(ISO_CODE_GROUP) + poly(AUYEAR, 2) + HomeOwner + DsStarLevel + factor(FULL_COV)",2017-06-15 22:52:17,tdobbins,glm,47,this,bar,ththth,
2,"AnnualizedQuotedPremium ~ (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + poly(DriversOnQuote, 2) + poly(VehiclesOnQuote, 2) + (VehiclesWithFullCoverage * VehiclesWithBI) + factor(TransferNormIndicator) + NamedInsuredDriverPoints + factor(distMethodBin) + factor(ISO_CODE_GROUP) + poly(AUYEAR, 2) + HomeOwner + DsStarLevel + factor(FULL_COV)",2017-06-15 22:52:09,tdobbins,glm,47,this,bar,ththth,gfds
3,"AnnualizedQuotedPremium ~ (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + poly(DriversOnQuote, 2) + poly(VehiclesOnQuote, 2) + (VehiclesWithFullCoverage * VehiclesWithBI) + factor(TransferNormIndicator) + NamedInsuredDriverPoints + factor(distMethodBin) + factor(ISO_CODE_GROUP) + poly(AUYEAR, 2) + HomeOwner + DsStarLevel + factor(FULL_COV)",2017-06-15 22:52:04,tdobbins,glm,47,,,,


In [544]:
time1 <- Sys.time();

model.glm <- glm(
    AnnualizedQuotedPremium ~ 
        (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + 
        poly(DriversOnQuote, 2) +
        poly(VehiclesOnQuote, 2) + 
#         (VehiclesWithFullCoverage * VehiclesWithBI) +
#         factor(TransferNormIndicator) +
#         NamedInsuredDriverPoints + 
#         factor(distMethodBin) +
#         factor(ISO_CODE_GROUP) +
#         poly(AUYEAR, 2) +
        HomeOwner +
        DsStarLevel +
        factor(FULL_COV),
    data=train, 
    family=Gamma(link="log")
)

time2 <- Sys.time() - time1;
print(time2);

pvc(model.glm, metrics=list('link', 'sigma'), customMetrics=list('getr2', 'getpr2'), this='this', foo='bar', baz='ththth');

Time difference of 0.02617693 secs


formula,datetime,username,className,getpr2,this,foo,baz
"AnnualizedQuotedPremium ~ (factor(DRGEN) * poly(NamedInsuredAge, 3) * maritalStatusFlag) + poly(DriversOnQuote, 2) + poly(VehiclesOnQuote, 2) + HomeOwner + DsStarLevel + factor(FULL_COV)",2017-06-15 22:52:26,tdobbins,glm,32,this,bar,ththth


In [536]:
rm(MODELS);rm(COUNTER);rm(MMODELS);rm(METRICS_LIST)

In [124]:
train.sub <- train[
#     !is.na(model.glmer.data$DRGEN) &
    !is.na(train$DsStarLevel),
]

In [48]:
output.train <- cbind(
    train.sub, 
    predict=fitted(model.glmer), 
    resid=resid(model.glmer)
)

meanErr <- mean(abs(output.train$AnnualizedQuotedPremium - output.train$predict))
sdErr <- sd(abs(output.train$AnnualizedQuotedPremium - output.train$predict))

In [49]:
threshold <- 0.2;
thresholdRate <- nrow(
    output.train[
        output.train$AnnualizedQuotedPremium*(1-threshold) < 
        output.train$predict &
        output.train$predict <
        output.train$AnnualizedQuotedPremium*(1+threshold),
    ]
)/nrow(output.train)

In [50]:
filteredData <- output.train[output.train$AnnualizedQuotedPremium > output.train$predict,];

output.train$adjustedresdiuals <- round(abs(output.train$WRITTEN - fitted(model.glmer)), 3);
filteredData$accuracyRate <- 100/(filteredData$AnnualizedQuotedPremium/abs(filteredData$AnnualizedQuotedPremium-filteredData$predict))
output.train$accuracyRate <- 100/(output.train$AnnualizedQuotedPremium/abs(output.train$AnnualizedQuotedPremium-output.train$predict))

accuracyRate <- mean(output.train$accuracyRate);

In [51]:
# latest.csv
rmse <- sqrt(mean((output.train$AnnualizedQuotedPremium - output.train$predict)^2));
cat(
    sprintf("
         MEAN ERROR:    %s
         STDV ERROR:    %s
         THRESHOLD:     %s
         RMSE:          %s
         ACCURACY RATE: %s", meanErr, sdErr, thresholdRate, rmse, accuracyRate
    )
)


         MEAN ERROR:    529.698448309204
         STDV ERROR:    719.271289585662
         THRESHOLD:     0.685
         RMSE:          892.787422729152
         ACCURACY RATE: 17.6524573592468

In [None]:
groud truth: 0 = 654707, 1 = 45293
classifier:  0 = 617307, 1 = 82693
Total: 700000
observed accuracy: ((45293 + 617307)/700000) = 0.946571428571429
expected accuracy: ((654707*617307/700000)+(45293*82693/700000))/700000 = 0.83245026142449
kappa: (0.946571428571429 - 0.83245026142449)/(1-0.83245026142449) = 0.6811181450785