# 1 - Load and prepare data

In [32]:
library(dplyr, warn.conflicts=FALSE)
library(randomForest)
library(ROCR)

#---------------------------------------------------------------
# Move into the project root so that the relative "./Data/..."
# paths used below resolve.
#---------------------------------------------------------------

setwd("/home/thibault/Documents/oilprediction_challenge/")

#---------------------------------------------------------------
# Read the semicolon-separated input files. Text columns are kept
# as character vectors rather than being converted to factors.
#---------------------------------------------------------------

test <- read.csv(file = "./Data/Test.csv", sep = ";", stringsAsFactors = FALSE)
Y_train <- read.csv(file = "./Data/Y_train.csv", sep = ";", stringsAsFactors = FALSE)
train <- read.csv(file = "./Data/Train.csv", sep = ";", stringsAsFactors = FALSE)

#---------------------------------------------------------------
# Stack the train rows on top of the test rows in one data frame.
#---------------------------------------------------------------

full_dataset <- rbind(train, test)

# Show the first rows of the stacked data.
head(full_dataset)

ID,month,country,X1_diffClosing.stocks.kmt.,X1_diffExports.kmt.,X1_diffImports.kmt.,X1_diffRefinery.intake.kmt.,X1_diffWTI,X1_diffSumClosing.stocks.kmt.,X1_diffSumExports.kmt.,⋯,X12_diffClosing.stocks.kmt.,X12_diffExports.kmt.,X12_diffImports.kmt.,X12_diffRefinery.intake.kmt.,X12_diffWTI,X12_diffSumClosing.stocks.kmt.,X12_diffSumExports.kmt.,X12_diffSumImports.kmt.,X12_diffSumProduction.kmt.,X12_diffSumRefinery.intake.kmt.
ID00001,5,52,0.0,237.0,0.0,33.0,-5.62,10426.2462,12135.102,⋯,0.0,0.0,0.0,14.0,2.92,673.8569,-13007.152,-5932.5482,-17130.833,-13151.456
ID00002,12,69,28.0,-34.0,60.0,7.0,-5.95,195.2005,7500.103,⋯,-43.0,-129.0,-67.0,-16.0,-5.62,3217.0304,-6563.642,-3587.5152,-13053.659,-13005.05
ID00003,5,74,0.0,0.0,-29.1227,-85.5816,-5.62,10426.2462,12135.102,⋯,0.0,0.0,-17.8667,-13.2214,2.92,673.8569,-13007.152,-5932.5482,-17130.833,-13151.456
ID00004,11,34,175.3701,91.5209,0.0,339.4347,7.66,-2247.4834,-6806.122,⋯,81.8304,-579.9462,0.0,16.6891,-8.62,-2929.8903,1072.847,504.0197,-4827.23,3962.629
ID00005,7,2,-550.0674,251.9568,0.0,-49.5277,-12.07,-2652.6804,2165.712,⋯,-1127.1255,-512.5506,0.0,-2.0243,7.2,2513.6243,-1732.443,-4827.1576,-7242.551,-13079.225
ID00006,12,18,-5.0,0.0,54.0,0.0,-6.41,4667.861,5818.622,⋯,-93.0,0.0,-159.0,4.0,-7.48,-321.0981,-2198.87,-3970.4225,-8308.61,-9271.91


# Modeling

In [8]:
# Attach the labels from Y_train to the training features.
# The join key is stated explicitly instead of being guessed from the
# column names the two tables happen to share; this also silences the
# "Joining, by = ..." message.
full_train <- left_join(train, Y_train, by = "ID")

# Show the first rows of the joined training data.
head(full_train)

Joining, by = "ID"


ID,month,country,X1_diffClosing.stocks.kmt.,X1_diffExports.kmt.,X1_diffImports.kmt.,X1_diffRefinery.intake.kmt.,X1_diffWTI,X1_diffSumClosing.stocks.kmt.,X1_diffSumExports.kmt.,⋯,X12_diffExports.kmt.,X12_diffImports.kmt.,X12_diffRefinery.intake.kmt.,X12_diffWTI,X12_diffSumClosing.stocks.kmt.,X12_diffSumExports.kmt.,X12_diffSumImports.kmt.,X12_diffSumProduction.kmt.,X12_diffSumRefinery.intake.kmt.,Target
ID00001,5,52,0.0,237.0,0.0,33.0,-5.62,10426.2462,12135.102,⋯,0.0,0.0,14.0,2.92,673.8569,-13007.152,-5932.5482,-17130.833,-13151.456,1
ID00002,12,69,28.0,-34.0,60.0,7.0,-5.95,195.2005,7500.103,⋯,-129.0,-67.0,-16.0,-5.62,3217.0304,-6563.642,-3587.5152,-13053.659,-13005.05,0
ID00003,5,74,0.0,0.0,-29.1227,-85.5816,-5.62,10426.2462,12135.102,⋯,0.0,-17.8667,-13.2214,2.92,673.8569,-13007.152,-5932.5482,-17130.833,-13151.456,0
ID00004,11,34,175.3701,91.5209,0.0,339.4347,7.66,-2247.4834,-6806.122,⋯,-579.9462,0.0,16.6891,-8.62,-2929.8903,1072.847,504.0197,-4827.23,3962.629,0
ID00005,7,2,-550.0674,251.9568,0.0,-49.5277,-12.07,-2652.6804,2165.712,⋯,-512.5506,0.0,-2.0243,7.2,2513.6243,-1732.443,-4827.1576,-7242.551,-13079.225,1
ID00006,12,18,-5.0,0.0,54.0,0.0,-6.41,4667.861,5818.622,⋯,0.0,-159.0,4.0,-7.48,-321.0981,-2198.87,-3970.4225,-8308.61,-9271.91,1


In [20]:
# Every column of full_train except the row identifier and the label is
# used as a predictor.
excluded_columns <- c("ID", "Target")
is_predictor <- !(names(full_train) %in% excluded_columns)
variables_in_model <- paste(names(full_train)[is_predictor], collapse = "+")

# Formula of the form as.factor(Target) ~ x1+x2+...; the response is
# wrapped in as.factor() inside the formula itself.
formula_rf <- as.formula(paste0("as.factor(Target) ~", variables_in_model))

In [34]:
# Fit a random forest of 20 trees on the joined training data, asking
# randomForest to also compute variable importance.
rf_output <- randomForest(
  formula_rf,
  data = full_train,
  ntree = 20,
  importance = TRUE
)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


# Evaluation

In [33]:
# Evaluate the fitted forest with ROCR: compute the AUC and draw the ROC curve.
# This cell needs `rf_output` from the model-fitting cell, so it must be run
# after that cell.

# Second column of the forest's vote matrix, used as the score.
# NOTE(review): presumably this is the column for class "1" (Target is 0/1
# in the displayed rows) -- confirm with colnames(rf_output$votes).
predictions <- as.vector(rf_output$votes[, 2])

# The labels are the Target column of the data the forest was fitted on.
# The original code passed `target`, which is not defined in this notebook.
pred <- prediction(predictions, full_train$Target)

# Area under the ROC curve.
perf_AUC <- performance(pred, "auc")
AUC <- perf_AUC@y.values[[1]]

# ROC curve: true positive rate against false positive rate, with the AUC
# value written in the middle of the plot.
perf_ROC <- performance(pred, "tpr", "fpr")
plot(perf_ROC, main = "ROC plot")
text(0.5, 0.5, paste("AUC = ", format(AUC, digits = 5, scientific = FALSE)))

ERROR: Error in as.vector(rf_output$votes[, 2]): object 'rf_output' not found
