Skip to content
Probability a user will click on a given article
R
Branch: master
Clone or download

Latest commit

Fetching latest commit…
Cannot retrieve the latest commit at this time.

Files

Permalink
Type Name Latest commit message Commit time
Failed to load latest commit information.
About
data
figure
scripts
.RData
.Rhistory
.gitignore
Icon
LICENSE
README.md
data.Rproj

README.md

Author: Tim Siwula

Proposal: http://bit.ly/2gcCLQ4

Kaggle: http://bit.ly/2gMVpPG

Github: http://bit.ly/2gZoTwy

Data: http://bit.ly/2fQ0LHW


Project description

We are trying to predict the click probability for a given article displayed to a user online. The target variable is if a user clicked or not. We are working on featuring engineering given a multitude of data sets. The target variable name is click_prob.


output: pdf_document library(knitr) library(markdown)

#transform the .Rmd to a markdown (.md) file. knit('tims_script.Rmd')

########################### SET UP DATABASE CONNECTION ################################

require("RPostgreSQL")    #install.packages("RPostgreSQL")
driver <- dbDriver("PostgreSQL")   # loads the PostgreSQL driver

# creates a connection to the postgres database
# note that "con" will be used later in each connection to the database
connection <- dbConnect(driver, dbname = "clickprediction",
                 host = "localhost", port = 5432,
                 user = "admin", password = "admin")
dbExistsTable(connection, "clicks_train")  # confirm the tables are accessible
## [1] TRUE

####################################################################################

########################### QUERY THE DATABASE ########################################

# 1)
# try to find features related to ad_id.
# here we join click_train and promoted-content with ad_id.

# look at clicks_train first
getClicksTrain="select * from clicks_train limit 10 "
clicks_train = dbGetQuery(connection, getClicksTrain)
clicks_train
##    display_id  ad_id clicked
## 1           1  42337       0
## 2           1 139684       0
## 3           1 144739       1
## 4           1 156824       0
## 5           1 279295       0
## 6           1 296965       0
## 7           2 125211       0
## 8           2 156535       0
## 9           2 169564       0
## 10          2 308455       1
# look at promoted_content next
getPromotedContent="select * from promoted_content limit 10"
promoted_content = dbGetQuery(connection, getPromotedContent)
promoted_content
##    ad_id document_id campaign_id advertiser_id
## 1      1        6614           1             7
## 2      2      471467           2             7
## 3      3        7692           3             7
## 4      4      471471           2             7
## 5      5      471472           2             7
## 6      6       12736           1             7
## 7      7       12808           1             7
## 8      8      471477           2             7
## 9      9       13379           1             7
## 10    10       13885           1             7
# join click_train and promoted-content with ad_id new table
# 500k apears to be stable with rstudio.
join_query = "
select t.display_id, t.ad_id, t.clicked, d.document_id,
d.topic_id, d.confidence_level
from clicks_train t, promoted_content p, documents_topics d
where t.ad_id = p.ad_id and p.document_id = d.document_id
limit 500000;"
merged_table=dbGetQuery(connection, join_query)
head(merged_table, 3)
##   display_id  ad_id clicked document_id topic_id confidence_level
## 1          6 180923       0     1151028       74       0.09882334
## 2          6 180923       0     1151028       16       0.17285361
## 3         48 180923       0     1151028       74       0.09882334
dim(merged_table)
## [1] 500000      6

####################################################################################

########################### Create new table -- write new table ########################################

dbWriteTable(connection, "merged_table", merged_table, row.names=FALSE)
## Warning in postgresqlWriteTable(conn, name, value, ...): table merged_table
## exists in database: aborting assignTable
## [1] FALSE
# look at the new table
getMergedTable="select * from merged_table limit 500000"

new_table = dbGetQuery(connection, getMergedTable)

# list the structure of mydata
str(new_table)
## 'data.frame':	500000 obs. of  6 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...

####################################################################################

############################################################################## 3) Models and response Models to consider: Start with the classification problem first - easier problem - start with decision trees, random forests and bagging Then switch to regression

require(tree)
## Loading required package: tree
str(new_table)
## 'data.frame':	500000 obs. of  6 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...
hist(new_table$confidence_level)

plot of chunk unnamed-chunk-4

hist(new_table$clicked)

plot of chunk unnamed-chunk-4

hist(new_table$display_id)

plot of chunk unnamed-chunk-4

hist(new_table$ad_id)

plot of chunk unnamed-chunk-4

hist(new_table$document_id)

plot of chunk unnamed-chunk-4

hist(new_table$topic_id)

plot of chunk unnamed-chunk-4

attach(new_table)
confidence20=ifelse(new_table$confidence_level>=0.2,"Yes","No")
str(confidence20)
##  chr [1:500000] "No" "No" "No" "Yes" "No" "No" "No" ...
new_table=data.frame(new_table, confidence20)
tree.local.train.c20=tree(confidence20~.-new_table$confidence_level,data=new_table)
summary(tree.local.train.c20)
##
## Classification tree:
## tree(formula = confidence20 ~ . - new_table$confidence_level,
##     data = new_table)
## Variables actually used in tree construction:
## [1] "confidence_level"
## Number of terminal nodes:  2
## Residual mean deviance:  0 = 0 / 5e+05
## Misclassification error rate: 0 = 0 / 5e+05
plot(tree.local.train.c20)
text(tree.local.train.c20,pretty=0)

plot of chunk unnamed-chunk-4 ##############################################################################

########################### create a training set ###########################################

# list the structure of mydata
str(new_table)
## 'data.frame':	500000 obs. of  7 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...
##  $ confidence20    : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
partition_size = floor(0.80 * nrow(new_table)) ## 80% of the sample size
str(partition_size)
##  num 4e+05
set.seed(123) ## set the seed to make your partition reproductible
partition_index <- sample(seq_len(nrow(new_table)), size = partition_size)
local_train_set <- new_table[partition_index, ]
local_test_set <- new_table[-partition_index, ]

# list the structure of mydata
str(local_test_set)
## 'data.frame':	100000 obs. of  7 variables:
##  $ display_id      : int  13339520 13339520 13339539 13339605 13339805 13339805 13339945 13340028 13340034 13340092 ...
##  $ ad_id           : int  389774 389774 400648 509131 528174 528174 90197 528174 180923 180923 ...
##  $ clicked         : int  1 1 0 0 0 0 0 1 0 0 ...
##  $ document_id     : int  1804537 1804537 1804537 2436524 2442282 2442282 483905 2442282 1151028 1151028 ...
##  $ topic_id        : int  184 97 66 197 260 124 140 260 16 16 ...
##  $ confidence_level: num  0.11432 0.21158 0.08875 0.02792 0.00818 ...
##  $ confidence20    : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
# print first 10 rows of mydata
#head(mydata, n=10)

##############################################################################

You can’t perform that action at this time.