In [1]:
import pandas as pd
import os
#from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit

In [2]:
def merge_chunks(right_on_list, input_file, output_file, rename_columns={}, drop_columns=[], chunksize=10**7, how='left', index=False, usecols=None):
    first_time = True
    size = 0
    i = 0
    
    f = open(output_file, 'w')

    for left in pd.read_csv(input_file, usecols=usecols, chunksize=chunksize):
        for right, on in right_on_list:
            left = left.merge(right, how=how, on=on)
            
        if len(rename_columns)>0:
            left.rename(columns=rename_columns, inplace=True)
            
        if len(drop_columns)>0:
            left.drop(drop_columns, axis=1, inplace = True)
    
        if first_time:
            left.to_csv(f, index = index, header=True)
            f.close()
            f = open(output_file, 'a')
            first_time = False
        else:
            left.to_csv(f, index = index, header=False)
    
        size+=chunksize
        i+=1
        print i, size
            
    print 'Done'
    
    f.close()

## Load / preview common tables to be joined

In [None]:
events_filename = "../generated/final/events.csv"

In [None]:
events = pd.read_csv(events_filename, nrows=10)
events.head()

In [None]:
adsPerDisplay = pd.read_csv("../generated/adsPerDisplay.csv")
adsPerDisplay.count()

## Join training

### Step 1

In [None]:
clicks_train = pd.read_csv("../download/clicks_train.csv")
clicks_train.count()

In [None]:
right_on_list = [(clicks_train, 'display_id'), (adsPerDisplay, 'display_id')] 
input_file = events_filename
output_file = "../generated/final/events_clicks_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='inner'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 2

Can be run after feature_base_3

In [None]:
events_CTR_train = pd.read_csv("../generated/final/events_CTR_train.csv")
events_CTR_train.count()

In [None]:
right_on_list = [(events_CTR_train, ['display_id', 'ad_id'])] 
input_file = "../generated/final/events_clicks_train.csv"
output_file = "../generated/final/events_clicks_CTR_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='inner'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 3

In [None]:
ads = pd.read_csv("../download/promoted_content.csv", names=['ad_id', 'addoc_id', 'c', 'a'], header=0, usecols=['ad_id', 'addoc_id'])
ads.head()

In [None]:
right_on_list = [(ads, ['ad_id'])] 
input_file = "../generated/final/events_clicks_CTR_train.csv"
output_file = "../generated/final/events_clicks_CTR_addocs_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 4

Can be run after feature_base_4_topics

In [None]:
page_addoc_dist = pd.read_csv("../generated/final/page_addoc_topics_no_w_dist.csv")
page_addoc_dist.head()

In [None]:
page_addoc_dist.count()

In [None]:
right_on_list = [(page_addoc_dist, ['document_id', 'addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_train.csv"   
output_file = "../generated/final/events_clicks_CTR_addocs_dist_no_w_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 5

Can be run after feature_base_4_cats

In [None]:
page_addoc_dist = pd.read_csv("../generated/final/page_addoc_cats_dist.csv")
page_addoc_dist.head()

In [9]:
page_addoc_dist.count()

document_id      27700275
addoc_id         27700275
dist_cats        27700275
doc_norm_cats    27700275
ad_norm_cats     27700275
dtype: int64

In [None]:
right_on_list = [(page_addoc_dist, ['document_id', 'addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_dist_no_w_train.csv"   
output_file = "../generated/final/events_clicks_CTR_addocs_dist_cats_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [11]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

1 5000000
2 10000000
3 15000000
4 20000000
5 25000000
6 30000000
7 35000000
8 40000000
9 45000000
10 50000000
11 55000000
12 60000000
13 65000000
14 70000000
15 75000000
16 80000000
17 85000000
18 90000000
Done


In [12]:
pd.read_csv(output_file, nrows=5).head()

Unnamed: 0,display_id,document_id,timestamp,traffic_source,platform,day,hour,geo,ad_id,clicked,adsPerDisplay,clicksPerShows,addoc_id,dist,dist_cats,doc_norm_cats,ad_norm_cats
0,8,1330329,638,1.0,2.0,1,7,2765,95724,0,4,0.034785,1136820,0.537786,0.321621,0.133651,0.317691
1,8,1330329,638,1.0,2.0,1,7,2765,175694,0,4,0.202327,1392479,0.43021,0.025426,0.133651,0.131686
2,8,1330329,638,1.0,2.0,1,7,2765,280430,1,4,0.256853,1601512,0.24512,0.144678,0.133651,0.055399
3,8,1330329,638,1.0,2.0,1,7,2765,329774,0,4,0.052426,1652347,0.259652,0.158881,0.133651,0.085911
4,2657,1271490,185255,1.0,2.0,1,7,2765,70081,0,4,0.150145,933594,0.124739,0.139138,0.317707,0.179115


### Step 6

Can be run after feature_base_4_ents

In [None]:
page_addoc_dist = pd.read_csv("../generated/final/page_addoc_ents_dist.csv")
page_addoc_dist.head()

In [None]:
page_addoc_dist.count()

In [None]:
right_on_list = [(page_addoc_dist, ['document_id', 'addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_dist_cats_train.csv"   
output_file = "../generated/final/events_clicks_CTR_addocs_dist_ents_train.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 7

In [3]:
docs_time_doc = pd.read_csv("../generated/final/docs_publish_timestamp.csv")
docs_time_doc.count()

document_id          1976100
publish_timestamp    1976100
dtype: int64

In [4]:
docs_time_addoc = docs_time_doc.rename(columns={'document_id':'addoc_id'}).copy()
docs_time_addoc.count()

addoc_id             1976100
publish_timestamp    1976100
dtype: int64

In [5]:
right_on_list = [(docs_time_doc, ['document_id']), (docs_time_addoc, ['addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_dist_ents_train.csv"      
output_file = "../generated/final/events_clicks_CTR_addocs_dist_time_train.csv"   
rename_columns={'publish_timestamp_x':'doc_time', 'publish_timestamp_y':'addoc_time'}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [6]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

1 5000000
2 10000000
3 15000000
4 20000000
5 25000000
6 30000000
7 35000000
8 40000000
9 45000000
10 50000000
11 55000000
12 60000000
13 65000000
14 70000000
15 75000000
16 80000000
17 85000000
18 90000000
Done


In [7]:
pd.read_csv(output_file, nrows=5).head()

Unnamed: 0,display_id,document_id,timestamp,traffic_source,platform,day,hour,geo,ad_id,clicked,adsPerDisplay,clicksPerShows,addoc_id,dist,dist_cats,dist_ents,doc_time,addoc_time
0,8,1330329,638,1.0,2.0,1,7,2765,95724,0,4,0.034785,1136820,0.537786,1.254512,1.075178,-38602800000.0,-8661600000.0
1,8,1330329,638,1.0,2.0,1,7,2765,175694,0,4,0.202327,1392479,0.43021,0.098995,1.073816,-38602800000.0,-3045600000.0
2,8,1330329,638,1.0,2.0,1,7,2765,280430,1,4,0.256853,1601512,0.24512,1.304837,,-38602800000.0,
3,8,1330329,638,1.0,2.0,1,7,2765,329774,0,4,0.052426,1652347,0.259652,1.192209,0.702294,-38602800000.0,-626400000.0
4,2657,1271490,185255,1.0,2.0,1,7,2765,70081,0,4,0.150145,933594,0.124739,0.409647,,-5839200000.0,-32421600000.0


## Join testing

### Step 1

In [None]:
clicks_test = pd.read_csv("../download/clicks_test.csv")
clicks_test.count()

In [None]:
clicks_test.head()

In [None]:
ids = clicks_test.display_id

In [None]:
prev = 0
for nxt in ids:
    if nxt<prev:
        print nxt
        break
    prev = nxt    

In [None]:
right_on_list = [(clicks_test, 'display_id'), (adsPerDisplay, 'display_id')] 
input_file = events_filename
output_file = "../generated/final/events_clicks_test.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='inner'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 2

In [None]:
events_CTR_test = pd.read_csv("../generated/final/events_CTR_test.csv")
events_CTR_test.count()

In [None]:
right_on_list = [(events_CTR_test, ['display_id', 'ad_id'])] 
input_file = "../generated/final/events_clicks_test.csv"
output_file = "../generated/final/events_clicks_CTR_test.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='inner'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 3

In [None]:
ads = pd.read_csv("../download/promoted_content.csv", names=['ad_id', 'addoc_id', 'c', 'a'], header=0, usecols=['ad_id', 'addoc_id'])
ads.head()

In [None]:
right_on_list = [(ads, ['ad_id'])] 
input_file = "../generated/final/events_clicks_CTR_test.csv"
output_file = "../generated/final/events_clicks_CTR_addocs_test.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 4

Can be run after feature_base_4_topics

In [None]:
page_addoc_dist = pd.read_csv("../generated/final/page_addoc_topics_no_w_dist_test.csv")
page_addoc_dist.head()

In [None]:
page_addoc_dist.count()

In [None]:
right_on_list = [(page_addoc_dist, ['document_id', 'addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_test.csv"   
output_file = "../generated/final/events_clicks_CTR_addocs_dist_no_w_test.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

### Step 5

Can be run after feature_base_4_cats

In [None]:
page_addoc_dist = pd.read_csv("../generated/final/page_addoc_cats_dist_test.csv")
page_addoc_dist.head()

In [None]:
page_addoc_dist.count()

In [None]:
right_on_list = [(page_addoc_dist, ['document_id', 'addoc_id'])] 
input_file = "../generated/final/events_clicks_CTR_addocs_dist_no_w_test.csv"   
output_file = "../generated/final/events_clicks_CTR_addocs_dist_cats_test.csv"   
rename_columns={}
drop_columns=[]
chunksize=5 * 10**6
how='left'

In [None]:
merge_chunks(right_on_list, input_file, output_file, rename_columns, drop_columns, chunksize, how)

In [None]:
pd.read_csv(output_file, nrows=5).head()

## Validation splits

In [None]:
train_filename = "../generated/final/events_clicks_CTR_train.csv"

In [None]:
displays = pd.read_csv("../download/events.csv", usecols=['display_id'])
displays.count()

In [None]:
#kf = KFold(n_splits=5)
#train, test = kf.split(displays).next()
ss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_displays, test_displays = ss.split(displays).next()

print len(train_displays), len(test_displays)  #17340094 5780032

In [None]:
displays = displays.iloc[test_displays].copy()
displays['mark'] = [1]*len(displays)
displays.count()

In [None]:
train = pd.read_csv(train_filename, usecols=['display_id'])
train.count()

In [None]:
train_split = pd.merge(train, displays, on='display_id', how='left')
train_split.count()

In [None]:
train_index = train_split[train_split['mark'].isnull()].index
test_index = train_split[train_split['mark'].notnull()].index
len(train_index), len(test_index)

In [None]:
pd.DataFrame({'index':train_index}).to_csv("../generated/final/train_index.csv", index=False)
pd.DataFrame({'index':test_index}).to_csv("../generated/final/test_index.csv", index=False)

In [None]:
pd.read_csv("../generated/final/train_index.csv", nrows=10).head(10)