In [1]:
# Imports
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import gc
%matplotlib inline

In [7]:
# Toggle based on hardware - if low memory laptop set to True else False
poor = True

(It was earlier observed that Topics of some documents are not known, Cateogires of some documents are not known, Entities of some documents are not known
)

In [8]:
documents_meta = pd.read_csv('./data/documents_meta.csv',  dtype={"document_id": int, "source_id": object, "publisher_id": object, "publish_time": object})

Source Id, Publisher Id and Publish time of all documents <b>are</b> known

In [9]:
#Init
def add_documents_ids(clicks):
    events_df = pd.read_csv('./data/events.csv',  dtype={"display_id": int, "uuid": str, "document_id": int, "timestamp": int, "platform": str, "geo_location": str})
    events_df = events_df[events_df.platform != '\\N']
    events_df.platform = events_df.platform.astype(int)
    clicks    = pd.merge(clicks, events_df, on='display_id')
    del(events_df) # save memory
    ## Add target doucment id info from promoted content
    clicks = clicks.rename(index=str, columns={"document_id": "source_document_id"})
    promoted_content = pd.read_csv('./data/promoted_content.csv',  dtype={"ad_id": int, "document_id": int, "campaign_id": object, "advertiser_id": object})
    clicks = pd.merge(clicks, promoted_content, on='ad_id')
    del(promoted_content)
    return cleanup(clicks)

def add_reg_ctr(clicks):
    reg_ctr = pd.read_csv('./reg_ctr.csv',dtype={"ad_id":int, "reg_ctr":float})
    clicks = pd.merge(clicks, reg_ctr, how = 'left', on = 'ad_id')
    clicks['reg_ctr'].fillna(0, inplace=True)
    del(reg_ctr)
    return clicks

def add_display_size(clicks):
    clicks['display_size'] = clicks.groupby(['display_id'], sort=False)['ad_id'].transform('count')
    return clicks

    #Lets not worry about these for now
def cleanup(clicks):
    clicks.drop('geo_location', axis=1, inplace=True)
    clicks.drop('timestamp', axis=1, inplace=True)
    clicks.drop('campaign_id', axis=1, inplace=True)
    clicks.drop('advertiser_id', axis=1, inplace=True)
    clicks.drop('uuid', axis=1, inplace=True)
    return clicks

Bringing in the features analysed in FeatureAnalysis.pynb earlier...

In [5]:
def confidence_max(topics):
    topics_grouped = topics.groupby(['document_id'], sort=False).agg({'confidence_level':'max'})
    topics_grouped = topics_grouped.reset_index()
    topics_grouped = topics_grouped.rename(columns={'confidence_level':'confidence_max'})
    topics = pd.merge(topics, topics_grouped, how='left', on=['document_id'])
    del(topics_grouped)
    topics = topics[topics['confidence_level'] == topics['confidence_max']]
    topics.drop('confidence_level', axis=1, inplace=True)
    topics.drop('confidence_max', axis=1, inplace=True)
    topics = topics.drop_duplicates(subset=['document_id'])
    return topics
def most_frequent(topics, groupbykey, topn):
    top_topics = topics.groupby(groupbykey, sort=False)['document_id'].count().sort_values(ascending=False).head(topn).index
    return topics[topics[groupbykey].isin(top_topics)]
def convert_to_dummies(topics, key):
    topics = pd.concat([topics, pd.get_dummies(topics[key])], axis=1, join='inner')
    topics.drop(key, axis=1, inplace=True)
    #topics = topics.groupby(by='document_id', sort=False).agg(sum).reset_index() #Combine confidence level in one row
    return topics
def featurize_document_meta(clicks, topics, key, topn):
    topics = convert_to_dummies(most_frequent(confidence_max(topics), key, topn), key).to_sparse(fill_value=0);
    # Hydrate source document categories/topics/entities
    clicks = pd.merge(clicks, topics, how = 'left', left_on = 'source_document_id', right_on = 'document_id')
    clicks.drop('document_id_y', axis=1, inplace=True)
    clicks.rename(columns={'document_id_x':'document_id'}, inplace=True)
    # Hydrate destination document categories/topics/entities
    clicks = pd.merge(clicks, topics, how = 'left', left_on = 'document_id', right_on = 'document_id')
    clicks.fillna(0, inplace=True) #NaN treated as not belonging to any Category (unknown category)
    return clicks  
    
def featurize(clicks):
    categories = pd.read_csv('./data/documents_categories.csv',  dtype={"document_id": int, "category_id": int, "confidence_level": float})
    #Create Category dummies
    clicks = featurize_document_meta(clicks, categories, 'category_id', 5)
    del(categories)
    #Create Entity dummies
    entities = pd.read_csv('./data/documents_entities.csv',  dtype={"document_id": int, "entity_id": object, "confidence_level": float})
    clicks = featurize_document_meta(clicks, entities, 'entity_id', 5)
    del(entities)
    #Create Topics dummies
    topics = pd.read_csv('./data/documents_topics.csv',  dtype={"document_id": int, "topic_id": int, "confidence_level": float})
    clicks = featurize_document_meta(clicks, topics, 'topic_id', 5)
    del(topics)
    clicks.drop('source_document_id', axis=1, inplace=True)
    clicks.drop('document_id', axis=1, inplace=True)
    return clicks
    

In [10]:
clicks_train = pd.read_csv('./data/clicks_train.csv',  dtype={"display_id": int, "ad_id": int, "clicked": int})
if poor:
    clicks_train = clicks_train.head(1001) #1002 is a different display id.
clicks_train = add_documents_ids(clicks_train)

clicks_train = add_reg_ctr(clicks_train)
clicks_train = add_display_size(clicks_train)

#clicks_train.drop('ad_id', axis=1, inplace=True)
#clicks_train.drop('display_id', axis=1, inplace=True)

#clicks_train = featurize(clicks_train)
clicks_train = clicks_train.to_sparse(fill_value=0)


In [11]:
clicks_train.drop('ad_id', axis=1, inplace=True)
clicks_train.drop('display_id', axis=1, inplace=True)
clicks_train.drop('source_document_id', axis=1, inplace=True)
clicks_train.drop('document_id', axis=1, inplace=True)

In [12]:
gc.collect()

259

In [9]:
#from sklearn import tree
#model = tree.DecisionTreeClassifier(criterion='gini')

In [13]:
from sklearn.ensemble import RandomForestClassifier
model= RandomForestClassifier()

In [14]:
X = clicks_train[clicks_train.columns.difference(['clicked']).values]
Y = clicks_train['clicked'].to_dense()
#model.fit(X, Y)

In [None]:
#model.get_params()

In [15]:
from sklearn.model_selection import GridSearchCV
param_grid = {   'n_estimators': [3], #[100, 1000], 
                 'min_samples_leaf': [0.20],
                 'max_features' : [None],
                 'criterion': ['gini', 'entropy'],
                  'n_jobs' : [4]
             }
grid_clf = GridSearchCV(model, param_grid, cv=2)

In [16]:
grid_clf.fit(X, Y)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [3], 'max_features': [None], 'n_jobs': [4], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
grid_clf.best_params_

{'criterion': 'gini',
 'max_features': None,
 'min_samples_leaf': 0.2,
 'n_estimators': 3,
 'n_jobs': 4}

In [18]:
grid_clf.best_score_

0.80619380619380621

In [19]:
# Choose the best model that GridSearch found
model = grid_clf.best_estimator_

In [20]:
del(clicks_train)
gc.collect()

905

In [21]:
clicks_test = pd.read_csv('./data/clicks_test.csv',  dtype={"display_id": int, "ad_id": int})

In [22]:
clicks_test = add_documents_ids(clicks_test)
clicks_test = add_reg_ctr(clicks_test)
clicks_test = add_display_size(clicks_test).to_sparse(fill_value=0)
#clicks_test = featurize(clicks_test).to_sparse(fill_value=0)

In [25]:
clicks_test.head(2)

Unnamed: 0,display_id,ad_id,platform,reg_ctr,display_size
0,16874594,66758,3,0.065994,6
1,16874737,66758,3,0.065994,9


In [24]:
clicks_test.drop('source_document_id', axis=1, inplace=True)
clicks_test.drop('document_id', axis=1, inplace=True)

In [26]:
clicks_test['prob'] = model.predict_proba(clicks_test[clicks_test.columns.difference(['display_id', 'ad_id']).values])[:,1]

In [27]:
clicks_test = clicks_test[['display_id','prob','ad_id']]

In [28]:
clicks_test.sort_values(['display_id', 'prob'], inplace=True, ascending=[True, False])
clicks_test.drop('prob', axis=1, inplace=True)

In [None]:
#clicks_test = clicks_test.groupby(by='display_id', sort=False).aggregate(lambda x: ' '.join([str(ff) for ff in x]))

In [66]:
def f(df):
         keys,values=df.values.T
         ukeys,index=np.unique(keys,True)
         arrays=np.split(values,index[1:])
         df2=pd.DataFrame({'display_id':ukeys,'ad_id':[' '.join([str(ff) for ff in a]) for a in arrays]})
         return df2
clicks_test = f(clicks_test)[['display_id','ad_id']]

In [67]:
clicks_test.to_csv('submission_rf.csv', index = True)

In [None]:
 = predicted

probs = model.predict_proba(clicks_test)[:,1]

org_train['probs'] = probs

org_train.sort_values(['display_id', 'probs'], inplace=True, ascending=[True, False] )

Y_ads = org_train[ org_train.clicked == 1 ].ad_id.values.reshape(-1,1)

P_ads = org_train.groupby(by='display_id', sort=False).ad_id.apply( lambda x: x.values ).values

from ml_metrics import mapk

score = mapk( Y_ads, P_ads, 12 )

In [None]:
print("MAP: %.12f" % score)

In [None]:
result = org_train.groupby(['display_id']).first()

TP = len(result[result['clicked'] == 1])

FP = len(result[result['clicked'] != 1])

print "Simple Precision = %.2f"%(TP / float(TP + FP))