In [1]:
# Imports
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [2]:
# Toggle based on hardware: set to True on a low-memory laptop, otherwise False.
# When True, the training clicks are limited to their first 1001 rows when loaded.
poor = True

(It was observed earlier that the topics, categories and entities of some documents are not known.)

In [3]:
# Per-document metadata. document_id is parsed as int; the remaining columns
# (source_id, publisher_id, publish_time) are kept as plain objects.
documents_meta = pd.read_csv(
    './data/documents_meta.csv',
    dtype={
        "document_id": int,
        "source_id": object,
        "publisher_id": object,
        "publish_time": object
    }
)

Source Id, Publisher Id and Publish time of all documents <b>are</b> known

In [4]:
#Init
# Events: one row per display_id; joined onto both click tables below.
events_df = pd.read_csv('./data/events.csv',  dtype={"display_id": int, "uuid": str, "document_id": int, "timestamp": int, "platform": str, "geo_location": str})

# On low-memory hardware parse only the first 1001 training rows instead of
# loading the whole file and truncating it afterwards (1002 is a different display id).
# nrows=None is read_csv's default and reads the complete file.
clicks_train = pd.read_csv('./data/clicks_train.csv',
                           dtype={"display_id": int, "ad_id": int, "clicked": int},
                           nrows=1001 if poor else None)
clicks_test = pd.read_csv('./data/clicks_test.csv',  dtype={"display_id": int, "ad_id": int})

# Inner joins: click rows without a matching event are dropped.
clicks_train = pd.merge(clicks_train, events_df, on='display_id')
clicks_test  = pd.merge(clicks_test, events_df, on='display_id')

del events_df  # save memory

## Add target document id info from promoted content
# Rename the events' document_id to source_document_id so that it does not
# collide with promoted_content's document_id in the merge on ad_id below.
# (The index is left untouched: the following merge builds a fresh one anyway.)
clicks_train = clicks_train.rename(columns={"document_id": "source_document_id"})
clicks_test = clicks_test.rename(columns={"document_id": "source_document_id"})
promoted_content = pd.read_csv('./data/promoted_content.csv',  dtype={"ad_id": int, "document_id": int, "campaign_id": object, "advertiser_id": object})
clicks_train = pd.merge(clicks_train, promoted_content, on='ad_id')
clicks_test = pd.merge(clicks_test, promoted_content, on='ad_id')
del promoted_content


In [5]:
#Lets not worry about these for now
def cleanup(clicks):
    """Remove, in place, the columns that are not used as features for now.

    Mutates ``clicks`` and returns None. The columns are dropped one at a
    time, so a missing column raises KeyError after the preceding ones have
    already been removed.
    """
    unused_columns = (
        'geo_location',
        'platform',
        'timestamp',
        'campaign_id',
        'advertiser_id',
        'uuid',
    )
    for column in unused_columns:
        clicks.drop(column, axis=1, inplace=True)

In [6]:
# Strip the unused columns from both frames (test first, then train).
cleanup(clicks_test)
cleanup(clicks_train)
# Only the training frame also loses its identifier columns; the test frame
# keeps display_id and ad_id.
clicks_train.drop(['ad_id', 'display_id'], axis=1, inplace=True)

Bringing in the features analysed earlier in FeatureAnalysis.ipynb...

In [7]:
def confidence_max(topics):
    """Keep, for every document, the single row with the highest confidence.

    Parameters
    ----------
    topics : DataFrame
        Must contain ``document_id`` and ``confidence_level`` columns plus the
        attribute column(s) to keep.

    Returns
    -------
    DataFrame
        One row per ``document_id`` (the first row, in input order, that
        reaches the document's maximum ``confidence_level``) without the
        ``confidence_level`` column. The result index holds the row positions
        of the kept rows in ``topics``. The input frame is not modified.
    """
    # Positional index, matching what the earlier merge-based version produced.
    frame = topics.reset_index(drop=True)
    # Broadcast each document's maximum confidence back onto its rows; this
    # replaces the former groupby + merge round trip.
    best = frame.groupby(['document_id'], sort=False)['confidence_level'].transform('max')
    winners = frame[frame['confidence_level'] == best]
    # Non-inplace drop: dropping in place on a filtered slice triggers
    # SettingWithCopyWarning.
    winners = winners.drop('confidence_level', axis=1)
    # Ties on the maximum are resolved by keeping the first occurrence.
    return winners.drop_duplicates(subset=['document_id'])
def most_frequent(topics, groupbykey, topn):
    """Keep only the rows whose ``groupbykey`` value is among the ``topn`` most frequent.

    Frequency is the number of non-null ``document_id`` entries per distinct
    ``groupbykey`` value. Returns the filtered rows of ``topics``.
    """
    documents_per_key = topics.groupby(groupbykey, sort=False)['document_id'].count()
    ranked = documents_per_key.sort_values(ascending=False)
    kept_keys = ranked.head(topn).index
    is_kept = topics[groupbykey].isin(kept_keys)
    return topics[is_kept]
def convert_to_dummies(topics, key):
    """Replace the ``key`` column with one indicator column per distinct value.

    Returns a new frame holding the columns of ``topics`` (minus ``key``)
    followed by the ``pd.get_dummies`` columns; ``topics`` itself is left
    unchanged.
    """
    indicators = pd.get_dummies(topics[key])
    widened = pd.concat([topics, indicators], axis=1, join='inner')
    # Disabled alternative kept from the original:
    #widened = widened.groupby(by='document_id', sort=False).agg(sum).reset_index() #Combine confidence level in one row
    return widened.drop(key, axis=1)
def featurize(topics, key, topn):
    """Build sparse indicator features for the ``topn`` most frequent ``key`` values.

    Pipeline: keep each document's highest-confidence row, restrict to the
    ``topn`` most frequent ``key`` values, one-hot encode ``key`` and convert
    the result to a sparse frame filled with 0.
    """
    best_rows = confidence_max(topics)
    frequent_rows = most_frequent(best_rows, key, topn)
    indicator_frame = convert_to_dummies(frequent_rows, key)
    return indicator_frame.to_sparse(fill_value=0)

In [8]:
categories = pd.read_csv('./data/documents_categories.csv',  dtype={"document_id": int, "category_id": int, "confidence_level": float})

# Create category dummies (indicator columns for the 5 most frequent categories)
categories = featurize(categories, 'category_id', 5)

# Hydrate source document categories.
# Both sides carry a document_id column, so the merge yields document_id_x
# (the clicks frame's own document_id) and document_id_y (the categories key);
# the latter is dropped and the former gets its original name back.
clicks_train = (
    pd.merge(clicks_train, categories, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)
clicks_test = (
    pd.merge(clicks_test, categories, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)

# Hydrate destination document categories.
# The dummy columns now exist on both sides, so pandas appends its default
# _x (source) / _y (destination) suffixes to them.
clicks_train = pd.merge(clicks_train, categories, how='left', left_on='document_id', right_on='document_id')
clicks_test = pd.merge(clicks_test, categories, how='left', left_on='document_id', right_on='document_id')

# NaN treated as not belonging to any category (unknown category)
clicks_train = clicks_train.fillna(0)
clicks_test = clicks_test.fillna(0)

del categories

In [9]:
entities = pd.read_csv('./data/documents_entities.csv',  dtype={"document_id": int, "entity_id": object, "confidence_level": float})

# Create entity dummies (indicator columns for the 5 most frequent entities)
entities = featurize(entities, 'entity_id', 5)

# Hydrate source document entities.
# The merge yields document_id_x (the clicks frame's own document_id) and
# document_id_y (the entities key); the latter is dropped and the former gets
# its original name back.
clicks_train = (
    pd.merge(clicks_train, entities, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)
clicks_test = (
    pd.merge(clicks_test, entities, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)

# Hydrate destination document entities.
# The dummy columns now exist on both sides, so pandas appends its default
# _x (source) / _y (destination) suffixes to them.
clicks_train = pd.merge(clicks_train, entities, how='left', left_on='document_id', right_on='document_id')
clicks_test = pd.merge(clicks_test, entities, how='left', left_on='document_id', right_on='document_id')

# NaN treated as not belonging to any entity (unknown entity)
clicks_train = clicks_train.fillna(0)
clicks_test = clicks_test.fillna(0)

del entities

In [None]:
topics = pd.read_csv('./data/documents_topics.csv',  dtype={"document_id": int, "topic_id": int, "confidence_level": float})

# Create topic dummies (indicator columns for the 5 most frequent topics)
topics = featurize(topics, 'topic_id', 5)

# Hydrate source document topics.
# The merge yields document_id_x (the clicks frame's own document_id) and
# document_id_y (the topics key); the latter is dropped and the former gets
# its original name back.
clicks_train = (
    pd.merge(clicks_train, topics, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)
clicks_test = (
    pd.merge(clicks_test, topics, how='left', left_on='source_document_id', right_on='document_id')
    .drop('document_id_y', axis=1)
    .rename(columns={'document_id_x': 'document_id'})
)

# Hydrate destination document topics.
# The dummy columns now exist on both sides, so pandas appends its default
# _x (source) / _y (destination) suffixes to them.
clicks_train = pd.merge(clicks_train, topics, how='left', left_on='document_id', right_on='document_id')
clicks_test = pd.merge(clicks_test, topics, how='left', left_on='document_id', right_on='document_id')

# NaN treated as not belonging to any topic (unknown topic)
clicks_train = clicks_train.fillna(0)
clicks_test = clicks_test.fillna(0)

del topics

In [21]:
# The document ids were only needed as join keys for the feature merges;
# remove them from both frames before modelling.
clicks_train.drop(['source_document_id', 'document_id'], axis=1, inplace=True)
clicks_test.drop(['source_document_id', 'document_id'], axis=1, inplace=True)

In [24]:
from sklearn import tree

# Split off the target: pop() returns the 'clicked' column and removes it from
# clicks_train in place, leaving only feature columns behind.
Y = clicks_train.pop('clicked')

model = tree.DecisionTreeClassifier(criterion='gini')

model.fit(clicks_train, Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [25]:
# Mean accuracy on the very data the model was fitted on (training accuracy),
# not an estimate on held-out data.
model.score(clicks_train, Y)

0.81518481518481523

In [None]:
# Preview the test-set columns other than the display_id / ad_id identifiers.
# Note: Index.difference returns the labels sorted when they are sortable, so
# the column order shown here can differ from the frame's own column order.
clicks_test[clicks_test.columns.difference(['display_id', 'ad_id'])].head()

In [None]:
# Score every test row. predict_proba needs the feature *data*, not the Index
# of column names, so select the columns from the frame first. Using
# clicks_train.columns (which holds only features once 'clicked' has been
# removed for fitting) guarantees the same features in the same order as at fit time.
# [:, 1] picks the probability of the second entry of model.classes_.
# The result is stored as 'probs', the column name the ranking cell sorts on.
clicks_test['probs'] = model.predict_proba(clicks_test[clicks_train.columns])[:, 1]

In [None]:
# Build the submission: one row per display_id holding its ad_ids as a
# space-separated string, ordered from highest to lowest predicted probability.
# Only ad_id is aggregated; joining every column would also write all feature
# columns into the file as long strings. clicks_test itself is left untouched
# so the cell can be re-run.
submission = (
    clicks_test
    .sort_values(['display_id', 'probs'], ascending=[True, False])
    .groupby(by='display_id', sort=False)['ad_id']
    .apply(lambda ads: ' '.join([str(ad) for ad in ads]))
    .reset_index()
)
# reset_index() turns display_id into a regular column, so the header row is
# "display_id,ad_id" and no separate index column is needed.
submission.to_csv('submission.csv', index=False)

In [None]:
 = predicted  # FIXME(review): incomplete assignment (no target on the left) - this line is a SyntaxError as written

# NOTE(review): the probabilities are computed on clicks_test but assigned to
# org_train below - confirm which frame was intended.
probs = model.predict_proba(clicks_test)[:,1]

# NOTE(review): org_train is not defined in any cell shown here; presumably a
# copy of the training frame that still has display_id, ad_id and clicked - TODO confirm.
org_train['probs'] = probs

# Order the rows by display, with the highest predicted probability first.
org_train.sort_values(['display_id', 'probs'], inplace=True, ascending=[True, False] )

# ad_ids of the clicked rows, reshaped into a single-column 2-D array.
Y_ads = org_train[ org_train.clicked == 1 ].ad_id.values.reshape(-1,1)

# One array of ad_ids per display_id, in the sorted (ranked) order from above.
P_ads = org_train.groupby(by='display_id', sort=False).ad_id.apply( lambda x: x.values ).values

from ml_metrics import mapk

# Presumably mean average precision with a cutoff of 12 - verify against ml_metrics.
score = mapk( Y_ads, P_ads, 12 )

In [None]:
# Print `score` (assigned from mapk(..., 12) earlier in the notebook) with 12 decimal places.
print("MAP: %.12f" % score)

In [None]:
# First row of every display_id group.
# NOTE(review): this is the top-ranked ad only if org_train is still ordered by
# descending 'probs' within each display_id - confirm against the cell that sorts it.
result = org_train.groupby(['display_id']).first()

# TP: displays whose first row was clicked; FP: all remaining displays.
TP = int((result['clicked'] == 1).sum())

FP = len(result) - TP

# Avoid a ZeroDivisionError when there are no displays at all.
precision = TP / float(TP + FP) if (TP + FP) else float('nan')

# print() call form, consistent with the MAP cell and valid on Python 2 and 3.
print("Simple Precision = %.2f" % precision)