# Outbrain click prediction data reduction

Our goal is to create a smaller dataset that can be easily processed with, for instance, pandas: for getting quick insights, getting familiar with the data, and rapid feature prototyping.

I downloaded the data files from https://www.kaggle.com/c/outbrain-click-prediction/data?select=page_views.csv.zip,

and uploaded them to our HDFS. The steps below were run using a PySpark kernel.

In [2]:
from pyspark.sql.types import IntegerType

# Load the raw events table (the first row of the file is the header) and
# cast its timestamp column to an integer type so it can be compared
# numerically in the filter applied later.
events_df = spark.read.option("header", True).csv("/user/ykarni/kaggle/events.csv")
events_df = events_df.withColumn("timestamp", events_df.timestamp.cast(IntegerType()))

In [40]:
# List the column names of the events table.
events_df.columns

['display_id', 'uuid', 'document_id', 'timestamp', 'platform', 'geo_location']

In [3]:
# Preview the first 5 rows of the events table.
events_df.show(5)

+----------+--------------+-----------+---------+--------+------------+
|display_id|          uuid|document_id|timestamp|platform|geo_location|
+----------+--------------+-----------+---------+--------+------------+
|         1|cb8c55702adb93|     379743|       61|       3|   US>SC>519|
|         2|79a85fa78311b9|    1794259|       81|       2|   US>CA>807|
|         3|822932ce3d8757|    1179111|      182|       2|   US>MI>505|
|         4|85281d0a49f7ac|    1777797|      234|       2|   US>WV>564|
|         5|8d0daef4bf5b56|     252458|      338|       2|       SG>00|
+----------+--------------+-----------+---------+--------+------------+
only showing top 5 rows



In [31]:
# Keep only the events whose timestamp is below 100,000,000. All other
# reduced tables in this notebook are derived from this subset.
# NOTE(review): the timestamp unit is not visible here (presumably
# milliseconds relative to the start of the dataset) - confirm against the
# Kaggle data description.
# The subset is cached because several later actions reuse it (the CSV write,
# the row count and the three distinct-id selections); without the cache each
# of them re-reads and re-filters the full events file.
reduced_events_df = events_df.filter('timestamp <  100000000').cache()

In [100]:
# Save the reduced events to HDFS as CSV, including a header row.
(
    reduced_events_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_events.csv")
)

In [32]:
# Number of events left after the timestamp filter.
reduced_events_df.count()

1856133

In [50]:
# Distinct display_id values of the reduced events; used further down to
# restrict clicks_train to the same displays.
reduced_events_display_ids_df = (
    reduced_events_df
    .select("display_id")
    .distinct()
)

In [69]:
# Number of distinct display_ids in the reduced events. In the recorded run
# this equals the reduced event row count, i.e. one display_id per event.
reduced_events_display_ids_df.count()

1856133

In [51]:
# Distinct document_id values of the reduced events; used further down to
# restrict the document-keyed tables (categories, promoted content, meta).
reduced_events_doc_ids_df = (
    reduced_events_df
    .select("document_id")
    .distinct()
)

In [70]:
# Number of distinct documents referenced by the reduced events.
reduced_events_doc_ids_df.count()

184824

In [74]:
# Distinct uuid values of the reduced events; used further down to restrict
# page_views to the same users.
reduced_events_uuids_df = (
    reduced_events_df
    .select("uuid")
    .distinct()
)

In [75]:
# Number of distinct uuids appearing in the reduced events.
reduced_events_uuids_df.count()

1749023

In [52]:
# Load the full documents_categories table (the first row is the header).
document_categories_df = (
    spark.read
    .option("header", True)
    .csv("/user/ykarni/kaggle/documents_categories.csv")
)

In [79]:
# Restrict the document categories to the documents that appear in the
# reduced events (inner join on document_id against the distinct id list).
reduced_document_categories_df = document_categories_df.join(
    reduced_events_doc_ids_df,
    on=["document_id"],
    how="inner",
)

In [80]:
# Number of category rows left after restricting to the reduced documents.
reduced_document_categories_df.count()

354347

In [101]:
# Save the reduced document categories to HDFS as CSV with a header row.
# The header option is set once, via .option(), exactly as for the other
# reduced tables written in this notebook.
(
    reduced_document_categories_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_document_categories.csv")
)

In [55]:
# List the column names of the documents_categories table.
document_categories_df.columns

['document_id', 'category_id', 'confidence_level']

In [56]:
# Number of distinct category_id values in the full (unreduced) table.
document_categories_df.select("category_id").distinct().count()

97

In [57]:
# Load the full promoted_content table (the first row is the header).
promoted_content_df = (
    spark.read
    .option("header", True)
    .csv("/user/ykarni/kaggle/promoted_content.csv")
)

In [81]:
# Keep the promoted-content rows whose document_id appears among the
# documents of the reduced events.
# NOTE(review): this matches promoted_content.document_id against the event
# document ids. If that column is the ad's target document rather than the
# page the ad was displayed on, ads referenced by the reduced clicks may be
# dropped here - confirm against the data description.
reduced_promoted_content_df = promoted_content_df.join(
    reduced_events_doc_ids_df,
    on=["document_id"],
    how="inner",
)

In [82]:
# Number of promoted-content rows left after the document_id join.
reduced_promoted_content_df.count()

37449

In [102]:
# Save the reduced promoted content to HDFS as CSV, including a header row.
(
    reduced_promoted_content_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_promoted_content.csv")
)

In [60]:
# List the column names of the promoted_content table.
promoted_content_df.columns

['ad_id', 'document_id', 'campaign_id', 'advertiser_id']

In [61]:
# Load the full documents_meta table (the first row is the header).
documents_meta_df = (
    spark.read
    .option("header", True)
    .csv("/user/ykarni/kaggle/documents_meta.csv")
)

In [62]:
# List the column names of the documents_meta table.
documents_meta_df.columns

['document_id', 'source_id', 'publisher_id', 'publish_time']

In [86]:
# Restrict the document metadata to the documents that appear in the
# reduced events (inner join on document_id against the distinct id list).
reduced_documents_meta_df = documents_meta_df.join(
    reduced_events_doc_ids_df,
    on=["document_id"],
    how="inner",
)

In [87]:
# Row count of the full documents_meta table, for comparison with the
# reduced table counted in the next cell.
documents_meta_df.count()

2999334

In [88]:
# Row count of documents_meta after restricting to the reduced documents.
# In the recorded run this equals the number of distinct document_ids in
# the reduced events.
reduced_documents_meta_df.count()

184824

In [103]:
# Save the reduced document metadata to HDFS as CSV, including a header row.
(
    reduced_documents_meta_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_documents_meta.csv")
)

In [89]:
# Load the full clicks_train table (the first row is the header).
clicks_train_df = (
    spark.read
    .option("header", True)
    .csv("/user/ykarni/kaggle/clicks_train.csv")
)

In [90]:
# List the column names of the clicks_train table.
clicks_train_df.columns

['display_id', 'ad_id', 'clicked']

In [91]:
# Row count of the full clicks_train table, for comparison with the reduced
# table built below.
clicks_train_df.count()

87141731

In [92]:
# Restrict the training clicks to the displays that appear in the reduced
# events (inner join on display_id against the distinct id list).
reduced_clicks_train_df = clicks_train_df.join(
    reduced_events_display_ids_df,
    on=["display_id"],
    how="inner",
)

In [93]:
# Number of click rows left after restricting to the reduced displays.
reduced_clicks_train_df.count()

8159306

In [104]:
# Save the reduced training clicks to HDFS as CSV, including a header row.
(
    reduced_clicks_train_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_clicks_train.csv")
)

In [94]:
# Load the full page_views table (the first row is the header) and cast its
# timestamp column to an integer type, mirroring how the events table is
# loaded.
page_views_df = spark.read.option("header", True).csv("/user/ykarni/kaggle/page_views.csv")
page_views_df = page_views_df.withColumn("timestamp", page_views_df.timestamp.cast(IntegerType()))

In [95]:
# List the column names of the page_views table.
page_views_df.columns

['uuid',
 'document_id',
 'timestamp',
 'platform',
 'geo_location',
 'traffic_source']

In [96]:
# Preview the first 5 rows of the page_views table.
page_views_df.show(5)

+--------------+-----------+---------+--------+------------+--------------+
|          uuid|document_id|timestamp|platform|geo_location|traffic_source|
+--------------+-----------+---------+--------+------------+--------------+
|1fd5f051fba643|        120| 31905835|       1|          RS|             2|
|8557aa9004be3b|        120| 32053104|       1|       VN>44|             2|
|c351b277a358f0|        120| 54013023|       1|       KR>12|             1|
|8205775c5387f9|        120| 44196592|       1|       IN>16|             2|
|9cb0ccd8458371|        120| 65817371|       1|   US>CA>807|             2|
+--------------+-----------+---------+--------+------------+--------------+
only showing top 5 rows



In [97]:
# Keep every page view made by a user (uuid) that appears in the reduced
# events. The page views themselves are not filtered by timestamp, so all
# views by those users are retained.
reduced_page_views_df = page_views_df.join(
    reduced_events_uuids_df,
    on=["uuid"],
    how="inner",
)

In [106]:
# Save the reduced page views to HDFS as CSV, including a header row.
(
    reduced_page_views_df.write
    .option("header", True)
    .csv("/user/ykarni/kaggle/reduced_page_views.csv")
)

In [98]:
# Number of page-view rows left after restricting to the reduced users.
reduced_page_views_df.count()

21489125