In [1]:
# findspark.init() is deliberately called before the pyspark import below;
# presumably it locates the local Spark install and makes pyspark importable
# (TODO confirm) -- keep this ordering.
import findspark
findspark.init()
import pandas as pd
from pyspark import SparkConf, SparkContext
import os

# GCS bucket and month-level prefix used to build the click-log download paths.
src_bucket = 'digital-trend-raw-data'
medium_path = 'snowplow/2018-10'

# Default Spark configuration; getOrCreate reuses an already-running context,
# so re-executing this cell does not try to start a second one.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
def split_url(url):
    """Return the part of ``url`` that follows the first '.com'.

    Args:
        url: URL string, e.g. 'https://www.example.com/section/slug'.

    Returns:
        Everything after the first '.com' (may be an empty string when the
        URL ends right after the domain), or a single space ' ' when the URL
        contains no '.com' at all.
    """
    # Split on the FIRST '.com' only: a slug that itself contains '.com'
    # (e.g. '/deals/amazon.com-sale') must not make the whole URL look invalid.
    _domain, marker, path = url.partition('.com')
    return path if marker else ' '
    
def to_list(x):
    """Materialize any iterable ``x`` into a new list, preserving order."""
    return list(x)

In [7]:
# read posts table and reformat url
if not os.path.isfile('./wordpress_combine.csv'):
    !gsutil -m cp gs://digital-trend-data-unzipped/wordpress/wordpress_combine.csv ./wordpress_combine.csv
posts_df = sc.textFile('./wordpress_combine.csv')
posts = posts_df.map(lambda line: line.split(',')).map(lambda info: (split_url(info[4]) + '/', info[0]))
posts_broadcast = sc.broadcast(posts.collectAsMap())
posts.take(5)

[(' /', '0'),
 ('/travel/best-travel-clothes-brands/', '1002075'),
 ('/celular/como-grabar-pantalla-celular/', '56635'),
 ('/computing/ssd-vs-hdd/', '396726'),
 ('/movies/avengers-endgame-trailer-release-date-news/', '2199706')]

In [15]:
N1 = [0]  # range 0 - 3
N2 = [3]    # range 0 - 11
user_clicklist_combine = sc.parallelize('')
for idx1 in N1:
    for idx2 in N2:
        file_name = format(idx1, '04d') + '_part_' + format(idx2, '02d') + '.gz'
        if not os.path.isfile('./' + file_name):
            !gsutil -m cp gs://{src_bucket}/{medium_path}/{file_name} ./{file_name}
        # read file into rdd
        # extract useful info
        # gs://digital-trend-raw-data/snowplow/2019-03/0000_part_00
        click_df = sc.textFile(file_name)
        clicks = click_df.map(lambda line: line.split('","')).map(lambda info: (info[15], info[18], info[35], info[3]))
        print(clicks.take(10))
 
        # match user clicks url with post id
        clicks_with_id = clicks.map(lambda info: (info[0] + '|' + info[1], posts_broadcast.value.get(info[2]), info[3]))
        print(clicks_with_id.take(10))
        
        # filter user clicks
        user_clicks = clicks_with_id.filter(lambda info: info[0] is not None and info[1] is not None and info[2] is not None and info[0] is not '' and info[1] is not '' and info[2] is not '')
        print(user_clicks.take(10))
        
        # group user clicks
        # user_clicklist = user_clicks.distinct().groupByKey().mapValues(lambda x: to_list(x))
        # print(user_clicklist.count())
        
        # combine with old one
        user_clicklist_combine = user_clicklist_combine.union(user_clicks)
        print(user_clicklist_combine.take(10))
        
        

[('XkhLEspKjz6MBjnqdpq7GMCydmtqLd-xD-db', 'US', '/home/little-girl-wants-alexa-to-play-baby-shark/', '2018-10-30 12:04:00'), ('HD3nALuG3lJgxw06hzjyu-U8xJUjTmAnfTWc', 'PK', '/mobile/how-to-record-the-screen-on-an-android-device/', '2018-10-30 12:04:01'), ('f5e0df66-f262-46cd-9454-eaa6aad97930', 'US', '/photography/best-space-photos/', '2018-10-30 12:04:01'), ('a1bb2623-29ad-4d53-9bd5-86568811d879', 'US', '/mobile/how-to-buy-apple-watch-series-4/', '2018-10-30 12:04:01'), ('507cc8d7-9c8f-4882-bb7d-c0e71835de1f', 'ES', '/inteligente/amazon-echo-y-echo-plus/', '2018-10-30 12:04:01'), ('ba902b4e-d5b5-4002-a366-9ec0beb5e958', 'PR', '/celular/noticias-oneplus-6t/', '2018-10-30 12:04:01'), ('47123b5d-96ad-4c65-af4b-b29e6a8a96b7', 'GB', '/mobile/iphone-xr-more-powerful-than-you-think/', '2018-10-30 12:04:01'), ('626b1812-a7ca-4d25-873b-5f47c134faa1', 'US', '/home/best-coffee-makers/', '2018-10-30 12:04:01'), ('63b45117-eada-4ce2-9b72-413963b9bf6a', 'IN', '/wearables/best-fitness-trackers/', '20

In [18]:
# Deduplicate the combined click records and keep only those with a post id.
# The result is cached because every action on it (take and count here, and any
# later save) would otherwise re-read the input and redo the distinct() shuffle.
user_clicklist_output = (
    user_clicklist_combine
    .distinct()
    .filter(lambda info: info[1] is not None)
    .cache()
)
print(user_clicklist_output.take(3))
print(user_clicklist_output.count())

[('507cc8d7-9c8f-4882-bb7d-c0e71835de1f|ES', '74649', '2018-10-30 12:04:01'), ('7db35ae5-237c-4b15-a090-7b318a3018d6|US', '108446', '2018-10-30 12:04:03'), ('eb50e531-a1a3-4157-aaa9-bba6b2b80ded|EE', '2224743', '2018-10-30 12:04:03')]
1039284


In [22]:
# Write the deduplicated click records as text under the relative path 'output'.
# NOTE(review): Spark normally refuses to write to a path that already exists,
# so re-running this cell likely fails until 'output' is removed -- confirm.
user_clicklist_output.saveAsTextFile('output')