In [1]:
import wmfdata
# Obtain a Spark session from wmfdata using the "yarn-large" configuration name.
spark = wmfdata.spark.get_session("yarn-large")
# Handle on the session's SparkContext (not referenced again in the cells shown here).
sc=spark.sparkContext

import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import timedelta, date
import matplotlib.pyplot as plt
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
import pandas as pd

%matplotlib inline


# Base folder for this analysis: the parquet input and the JSON output paths
# used in the cells below are both built from it.
DESTINATION_FOLDER = "how_we_read_wikipedia_march/en"

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [2]:
# Load the aggregated sessions from parquet: one row per session, holding the
# ordered page loads (`session`) and the page previews (`previews`).
aggregated_sessions = spark.read.parquet("{}/aggregated_sessions_final.parquet".format(DESTINATION_FOLDER))
# Bare expression: displays the DataFrame schema as the cell output.
aggregated_sessions

DataFrame[access_method: string, country_code: string, session: array<struct<actual_destination:string,http_status:string,local_time:timestamp,page_id:bigint,page_title:string,prev_load:bigint,referer:string>>, timezone: string, user_identifier: string, previews: array<struct<http_status:string,local_time:timestamp,page_title:string,preview_title:string>>]

In [3]:
from urllib.parse import urlparse
from urllib.parse import unquote

# Hosts whose /wiki/ pages count as internal navigation in get_trees; a referer
# on any other host starts a new tree.
domains_wl = ['en.m.wikipedia.org', 'en.wikipedia.org']
# Title of the main page: loads of it are skipped by get_trees, and a page
# reached from it starts a new tree.
main_page = "Main_Page"

class Tree:
    """Navigation tree rooted at a single page load.

    The tree carries the session-level attributes it was created with
    (user identifier, access method, country code) and the referer of its
    root page load. Nodes are expected to expose ``page_load`` (a dict) and
    ``clicks`` (a list of child nodes).
    """

    # Deepest level expanded by visit_tree; this also bounds its recursion depth.
    MAX_DEPTH = 300

    def __init__(self, referer, root, user_identifier, access_method, country_code):
        """Store the root node and the attributes shared by the whole tree.

        Args:
            referer: Referer of the root page load, stored as given.
            root: Root node of the tree.
            user_identifier: Identifier of the session's user.
            access_method: Access method of the session.
            country_code: Country code of the session.
        """
        self.referer = referer
        self.user_identifier = user_identifier
        self.access_method = access_method
        # The root is kept in a one-element list; only index 0 is ever read.
        self.tree = [root]
        self.country_code = country_code

    def get_as_dict(self):
        """Return the tree and its attributes as a plain, nested dict.

        ``tree_size`` counts every node, while ``tree`` is truncated below
        MAX_DEPTH, so the two can disagree for extremely deep trees.
        """
        return {"user_identifier": self.user_identifier,
                "access_method": self.access_method,
                "country_code": self.country_code,
                "referer": self.referer,
                "tree_size": self.size(),
                "tree": self.visit_tree(self.tree[0], 0)}

    def size(self):
        """Return the total number of nodes in the tree, root included."""
        # The count does not depend on visit order, so a stack is used:
        # list.pop() is O(1), whereas draining a list with pop(0) shifts the
        # remaining elements on every call and is quadratic overall.
        pending = [self.tree[0]]
        count = 0
        while pending:
            node = pending.pop()
            count += 1
            pending.extend(node.clicks)
        return count

    def visit_tree(self, node, depth):
        """Recursively convert ``node`` and its descendants to nested dicts.

        Args:
            node: Node to convert.
            depth: Depth of ``node`` in the tree (0 for the root).

        Returns:
            dict: ``{"page": <page_load>}`` plus a ``"clicks"`` list when the
            node has children and ``depth`` is below MAX_DEPTH.
        """
        result = {"page": node.page_load}
        # Children of nodes sitting at MAX_DEPTH are not expanded.
        if depth < self.MAX_DEPTH:
            clicks = [self.visit_tree(child, depth + 1) for child in node.clicks]
            # The "clicks" key is omitted for leaves instead of storing [].
            if clicks:
                result["clicks"] = clicks
        return result

    def __repr__(self):
        return "{} --> {}".format(self.referer, self.tree[0].page_load.get("page_title"))

class Node:
    """One page load in a navigation tree, together with the loads clicked from it."""

    def __init__(self, pl):
        """Wrap the page-load dict ``pl``; the node starts with no children."""
        self.page_load = pl
        # Child nodes, in the order they were added.
        self.clicks = []
        # Titles of the children, kept for fast membership checks.
        self.clicks_titles = set()

    def add_click(self, pl):
        """Attach the node ``pl`` as a child and record its page title."""
        self.clicks.append(pl)
        child_title = pl.page_load.get("page_title")
        self.clicks_titles.add(child_title)

    def has_click(self, title):
        """Tell whether a child with page title ``title`` was already added."""
        already_added = title in self.clicks_titles
        return already_added


def get_trees(row):
    """Split one session into navigation trees.

    Page loads are scanned in session order. A load whose referer is not a
    ``/wiki/`` page on a whitelisted domain (or whose referer is the main
    page) becomes the root of a new tree. Any other load is attached as a
    child of the most recent node created for its referer page.

    Args:
        row: Session record exposing ``session`` (ordered page loads with
            ``actual_destination``, ``referer``, ``http_status``, ``page_id``,
            ``page_title`` and ``local_time``) plus ``user_identifier``,
            ``access_method`` and ``country_code``.

    Returns:
        list: The ``Tree`` objects of the session, in order of creation.
    """
    trees = []
    # Most recent node created for each page title.
    pointers_to_last = {}

    for pl_idx, pl in enumerate(row.session):
        # Title of the page that was served, taken from actual_destination.
        page_title = unquote(pl.actual_destination)

        # Parse the referer BEFORE unquoting it. Unquoting the whole URL first
        # turns an encoded "%3F" or "%23" inside a title (for example
        # "C%23_(programming_language)") into a real "?" or "#", and urlparse
        # then cuts the title short, attaching the click to the wrong page or
        # dropping it. A missing referer is handled as an external entry.
        parsed_referer_url = urlparse(pl.referer or "")
        referer_url = parsed_referer_url.netloc
        referer_path = unquote(parsed_referer_url.path)

        http_status = pl.http_status
        page_id = pl.page_id
        # Timestamp of the load, as seconds since the epoch.
        local_time = pl.local_time.timestamp()

        # Reload: same title and same referer as the immediately preceding
        # page load of the session. Skip it.
        if pl_idx > 0:
            prev = row.session[pl_idx - 1]
            if pl.page_title == prev.page_title and pl.referer == prev.referer:
                continue

        # Main page: it is not part of the navigation. A load coming from it
        # is treated as coming from outside, so that it spawns a new tree,
        # and loads of the main page itself are dropped.
        if referer_url in domains_wl and referer_path == "/wiki/{}".format(main_page):
            referer_url = ""
            referer_path = main_page
        if page_title == main_page:
            continue

        page_load = Node({
            "page_title": page_title,
            "page_id": page_id,
            "local_time": local_time,
            "http_status": http_status,
        })

        is_internal_click = referer_url in domains_wl and referer_path.startswith("/wiki/")
        if not is_internal_click:
            # External (or main page) referer: this load is the root of a new tree.
            session_tree = Tree(pl.referer, page_load, row.user_identifier,
                                row.access_method, row.country_code)
            trees.append(session_tree)
        else:
            # Internal click: find the last node created for the referer page.
            referer_page_title = referer_path[len("/wiki/"):]
            last_load_node = pointers_to_last.get(referer_page_title)
            if last_load_node is None or last_load_node.has_click(page_title):
                # The referer page was never seen in this session, or the same
                # link was already followed from it (a reload). The load is
                # dropped and the index keeps pointing at the first load.
                continue
            last_load_node.add_click(page_load)

        # Index the node so that later clicks from this page attach to it.
        pointers_to_last[page_title] = page_load

    return trees

# def get_trees_as_dict(row):
#     return [Row(user_identifier=t.user_identifier, 
#                 referer=t.referer, access_method=t.access_method, tree_size=t.size(),
#                page_id=t.tree[0].page_load['page_id']) for t in get_trees(row)]

def get_trees_as_dict(row):
    """Build the navigation trees of ``row`` and return them as plain dicts."""
    serialized = []
    for session_tree in get_trees(row):
        serialized.append(session_tree.get_as_dict())
    return serialized

In [4]:
# One dict per navigation tree: get_trees_as_dict returns a list of trees for
# each session row and flatMap flattens those lists into a single RDD.
trees = aggregated_sessions.rdd.flatMap(get_trees_as_dict)

In [5]:
import json
# Serialize every tree as a JSON string and write the strings as
# gzip-compressed text under DESTINATION_FOLDER.
# NOTE(review): `trees` is not cached, so this action and the later
# `trees.take(1)` presumably each recompute it -- confirm this is acceptable.
trees.map(lambda r: json.dumps(r))\
        .saveAsTextFile(path="{}/trees_sessions_final.json.gz".format(DESTINATION_FOLDER), 
                        compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

In [7]:
# Show a single tree to check the shape of the generated records.
trees.take(1)

[{'user_identifier': 'd0461388d387b1e715dcf6cad16fd2253ef3ced639a4d9e7702bb355b55d5614',
  'access_method': 'mobile web',
  'country_code': 'US',
  'referer': 'https://www.google.com/',
  'tree_size': 1,
  'tree': {'page': {'page_title': 'Pooh_Shiesty',
    'page_id': 66130325,
    'local_time': 1615663398.0,
    'http_status': '200'}}}]