# Purpose

This notebook checks, for each game, that the number of comments extracted from all pages of the game's XML matches the total comment count reported on the first page of that game's XML.

In [1]:
from boardgamegeek import BGGClient
import requests
from bs4 import BeautifulSoup
import re
from proxy_requests.proxy_requests import ProxyRequests
from retry import retry
import time
import os
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import random
import pickle

from pyspark import SparkContext
from lxml import etree

In [2]:
#Helper that builds the file:// URI handed to the Spark context. If more shared helpers
#like this one accumulate, they will be wrapped up into a module.
def local_path(path):
    """Return a ``file://`` URI for ``path``, resolved against the current working directory.

    ``path`` is a string relative to the current directory (glob patterns are
    passed through untouched); it is appended after the absolute directory
    with a single ``/`` separator.
    """
    base_dir = os.path.abspath(os.curdir)
    # str.join keeps the original's strictness: a non-string ``path`` raises TypeError.
    return ''.join(['file://', base_dir, '/', path])

# Spark context in local mode ('local[*]'), registered under the app name 'temp'.
sc = SparkContext(master='local[*]', appName='temp')

# One record per file under Data/<dir>/: downstream code reads element 0 as the
# file path and element 1 as the file's text.
files = sc.wholeTextFiles(
    local_path('Data/*/*')
)

In [3]:
def aggregator(xml):
    """Map one scraped file to ``(game_id, [expected_total, comments_on_page])``.

    Meant to be used with ``map`` over ``(path, contents)`` pairs; the per-page
    values for a game can then be summed with ``add_tuples``.

    Parameters
    ----------
    xml : tuple
        ``xml[0]`` is the file path and ``xml[1]`` is the XML text of that file.

    Returns
    -------
    tuple
        ``(game_id, [total, n_comments])``. ``total`` is the integer value of
        the ``totalitems`` attribute on the ``<comments>`` tag, or 0 when the
        page has no usable one (per the original notes, only a game's first
        page is expected to carry it). ``n_comments`` is the number of
        ``<comment>`` tags on this page.
    """
    # The game id is the part of the file name (last path component) before
    # the first underscore.
    key = xml[0].split('/')[-1].split('_')[0]

    # Name the parser explicitly so the parse does not depend on which parsers
    # happen to be installed (and bs4 does not warn about guessing one).
    soup = BeautifulSoup(xml[1], 'lxml')

    try:
        total = int(soup.comments['totalitems'])
    except (TypeError, KeyError, ValueError):
        # TypeError: no <comments> tag (soup.comments is None);
        # KeyError: tag present but without a totalitems attribute;
        # ValueError: attribute is not an integer.
        # Anything else is unexpected and is allowed to propagate.
        total = 0

    return (key, [total, len(soup.find_all('comment'))])

def user_rating_parser(xml):
    """Yield ``(game_id, [username, rating])`` for every rated comment in one file.

    Meant to be used with ``flatMap`` over ``(path, contents)`` pairs.

    Parameters
    ----------
    xml : tuple
        ``xml[0]`` is the file path and ``xml[1]`` is the XML text of that file.

    Yields
    ------
    tuple
        ``(game_id, [username, rating])`` with ``rating`` as a float. Comments
        whose ``rating`` attribute is missing or not numeric are skipped.
    """
    # The game id is the part of the file name (last path component) before
    # the first underscore.
    key = xml[0].split('/')[-1].split('_')[0]

    # Name the parser explicitly so the parse does not depend on which parsers
    # happen to be installed (and bs4 does not warn about guessing one).
    soup = BeautifulSoup(xml[1], 'lxml')

    for comment in soup.find_all('comment'):

        try:
            rating = float(comment.get('rating'))
        except (TypeError, ValueError):
            # TypeError: no rating attribute (get() returned None);
            # ValueError: a non-numeric placeholder such as 'N/A'.
            # One unrated comment should not abort the whole page.
            continue

        yield (key, [comment.get('username'), rating])
    
    
    

def add_tuples(a,b):
    """Add two pairs position by position so they can be combined in ``reduceByKey``.

    Only the first two positions of ``a`` and ``b`` are used; the result is
    always a new two-element list.
    """
    first = a[0] + b[0]
    second = a[1] + b[1]
    return [first, second]

In [5]:
#For every game, total up the expected comment count and the comments actually
#found on each page, then bring the per-game results back to the driver.
comment_counts = (
    files
    .map(aggregator)
    .reduceByKey(add_tuples)
    .collect()
)

In [7]:
#Let's check for any games where we may have dropped (or gained) some comments.
#A game is flagged when the expected and actual counts differ by more than 0.01%
#AND by more than 10 comments.
#If any games are printed below, go back and scrape those games again.
count=0
for c in comment_counts:

    # c is (game_id, [expected_total, comments_actually_found])
    expected, actual = c[1]
    diff = abs(expected - actual)

    if expected:
        pct_diff = abs((expected - actual)/expected)*100
    else:
        # No expected total was recorded for this game, so a percentage is undefined.
        # Treat any extracted comments as an unbounded discrepancy instead of
        # dividing by zero, so the game can still be flagged by the size test below.
        pct_diff = float('inf') if diff else 0.0

    if pct_diff > 0.01 and diff > 10:

        print(c)
        count+=1

print('There were',count,'games that had more than 0.01% of comments dropped or added \n (where 0.01% corresponds to at least 10 comments)')


There were 0 games that had more than 0.01% of comments dropped or added 
 (where 0.01% corresponds to at least 10 comments)
