In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time
import glob
import inspect
sys.path.append('..')

from tqdm import tqdm

In [3]:
import utils.parsers as P
from utils.config import cat2color, window_size, user_agent
from utils.analysis import GoogleWebAssay, paint_abstract_representation

In [4]:
# Gather every function exposed by the parsers module whose name contains
# '_parser'; the trailing expression displays how many were found.
parser_functions = [
    func
    for name, func in inspect.getmembers(P, inspect.isfunction)
    if '_parser' in name
]
len(parser_functions)

55

In [5]:
# Build the assay object that later cells use to open, screenshot and parse
# each saved page. It receives the parser functions collected above and the
# category colour palette from utils.config.
# NOTE(review): headless=False presumably opens a visible browser window --
# confirm in GoogleWebAssay.
assay = GoogleWebAssay(user_agent = user_agent,
                       window_size = window_size,
                       parser_functions = parser_functions,
                       color_palette = cat2color,
                       headless = False)

In [6]:
# Root directory of the raw Google search captures (read side).
data_dir_in  = '../data/input/google_searches/'
# Root directory for derived files; get_context maps each input path under it.
data_dir_out = '../data/intermediary/google_searches/'

In [7]:
# Glob every webpage_raw.html saved under the iPhone-X/ subtree of the input
# directory; the trailing expression displays how many files matched.
files_input = glob.glob(os.path.join(
    data_dir_in, 'iPhone-X/*/*/*/*/html/webpage_raw.html'))
len(files_input)

16808

In [8]:
import random
import time

In [9]:
# Shuffle the file list in place using a fixed seed.
# NOTE(review): glob.glob does not promise a particular ordering, so the
# shuffled order (and every hard-coded index into files_input) is only
# repeatable if the glob result itself is -- consider sorting first; confirm.
random.seed(303)
random.shuffle(files_input)

In [10]:
def get_context(fn):
    '''
    Derive the output file paths that belong to one raw input page.

    The input root (`data_dir_in`) is swapped for the output root
    (`data_dir_out`), then the extension, file stem and `html/` folder are
    substituted per artefact. Returns a 4-tuple:
    (metadata ndjson, stained html, screenshot png, abstract painting png).
    '''
    relocated = fn.replace(data_dir_in, data_dir_out)

    # parsed element metadata -> json/parsed_meta.ndjson
    metadata_path = relocated.replace('.html', '.ndjson')
    metadata_path = metadata_path.replace('webpage_raw', 'parsed_meta')
    metadata_path = metadata_path.replace('html/', 'json/')

    # stained copy of the page stays next to the html
    stained_path = relocated.replace('webpage_raw', 'webpage_stained')

    # full-page screenshot -> png/screenshot.png
    screenshot_path = relocated.replace('.html', '.png')
    screenshot_path = screenshot_path.replace('webpage_raw', 'screenshot')
    screenshot_path = screenshot_path.replace('html/', 'png/')

    # abstract painting sits beside the screenshot
    painting_path = screenshot_path.replace('screenshot', 'abstract_painting')

    return metadata_path, stained_path, screenshot_path, painting_path

In [11]:
# Count the .html files already present in the output tree's html/ folders.
len(glob.glob(data_dir_out + 'iPhone-X/*/*/*/*/html/*.html'))

2217

In [None]:
# Batch step: for each raw page, take a full-page screenshot, run the assay's
# parsers, and paint the abstract representation twice (once over the
# screenshot, once on its own). Pages whose abstract painting already exists
# are skipped, so the cell can be re-run after an interruption.
# NOTE(review): the slice start (12 + 61 = 73) skips the first 73 shuffled
# files; its origin is not recorded here -- confirm it is still wanted.
for fn in tqdm(files_input[12 + 61:]):
    # Bind the stained-HTML path to a real name. `_` is a throwaway by
    # convention and doubles as IPython's last-output variable, so using it
    # as data hides intent and shadows the interactive shortcut.
    fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img = get_context(fn)
    # Make sure the folder of each output file exists before writing.
    for output in [fn_metadata, fn_stained_html, fn_screenshot]:
        os.makedirs(os.path.dirname(output), exist_ok=True)
    # Already painted on an earlier pass: nothing left to do for this page.
    if os.path.exists(fn_abstract_img):
        continue
    assay.open_local_html(fn)
    assay.screenshot_full(fn_screenshot)
    # Scroll back to the top after the screenshot and pause briefly,
    # presumably so the page settles before elements are measured -- confirm.
    assay.driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(.2)
    assay.run(fn, fn_metadata=fn_metadata, stain=False)
    # No parsed elements: skip painting for this page.
    if assay.element_metadata.empty:
        continue
    # Painting drawn with the screenshot supplied as fn_img.
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img.replace('.png', '_img.png'),
                                  fn_img=fn_screenshot)
    # Painting drawn without a screenshot.
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img)

  2%|▏         | 306/16735 [02:03<2:27:48,  1.85it/s]

In [16]:
# Pick one file from the shuffled list for manual inspection.
# NOTE(review): the index is a hand-written sum (93 + 289 + 16 = 398); where
# it comes from is not recorded here.
fn = files_input[93 + 289 + 16]

In [19]:
# Derive the four output paths (metadata, stained HTML, screenshot, abstract
# painting) for the selected file.
fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img = get_context(fn)

In [20]:
# Open the selected local HTML file with the assay (presumably in its browser
# driver -- confirm in GoogleWebAssay).
assay.open_local_html(fn)

In [21]:
# Run the assay on the opened page, passing the metadata path and stain=True.
# NOTE(review): the batch loop calls assay.run(fn, ...) with the input path as
# first argument while this call omits it -- confirm which form is intended.
assay.run(fn_metadata=fn_metadata, stain=True)

In [16]:
# Keep a handle on the assay's element metadata table for inspection.
df = assay.element_metadata

In [17]:
# Show only the rows whose category is 'ads-aria'.
df[df['category'] == 'ads-aria']

Unnamed: 0,text,link,domain,xpath,element_class,category,element,tag,attrs,dimensions,location,area,area_page,fn_input
0,AdAd·manhattanmotorcarsporsche.com/Porsche/Spe...,,google.com,/html/body/div[10]/div/div[6]/div/div[1]/div[2...,C4eCVc|c,ads-aria,"[[Ad], [[<div class=""U3THc"" jsaction=""rcuQ6b:n...",div,"{'class': ['C4eCVc', 'c'], 'id': 'tads', 'aria...","{'height': 391.0, 'width': 344.0}","{'x': 8, 'y': 166}",134504.0,134504.0,


In [18]:
# Paint the abstract representation for the selected page, supplying the
# screenshot as fn_img. verbose=1 is passed explicitly; in the recorded output
# the call echoed the draw_img.py command line it ran.
paint_abstract_representation(fn_metadata=fn_metadata,
                              fn_out=fn_abstract_img,
                              fn_img=fn_screenshot,
                              verbose=1)

python ../utils/draw_img.py --input ../data/intermediary/google_searches/iPhone-X/2019/11/05/Porsche/json/parsed_meta.ndjson --output ../data/intermediary/google_searches/iPhone-X/2019/11/05/Porsche/png/abstract_painting_img.png --img ../data/intermediary/google_searches/iPhone-X/2019/11/05/Porsche/png/screenshot.png


b'87\n'

In [21]:
from bs4 import BeautifulSoup
import pandas as pd

In [16]:
from utils.parsers import element_to_dict

In [45]:
def ads_aria_parser(body):
    """Collect ad elements that are flagged through an accessibility label.

    Every node under `body` whose `aria-label` attribute equals 'Ad' is
    converted with `element_to_dict` and tagged with the 'ads-aria' category.
    Returns the list of resulting rows (empty when nothing matches).
    """
    labelled_ads = body.find_all(attrs={'aria-label' : 'Ad'})
    return [element_to_dict(ad, category='ads-aria') for ad in labelled_ads]

In [17]:
# Parse the HTML currently loaded in the assay's driver.
# NOTE(review): no parser is named, so BeautifulSoup chooses one itself and
# the resulting tree may differ between environments -- consider passing one
# explicitly.
soup = BeautifulSoup(assay.driver.page_source)

In [26]:
# Remove every <div id="sfooter"> from the soup, then take <div id="cnt"> as
# the body handed to the parsers.
# NOTE(review): soup.find returns None when no such div exists; the parsers
# call body.find_all, so that case would fail downstream -- confirm it cannot
# occur for these pages.
for div in soup.find_all("div", {'id' : 'sfooter'}): 
    div.decompose()
body = soup.find("div", attrs={'id' : 'cnt'})

In [29]:
# Run every parser function on the body and pool all returned rows in one
# flat list.
data = []
for parser in parser_functions:
    resp = parser(body)
    data.extend(resp)

In [30]:
# Number of rows collected from all parsers.
len(data)

198

In [333]:
# Echo the path of the file currently under inspection.
fn

'../data/input/google_searches/iPhone-X/2019/12/05/Minnesota-National-Guard/html/webpage_raw.html'

In [112]:
# Input pages set aside for manual re-checking -- presumably ones whose parse
# looked wrong (the list name and trailing notes suggest so; confirm).
error_check = [
    '../data/input/google_searches/iPhone-X/2019/12/02/Parry-Shen/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/11/12/Hermès/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/12/14/Jonathan-Gresham/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/11/09/The-Flash/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/11/05/Petya/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/12/23/NYSEAPA/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/12/05/Minnesota-National-Guard/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/12/21/Posture/html/webpage_raw.html',
    '../data/input/google_searches/iPhone-X/2019/12/19/Credential-stuffing/html/webpage_raw.html', # contains an unusual organic link
    '../data/input/google_searches/iPhone-X/2019/11/11/Apixaban/html/webpage_raw.html', # organic search results here are hard to parse
]

# Page(s) kept as a reference for a parse that looks right (going by the name).
good_examples = [
    '../data/input/google_searches/iPhone-X/2019/12/09/Patrick-Stewart/html/webpage_raw.html'
]

In [None]:
# The unfinished indexing expression `df[]` is a SyntaxError and would abort
# a Run All; preview the frame instead until a real filter is written.
df.head()

In [211]:
# Ask the assay's driver to refresh the page it currently has open.
assay.driver.refresh()

In [354]:
# Re-assign the assay's parser list from the notebook-level parser_functions.
assay.parser_functions = parser_functions

In [208]:
# Choose the file to inspect by its position in the shuffled list and print
# its path; the commented line is the alternative of re-checking the last
# entry of error_check.
fn = files_input[89]
# fn = error_check[-1]
print(fn)

../data/input/google_searches/iPhone-X/2019/12/06/Columbia-Sportswear/html/webpage_raw.html


In [209]:
# Open the newly selected local HTML file with the assay.
assay.open_local_html(fn)

In [213]:
# Run the assay on the opened page with stain=True and no metadata path.
# NOTE(review): other cells pass fn and/or fn_metadata to assay.run --
# confirm that omitting both is supported.
assay.run(stain=True)

In [180]:
# Re-read the assay's element metadata, then tally the categories of the rows
# whose category string contains 'organic'.
df = assay.element_metadata
df[df.category.str.contains('organic')].category.value_counts()

organic-search_result_2a    8
organic                     6
organic-tweet_2             5
organic-search_result_1a    2
Name: category, dtype: int64

In [165]:
# Comma-join the attribute names of the element stored in the first row.
','.join(df['element'].iloc[0].attrs.keys())

'jsname,id,data-jiis,data-async-type,data-async-context-required,class'

In [94]:
def link_parser(body):
    """
    Categorise every anchor tag that has an `href` attribute.

    Each link is labelled as a Google-owned link (`link-google`,
    `link-javascript`, `link-youtube`), an ad (`ads-google_ad_services`),
    or one of the `organic*` categories. For some organic links the element
    that gets recorded is widened from the anchor itself to an enclosing or
    neighbouring container.

    Args:
        body: parsed HTML node exposing `find_all` (a BeautifulSoup tree in
            this notebook).

    Returns:
        list: one `element_to_dict` row per matching anchor, built with the
        link's url, domain and category.
    """
    data = []
    # NOTE(review): `'data-amp': False` is presumably meant to skip anchors
    # that carry a data-amp attribute -- confirm against BeautifulSoup's
    # attribute-matching rules.
    for elm in body.find_all('a', href=True, 
                             attrs={'data-amp' : False}):
        url = elm['href']
        domain = get_domain(url)
        # Default label; the branches below override it where they apply.
        category = 'link-google'
        if url in javascript:
            domain = 'google.com'
            category = 'link-javascript'
        
        # links to Google Ad services...
        # A "domain" starting with '/' is treated as a Google-internal link
        # (presumably a relative href -- confirm get_domain's behaviour).
        # NOTE(review): domain[0] raises IndexError if get_domain ever returns
        # an empty string -- confirm that cannot happen.
        elif domain[0] == '/':
            # Original author's reminder: the '/aclk' test is unverified.
            if domain.split('?')[0] == '/aclk': # check this
                category = 'ads-google_ad_services'
            domain = 'google.com'

        elif domain == 'googleadservices.com':
            category = 'ads-google_ad_services'
#             elm = elm.parent

        elif domain == 'youtube.com':
            category = 'link-youtube'
                
        # get the whole box for organic
        # Anything not in google_domains or javascript counts as organic.
        elif domain not in google_domains + javascript:
            category = 'organic'
            # Only anchors without a data-ved attribute go through the
            # container heuristics below.
            if 'data-ved' not in elm.attrs:
                # get the sibling of the parent of the link
                elm_potential_text = elm.parent.find_next_sibling('div')
                  
                if elm_potential_text:
                    # Sibling div with text: record the link's grandparent.
                    # A sibling div without text changes nothing, because the
                    # `else` below pairs with the outer `if`.
                    if elm_potential_text.text:
                        category = 'organic-search_result_2'
                        elm = elm.parent.parent
                else:
                    # No sibling div next to the parent: look one level up,
                    # at the next div sibling of the grandparent.
                    elm_potential_text = elm.parent.parent.find_next_sibling('div')
                    if elm_potential_text:
                        # Presumably: a descendant <div> that has text and
                        # lacks role / aria-level / jsname -- confirm the
                        # `text=True` and `False` filter semantics.
                        if any(elm_potential_text.find_all('div', recursive=True,
                                                         text = True,
                                                         attrs={"role" : False,
                                                                "aria-level" : False,
                                                                "jsname" : False})):
                            category = 'organic-search_result_1'
                            elm = elm_potential_text
                            
                        # Same idea with <span> descendants; here the recorded
                        # element becomes the link's great-grandparent.
                        elif any(elm_potential_text.find_all('span', recursive=True,
                                                         text = True,
                                                         attrs={"role" : False,
                                                                "aria-level" : False})):
                            category = 'organic-search_result_3'
                            elm = elm.parent.parent.parent
                # tweets
                # Checked on `elm` as possibly reassigned above; a match
                # climbs three parents and overrides the category.
                if 'gws-twitter-link' in elm.attrs.get('class', []):
                    for _ in range(3):
                        elm = elm.parent
                    category = 'organic-tweet_1'
                    
        row = element_to_dict(elm, url=url, 
                              domain=domain, 
                              category=category)
        data.append(row)  
    
    return data

In [76]:
from bs4 import BeautifulSoup
from urlexpander import get_domain
import pandas as pd

from utils.config import javascript, google_domains
from utils.parsers import element_to_dict

In [88]:
# Re-parse the HTML currently loaded in the assay's driver.
# NOTE(review): no parser is named, so BeautifulSoup chooses one itself and
# the resulting tree may differ between environments -- consider passing one
# explicitly.
soup = BeautifulSoup(assay.driver.page_source)

In [95]:
# Run the notebook-local link_parser over the whole soup.
data = link_parser(soup)

In [96]:
# Tabulate the parsed link rows.
df = pd.DataFrame(data)

In [97]:
# XPath of the first row whose domain is solutionsreview.com.
df[df['domain'] == 'solutionsreview.com'].xpath.iloc[0]

'/html/body/div[10]/div/div[6]/div/div[3]/div/div[3]/div/div/div[1]/div[1]/div[1]/div/div/div/div[1]/div/div/div/div/div/div/div/div[2]/div[3]/a'

In [98]:
# All rows whose domain is solutionsreview.com.
df[df['domain'] == 'solutionsreview.com']

Unnamed: 0,text,link,domain,xpath,element_class,category,element,tag,attrs
50,Solutions Review › identity-managementThe Top ...,https://solutionsreview.com/identity-managemen...,solutionsreview.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,fT6YYc,organic,"[[[], [<cite class=""iUh30 bc"">Solutions Review...",a,"{'class': ['fT6YYc'], 'href': 'https://solutio..."


In [546]:
# Subset of rows whose category string contains 'organic'.
organic = df[df.category.str.contains('organic')]

In [547]:
# Element stored in the second organic row (position 1).
elm = organic.iloc[1]['element']

In [618]:
# Tag name of the selected element.
elm.name

'div'

In [601]:
# Text content of the selected element.
elm.get_text()

'Medscape › reference › drug › eliqu...Web resultsEliquis (apixaban) dosing, indications, interactions, adverse effects ...Renal impairment (nonvalvular atrial fibrillation)  ... Serum creatinine ≥1.5 mg/dL: Decrease dose to 2.5 mg BID if patient has 1 additional characteristic of age ≥80 years or weight ≤60 kg.  ... Switching between apixaban and anticoagulants other than warfarin: Discontinue one being taken ...People also search for'

In [555]:
# True when the comma-joined attribute names of the element contain 'data-'
# or 'jsname'.
any(marker in ','.join(elm.attrs) for marker in ('data-', 'jsname'))

False

In [122]:
from utils.analysis import xpath_prune

In [133]:
# Row count of the full frame.
len(df)

126

In [124]:
# Row count of the frame returned by xpath_prune(df).
len(xpath_prune(df))

125

In [131]:
# Apply xpath_prune to only the rows categorised as 'organic-tweet'.
xpath_prune(df[df.category=='organic-tweet'])

Unnamed: 0,text,link,domain,xpath,element_class,category,element,tag,attrs,dimensions,location,area,area_page,fn_input
121,I'm conscious that a little bit of harmonica g...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'jsc...","{'height': 260.0, 'width': 232.0}","{'x': 24, 'y': 4374}",0.0,60320.0,
122,"“While We’re Young"" is a plucky pop rock tune ...",,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'jsc...","{'height': 260.0, 'width': 232.0}","{'x': 264, 'y': 4374}",0.0,28860.0,
123,From yesterday's New York Times Mini Crossword...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 504, 'y': 4374}",0.0,0.0,
124,On sale now at www.hueylewisandthen… Twitter ·...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 744, 'y': 4374}",0.0,0.0,
125,My dad was a drummer and he always had a set o...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 984, 'y': 4374}",0.0,0.0,


In [132]:
# The same 'organic-tweet' rows without pruning.
df[df.category=='organic-tweet']

Unnamed: 0,text,link,domain,xpath,element_class,category,element,tag,attrs,dimensions,location,area,area_page,fn_input
121,I'm conscious that a little bit of harmonica g...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'jsc...","{'height': 260.0, 'width': 232.0}","{'x': 24, 'y': 4374}",0.0,60320.0,
122,"“While We’re Young"" is a plucky pop rock tune ...",,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'jsc...","{'height': 260.0, 'width': 232.0}","{'x': 264, 'y': 4374}",0.0,28860.0,
123,From yesterday's New York Times Mini Crossword...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 504, 'y': 4374}",0.0,0.0,
124,On sale now at www.hueylewisandthen… Twitter ·...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 744, 'y': 4374}",0.0,0.0,
125,My dad was a drummer and he always had a set o...,,twitter.com,/html/body/div[10]/div/div[6]/div/div[3]/div/d...,EhKlzb|ttGZ5d|dHOsHb,organic-tweet,"[[[m, <div class=""Brgz0 tw-res"" data-ved=""2ahU...",div,"{'class': ['EhKlzb', 'ttGZ5d', 'dHOsHb'], 'id'...","{'height': 260.0, 'width': 232.0}","{'x': 984, 'y': 4374}",0.0,0.0,


In [None]:
# maybe get atext in