In [1]:
import os
import sys
from collections import Counter
from multiprocessing import Pool
import glob

import pandas as pd
from PIL import Image
from tqdm import tqdm

sys.path.append('..')
from utils.config import width, height

In [4]:
data_dir_metadata = '../data/intermediary/google_search/'

In [5]:
painted_files = glob.glob(os.path.join(data_dir_metadata + 'iPhone-X/*/*/*/*/png/abstract_painting.png'))
len(painted_files)

14976

In [6]:
boarder = (0, 0, width, height)

In [7]:
def count_pixels(fn):
    """
    Opens a file from s3, counts the color of all pixels for the full page and the first glance
    """
    with Image.open(fn) as img:
        colors_full = Counter()
        colors_fg = Counter()

        colors_full.update(img.getdata())
        colors_fg.update(img.crop(boarder).getdata())

        row = {
            'fn' : fn,
            'pixels_fullpage' : dict(colors_full),
            'pixels_firstglance' : dict(colors_fg)
        }
        
        return row

In [8]:
n_processes = 12

data = []
with Pool(n_processes) as pool:
    for record in tqdm(pool.imap_unordered(count_pixels, painted_files), 
                       total=len(painted_files)):
        data.append(record)

100%|██████████| 14976/14976 [09:55<00:00, 25.13it/s]


In [10]:
rgb2color = {
    (255, 255, 255) : 'white',
    (234, 67, 53) : 'ads',
    (178, 178, 178) : 'organic',
    (251, 188, 5) : 'links',
    (52, 168, 83) : 'answers',
    (108, 177, 238) : 'amp'  
}

In [11]:
data[1]

{'fn': '../data/intermediary/google_search/iPhone-X/2019/11/28/Swollen-testicle/png/abstract_painting.png',
 'pixels_fullpage': {(255, 255, 255): 325053,
  (251, 188, 5): 202057,
  (52, 168, 83): 125864,
  (178, 178, 178): 740151},
 'pixels_firstglance': {(255, 255, 255): 156001,
  (251, 188, 5): 99919,
  (52, 168, 83): 48580}}

In [12]:
record = data[0]

In [13]:
records_full = []
records_fg = []
for record in tqdm(data):
    fg = {rgb2color.get(k, k) : v for k, v in record['pixels_firstglance'].items()}
    full = {rgb2color.get(k, k) : v for k, v in record['pixels_fullpage'].items()}

    row = {'fn' : record['fn']}
    records_fg.append({**row, **fg})
    records_full.append({**row, **full})

100%|██████████| 14976/14976 [00:00<00:00, 268574.33it/s]


In [14]:
fg = pd.DataFrame(records_fg)

In [25]:
full = pd.DataFrame(records_full)

In [47]:
label2publabel = {
    'link' : 'Google Product',
    'answer' : 'Google Answer',
    'organic' : 'Organic',
    'amp' : 'AMP',
    'ads' : 'Ads'
}

In [52]:
def proportions(df):
    link = df['links'].sum()
    answer = df['answers'].sum()
    ads = df['ads'].sum()
    amp = df['amp'].sum()
    o = df['organic'].sum()
    
    d = link + answer + ads + amp + o
    return {
        'answer' : answer / d,
        'link' : link /d,
        'amp' : amp/d,
        'organic' : o/d,
        'ads' : ads/d
    }

In [53]:
proportions(fg)

{'answer': 0.23109137685631306,
 'link': 0.38575460372902404,
 'amp': 0.12710702159197135,
 'organic': 0.18918866281675353,
 'ads': 0.066858335005938}

In [54]:
results = pd.DataFrame([proportions(fg),
                        proportions(full)])
results.columns = [label2publabel.get(c) for c in results.columns]

results = results.T
results.columns = pd.MultiIndex.from_tuples([   
    ('Above the fold', 'area'),
    ('First page', 'area'),
])

results

Unnamed: 0_level_0,Above the fold,First page
Unnamed: 0_level_1,area,area
Google Answer,0.231091,0.132422
Google Product,0.385755,0.275515
AMP,0.127107,0.1368
Organic,0.189189,0.445628
Ads,0.066858,0.009634


In [61]:
fn_cats = '../data/outputs/T1_area_freq_by_category.csv'
breakdown = pd.read_csv(fn_cats, index_col=0, header=[0, 1])

In [66]:
breakdown

Unnamed: 0_level_0,Above the fold,Above the fold,First page,First page
Unnamed: 0_level_1,area,frequency,area,Frequency
Google Answer,0.23087,0.664196,0.133166,0.993523
Google Product,0.394926,0.923945,0.277751,0.999933
AMP,0.124188,0.202925,0.136066,0.807225
Organic,0.184808,0.745393,0.443435,0.999733
Ads,0.065208,0.078592,0.009583,0.095152


In [75]:
diff = ((breakdown[[('Above the fold', 'area'),
           ('First page', 'area')]] - results) * 100).round(1).astype(str) + '%'
diff

Unnamed: 0_level_0,Above the fold,First page
Unnamed: 0_level_1,area,area
Google Answer,-0.0%,0.1%
Google Product,0.9%,0.2%
AMP,-0.3%,-0.1%
Organic,-0.4%,-0.2%
Ads,-0.2%,-0.0%


In [77]:
print(diff.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th>Above the fold</th>
      <th>First page</th>
    </tr>
    <tr>
      <th></th>
      <th>area</th>
      <th>area</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Google Answer</th>
      <td>-0.0%</td>
      <td>0.1%</td>
    </tr>
    <tr>
      <th>Google Product</th>
      <td>0.9%</td>
      <td>0.2%</td>
    </tr>
    <tr>
      <th>AMP</th>
      <td>-0.3%</td>
      <td>-0.1%</td>
    </tr>
    <tr>
      <th>Organic</th>
      <td>-0.4%</td>
      <td>-0.2%</td>
    </tr>
    <tr>
      <th>Ads</th>
      <td>-0.2%</td>
      <td>-0.0%</td>
    </tr>
  </tbody>
</table>
