# Analyze Lighthouse

Experimentation with analyzing Lighthouse scores.

In [1]:
import sys
import ast
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import altair as alt

In [3]:
this_dir = Path("__file__").parent.absolute()

In [4]:
sys.path.append(this_dir.parent)

In [5]:
sys.path.append(str(this_dir.parent / "newshomepages"))

In [6]:
import altair_theme

In [7]:
alt.themes.register('palewire', altair_theme.theme)
alt.themes.enable('palewire')

ThemeRegistry.enable('palewire')

In [8]:
import utils

Read in the dataframe

In [10]:
df = utils.get_extract_df(
    "lighthouse-sample.csv",
    usecols=[
        'handle',
        'file_name',
        'date',
        'performance',
        'accessibility',
        'seo',
        'best_practices',
    ],
    dtype={
        'handle': str,
        'file_name': str,
        'performance': float,
        'accessibility': float,
    },
    parse_dates=["date"]
)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16797 entries, 0 to 16796
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   handle          16797 non-null  object        
 1   file_name       16797 non-null  object        
 2   date            16797 non-null  datetime64[ns]
 3   performance     16797 non-null  float64       
 4   accessibility   16797 non-null  float64       
 5   best_practices  16642 non-null  float64       
 6   seo             16797 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 918.7+ KB


Exclude any sites with less than 10 observations

In [12]:
observations_by_site = df.groupby("handle").size().rename("n").reset_index()

In [16]:
f"{len(observations_by_site)} total sites"

'1145 total sites'

In [14]:
alt.Chart(observations_by_site).mark_bar().encode(
    alt.X("n:Q", bin=True),
    y='count()',
)

  for col_name, dtype in df.dtypes.iteritems():


In [20]:
not_qualified = observations_by_site[observations_by_site.n < 10]

In [21]:
qualified_df = df[~df.handle.isin(not_qualified.handle)].copy()

In [23]:
f"{len(qualified_df)} qualified sites, {(len(qualified_df.handle.unique()) / len(observations_by_site))*100} of the total"

'16781 qualified sites, 99.73799126637554 of the total'

Aggregate descriptive statistics for each metric.

In [24]:
agg_df = qualified_df.groupby("handle").agg({
    'performance': ['count', 'median', 'mean', 'min', 'max', 'std'],
    'accessibility': ['count', 'median', 'mean', 'min', 'max', 'std'],
    'seo': ['count', 'median', 'mean', 'min', 'max', 'std'],
    'best_practices': ['count', 'median', 'mean', 'min', 'max', 'std'],
})

In [25]:
agg_df

Unnamed: 0_level_0,performance,performance,performance,performance,performance,performance,accessibility,accessibility,accessibility,accessibility,...,seo,seo,seo,seo,best_practices,best_practices,best_practices,best_practices,best_practices,best_practices
Unnamed: 0_level_1,count,median,mean,min,max,std,count,median,mean,min,...,mean,min,max,std,count,median,mean,min,max,std
handle,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
100reporters,14,0.155,0.137857,0.04,0.18,0.043709,14,0.89,0.890000,0.89,...,0.875714,0.87,0.88,5.135526e-03,14,0.58,0.580000,0.58,0.58,0.000000e+00
11alivenews,14,0.195,0.194286,0.16,0.23,0.018694,14,0.81,0.812857,0.81,...,0.702143,0.70,0.71,4.258153e-03,14,0.67,0.670000,0.67,0.67,0.000000e+00
12khari,14,0.120,0.124286,0.10,0.16,0.017415,14,0.65,0.650000,0.65,...,0.860000,0.86,0.86,0.000000e+00,14,0.58,0.618571,0.58,0.67,4.621973e-02
12newsnow,14,0.260,0.250000,0.17,0.31,0.050230,14,0.81,0.808571,0.80,...,0.700000,0.70,0.70,4.354653e-17,14,0.75,0.737857,0.58,0.75,4.543441e-02
13wmaznews,14,0.250,0.229286,0.13,0.34,0.063786,14,0.81,0.817143,0.80,...,0.780000,0.78,0.78,0.000000e+00,14,0.75,0.750000,0.75,0.75,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zeitonline,15,0.450,0.441333,0.33,0.54,0.061280,15,0.88,0.873333,0.86,...,0.990000,0.99,0.99,0.000000e+00,15,0.92,0.920000,0.92,0.92,0.000000e+00
zerohedge,23,0.500,0.464348,0.26,0.56,0.087012,23,0.95,0.950000,0.95,...,0.905652,0.90,0.91,5.068698e-03,23,0.83,0.830000,0.83,0.83,4.099770e-17
zerohora,15,0.270,0.269333,0.25,0.31,0.014376,15,0.84,0.840000,0.84,...,0.913333,0.91,0.92,4.879500e-03,15,1.00,0.966667,0.83,1.00,6.206755e-02
zmanisrael,15,0.440,0.430000,0.36,0.51,0.040532,15,0.93,0.911333,0.84,...,0.920000,0.92,0.92,0.000000e+00,15,0.92,0.920000,0.92,0.92,0.000000e+00


Flatten the dataframe

In [26]:
flat_df = agg_df.copy()
flat_df.columns = ['_'.join(col) for col in flat_df.columns]

In [27]:
flat_df.sort_values("performance_count")

Unnamed: 0_level_0,performance_count,performance_median,performance_mean,performance_min,performance_max,performance_std,accessibility_count,accessibility_median,accessibility_mean,accessibility_min,...,seo_mean,seo_min,seo_max,seo_std,best_practices_count,best_practices_median,best_practices_mean,best_practices_min,best_practices_max,best_practices_std
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bakersfieldcali,10,0.140,0.133000,0.10,0.15,0.014181,10,0.74,0.740000,0.74,...,0.764000,0.76,0.77,0.005164,9,0.83,0.812222,0.75,0.83,3.527668e-02
observatorio,10,0.510,0.506000,0.49,0.51,0.006992,10,0.94,0.940000,0.94,...,0.900000,0.90,0.90,0.000000,10,0.75,0.750000,0.75,0.75,0.000000e+00
prensagrafica,11,0.380,0.371818,0.32,0.46,0.047078,11,0.72,0.715455,0.70,...,0.850000,0.85,0.85,0.000000,11,0.58,0.580000,0.58,0.58,0.000000e+00
lewistontribune,11,0.190,0.187273,0.14,0.23,0.027961,11,0.93,0.930000,0.93,...,0.840000,0.84,0.84,0.000000,11,0.58,0.620909,0.58,0.67,4.700097e-02
charlie_hebdo_,12,0.505,0.500000,0.39,0.60,0.073732,12,0.86,0.860000,0.86,...,0.860000,0.86,0.86,0.000000,12,0.83,0.823333,0.75,0.83,2.309401e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
npr,29,0.250,0.256207,0.11,0.38,0.066139,29,0.99,0.983793,0.96,...,0.920000,0.92,0.92,0.000000,29,0.75,0.750000,0.75,0.75,0.000000e+00
abc,29,0.210,0.210345,0.16,0.27,0.020785,29,0.60,0.602759,0.60,...,0.920000,0.92,0.92,0.000000,29,0.92,0.882759,0.83,0.92,4.511070e-02
yahoonews,29,0.450,0.435862,0.37,0.50,0.038407,29,0.88,0.874483,0.80,...,0.974828,0.97,0.98,0.005085,29,0.83,0.830000,0.83,0.83,4.196249e-17
msnbc,36,0.295,0.315278,0.20,0.42,0.082340,36,0.75,0.747222,0.74,...,0.770000,0.77,0.77,0.000000,36,0.75,0.758889,0.75,0.83,2.549821e-02


Classify the scores

In [28]:
def color_code(val):
    """Return the classification of a metric according to Google's system.
    
    Source: https://developer.chrome.com/docs/lighthouse/performance/performance-scoring/
    """
    if val >= .9:
        return 'green'
    elif val >= .5:
        return 'orange'
    else:
        return 'red'

In [29]:
flat_df['performance_color'] = flat_df.performance_median.apply(color_code)

In [30]:
flat_df['accessibility_color'] = flat_df.accessibility_median.apply(color_code)

Rank the result

In [31]:
flat_df['performance_rank'] = flat_df.performance_median.rank(ascending=False, method="min")

In [32]:
flat_df.sort_values("performance_rank")[[
    'performance_median',
    'performance_rank'
]]

Unnamed: 0_level_0,performance_median,performance_rank
handle,Unnamed: 1_level_1,Unnamed: 2_level_1
techmeme,0.99,1.0
studyfindsorg,0.99,1.0
yediotahronot,0.98,3.0
insideclimate,0.97,4.0
wonkette,0.96,5.0
...,...,...
billingsgazette,0.02,1126.0
bistrib,0.02,1126.0
dallasnews,0.02,1126.0
missoulian,0.02,1126.0


Total up the colors

In [33]:
flat_df.performance_color.value_counts()

red       749
orange    377
green      16
Name: performance_color, dtype: int64

In [34]:
flat_df.performance_color.value_counts(normalize=True)

red       0.655867
orange    0.330123
green     0.014011
Name: performance_color, dtype: float64

In [35]:
flat_df.accessibility_color.value_counts()

orange    836
green     301
red         5
Name: accessibility_color, dtype: int64

In [36]:
flat_df.accessibility_color.value_counts(normalize=True)

orange    0.732049
green     0.263573
red       0.004378
Name: accessibility_color, dtype: float64

In [37]:
flat_df.performance_median.describe()

count    1142.000000
mean        0.418783
std         0.264978
min         0.010000
25%         0.196250
50%         0.330000
75%         0.670000
max         0.990000
Name: performance_median, dtype: float64

In [38]:
flat_df.accessibility_median.describe()

count    1142.000000
mean        0.849965
std         0.091231
min         0.450000
25%         0.800000
50%         0.870000
75%         0.900000
max         1.000000
Name: accessibility_median, dtype: float64

In [39]:
chart_df = (
    qualified_df[qualified_df.handle == 'nytimes']
        .set_index(["handle", "file_name", "date"])
        .stack()
        .reset_index()
        .rename(columns={0: 'value', 'level_3': 'metric'})
)

In [40]:
chart_df['color'] = chart_df.value.apply(color_code)

In [41]:
chart_df.value = chart_df.value * 100

In [42]:
chart_df.metric = chart_df.metric.str.capitalize().str.replace("_" , " ").replace("Seo", "SEO")

In [43]:
chart_df.head()

Unnamed: 0,handle,file_name,date,metric,value,color
0,nytimes,nytimes-2023-01-02T21:28:28.187158-05:00.light...,2023-01-03,Performance,29.0,red
1,nytimes,nytimes-2023-01-02T21:28:28.187158-05:00.light...,2023-01-03,Accessibility,100.0,green
2,nytimes,nytimes-2023-01-02T21:28:28.187158-05:00.light...,2023-01-03,Best practices,83.0,orange
3,nytimes,nytimes-2023-01-02T21:28:28.187158-05:00.light...,2023-01-03,SEO,98.0,green
4,nytimes,nytimes-2023-01-03T07:29:13.517216-05:00.light...,2023-01-03,Performance,34.0,red


In [44]:
alt.Chart(chart_df).mark_tick(height=20, opacity=0.9).encode(
    x=alt.X('value:Q', axis=alt.Axis(title=None)),
    y=alt.Y('metric:O', title=None),
    color=alt.Color("color:N", legend=None, scale=alt.Scale(domain=["green", "orange", "red"], range=["green", "orange", "red"])),
    tooltip=["metric", "date", "value"]
).properties(
    title="Lighthouse scores over last 7 days",
    width=500,
    height=175
).configure_axisY(
    labelFontSize=14,
)

  for col_name, dtype in df.dtypes.iteritems():


In [45]:
def _round(val):
    return np.floor(np.floor(val * 1000)/100)*10

In [46]:
_round(0.67)

60.0

In [38]:
flat_df['performance_decile'] = flat_df.performance_median.apply(_round)

In [39]:
flat_df.head()

Unnamed: 0_level_0,performance_count,performance_median,performance_mean,performance_min,performance_max,performance_std,accessibility_count,accessibility_median,accessibility_mean,accessibility_min,...,best_practices_count,best_practices_median,best_practices_mean,best_practices_min,best_practices_max,best_practices_std,performance_color,accessibility_color,performance_rank,performance_decile
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100Reporters,15,0.16,0.163333,0.14,0.2,0.017593,15,0.89,0.89,0.89,...,15,0.58,0.58,0.58,0.58,0.0,red,orange,638.0,10.0
11AliveNews,15,0.22,0.225333,0.15,0.33,0.046884,15,0.83,0.820667,0.81,...,15,0.75,0.733333,0.58,0.75,0.047157,red,orange,503.0,20.0
12NewsNow,15,0.17,0.217333,0.08,0.36,0.081545,15,0.81,0.812667,0.81,...,15,0.83,0.802667,0.58,0.83,0.074111,red,orange,608.0,10.0
12khari,15,0.16,0.165333,0.04,0.31,0.074053,15,0.78,0.78,0.78,...,15,0.83,0.808667,0.75,0.83,0.036619,red,orange,638.0,10.0
13wmaznews,14,0.24,0.247143,0.14,0.38,0.079752,14,0.83,0.821429,0.81,...,14,0.83,0.784286,0.67,0.83,0.06813,red,orange,461.0,20.0


In [40]:
histogram_df = flat_df.performance_decile.value_counts().reset_index()

In [41]:
histogram_df['index'] = histogram_df['index'].astype(int)

In [42]:
list(range(0, 11))*10

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10]

In [43]:
alt.Chart(histogram_df).mark_bar(opacity=0.9).encode(
    alt.X("index:Q", axis=alt.Axis(format='.0f'), bin=True, title="Median score"),
    y=alt.Y('performance_decile:Q', title="Number of sites"),
).properties(
    title="Lighthouse performance scores",
    width=500
)