# Analyze Lighthouse

Experimentation with analyzing Lighthouse scores.

In [1]:
import sys
import ast
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import altair as alt

In [3]:
# `__file__` is not defined inside a notebook kernel, so anchor every relative
# path at the kernel's working directory (assumed to be the notebook's own
# folder — confirm if the kernel is launched from elsewhere).
this_dir = Path.cwd()

In [4]:
# Make the repository root importable. sys.path entries must be plain strings;
# the import system ignores other types such as pathlib.Path objects.
sys.path.append(str(this_dir.parent))

In [5]:
# Expose the sibling "newshomepages" directory to the import system as well.
sys.path.append(str(this_dir.parent.joinpath("newshomepages")))

In [6]:
import altair_theme

In [7]:
# Register the theme callable from the local `altair_theme` module under the
# name 'palewire', then make it the active Altair theme for every chart below.
alt.themes.register('palewire', altair_theme.theme)
alt.themes.enable('palewire')

ThemeRegistry.enable('palewire')

In [8]:
# Directory holding the CSV extracts, located beside this notebook's folder.
extracts_dir = this_dir.parent.joinpath("extracts", "csv")

In [9]:
# Sibling "_analysis" directory, located beside this notebook's folder.
analysis_dir = this_dir.parent.joinpath("_analysis")

Read in the dataframe

In [10]:
# Load only the columns this analysis needs. Every score column is pinned to
# float so that all four metrics are parsed the same way instead of leaving
# some of them to type inference; `date` is parsed into datetimes.
df = pd.read_csv(
    extracts_dir / "lighthouse-sample.csv",
    usecols=[
        'handle',
        'file_name',
        'date',
        'performance',
        'accessibility',
        'seo',
        'best_practices',
    ],
    dtype={
        'handle': str,
        'file_name': str,
        'performance': float,
        'accessibility': float,
        'seo': float,
        'best_practices': float,
    },
    parse_dates=["date"]
)

In [11]:
# Sanity-check the load: column dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12630 entries, 0 to 12629
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   handle          12630 non-null  object        
 1   file_name       12630 non-null  object        
 2   date            12630 non-null  datetime64[ns]
 3   performance     12628 non-null  float64       
 4   accessibility   12630 non-null  float64       
 5   best_practices  12491 non-null  float64       
 6   seo             12630 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 690.8+ KB


Exclude any sites with fewer than 10 observations

In [12]:
# Count how many rows each handle contributes; the count lands in column "n".
observations_by_site = df.groupby("handle").size().reset_index(name="n")

In [13]:
# Handles with fewer than 10 rows are too thin to summarize.
not_qualified = observations_by_site.loc[observations_by_site["n"] < 10]

In [14]:
# Keep every row whose handle is not on the exclusion list; copy so later
# edits never write back into `df`.
qualified_df = df.loc[~df["handle"].isin(not_qualified["handle"])].copy()

Aggregate descriptive statistics for each metric.

In [15]:
# Compute the same six summary statistics for each of the four metrics, per
# handle. The result has two-level columns: (metric, statistic).
agg_df = qualified_df.groupby("handle").agg({
    metric: ['count', 'median', 'mean', 'min', 'max', 'std']
    for metric in ('performance', 'accessibility', 'seo', 'best_practices')
})

In [16]:
# Preview the per-handle aggregates (two-level columns: metric, statistic).
agg_df

Unnamed: 0_level_0,performance,performance,performance,performance,performance,performance,accessibility,accessibility,accessibility,accessibility,...,seo,seo,seo,seo,best_practices,best_practices,best_practices,best_practices,best_practices,best_practices
Unnamed: 0_level_1,count,median,mean,min,max,std,count,median,mean,min,...,mean,min,max,std,count,median,mean,min,max,std
handle,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
100Reporters,15,0.160,0.163333,0.14,0.20,0.017593,15,0.89,0.890000,0.89,...,0.875333,0.87,0.88,0.005164,15,0.58,0.580000,0.58,0.58,0.000000
11AliveNews,15,0.220,0.225333,0.15,0.33,0.046884,15,0.83,0.820667,0.81,...,0.700000,0.70,0.70,0.000000,15,0.75,0.733333,0.58,0.75,0.047157
12NewsNow,15,0.170,0.217333,0.08,0.36,0.081545,15,0.81,0.812667,0.81,...,0.700000,0.70,0.70,0.000000,15,0.83,0.802667,0.58,0.83,0.074111
12khari,15,0.160,0.165333,0.04,0.31,0.074053,15,0.78,0.780000,0.78,...,0.850000,0.85,0.85,0.000000,15,0.83,0.808667,0.75,0.83,0.036619
13wmaznews,14,0.240,0.247143,0.14,0.38,0.079752,14,0.83,0.821429,0.81,...,0.777143,0.77,0.78,0.004688,14,0.83,0.784286,0.67,0.83,0.068130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yorkdispatch,14,0.830,0.817143,0.70,0.86,0.047786,14,0.88,0.882857,0.88,...,0.930000,0.93,0.93,0.000000,14,0.92,0.920000,0.92,0.92,0.000000
zeitonline,14,0.505,0.492857,0.39,0.58,0.052540,14,0.89,0.888571,0.86,...,0.990000,0.99,0.99,0.000000,14,0.92,0.920000,0.92,0.92,0.000000
zerohedge,20,0.455,0.447000,0.30,0.54,0.061738,20,0.95,0.950000,0.95,...,0.898500,0.89,0.90,0.003663,20,0.92,0.915500,0.83,0.92,0.020125
zerohora,19,0.260,0.259474,0.24,0.29,0.011773,19,0.84,0.840000,0.84,...,0.910000,0.91,0.91,0.000000,19,0.92,0.943684,0.83,1.00,0.062912


Flatten the dataframe

In [17]:
# Work on a copy and collapse each (metric, statistic) column pair into a
# single "metric_statistic" label.
flat_df = agg_df.copy()
flat_df.columns = flat_df.columns.map('_'.join)

In [18]:
# Order by number of performance observations, fewest first.
flat_df.sort_values(by="performance_count")

Unnamed: 0_level_0,performance_count,performance_median,performance_mean,performance_min,performance_max,performance_std,accessibility_count,accessibility_median,accessibility_mean,accessibility_min,...,seo_mean,seo_min,seo_max,seo_std,best_practices_count,best_practices_median,best_practices_mean,best_practices_min,best_practices_max,best_practices_std
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ThahaKhabar,10,0.38,0.366000,0.21,0.47,0.074416,10,0.82,0.820000,0.82,...,0.760000,0.76,0.76,0.000000,10,0.75,0.750000,0.75,0.75,0.000000
oronline,11,0.07,0.105455,0.05,0.21,0.062508,11,0.81,0.802727,0.79,...,0.750000,0.75,0.75,0.000000,11,0.75,0.803636,0.75,0.92,0.078393
tass_agency,11,1.00,0.916364,0.08,1.00,0.277390,11,0.71,0.713636,0.71,...,0.710909,0.69,0.92,0.069348,10,0.83,0.830000,0.83,0.83,0.000000
OANN,11,0.12,0.133636,0.08,0.23,0.041779,11,0.87,0.871818,0.87,...,0.910000,0.91,0.91,0.000000,11,0.83,0.830000,0.83,0.83,0.000000
occrp,12,0.57,0.570000,0.53,0.61,0.022563,12,0.86,0.860000,0.86,...,0.840000,0.84,0.84,0.000000,12,0.92,0.920000,0.92,0.92,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
foxnews,29,0.18,0.179655,0.14,0.23,0.015232,29,0.83,0.832759,0.83,...,0.820000,0.82,0.82,0.000000,29,0.83,0.830000,0.83,0.83,0.000000
ajc,33,0.09,0.102424,0.06,0.14,0.025376,33,0.68,0.680000,0.68,...,0.840000,0.84,0.84,0.000000,33,0.83,0.830000,0.83,0.83,0.000000
globeandmail,33,0.19,0.180000,0.10,0.28,0.047566,33,1.00,0.996970,0.98,...,0.975152,0.92,0.99,0.029060,33,0.92,0.917273,0.83,0.92,0.015667
baltimoresun,35,0.09,0.093429,0.06,0.17,0.027110,35,0.86,0.869143,0.86,...,0.844000,0.84,0.86,0.008117,35,0.75,0.750000,0.75,0.75,0.000000


Classify the scores

In [19]:
def color_code(val):
    """Map a 0-1 Lighthouse score onto Google's traffic-light bucket.

    Scores of 0.9 and above are 'green', scores from 0.5 up to (but not
    including) 0.9 are 'orange', and anything else is 'red'. A value that
    compares false against both thresholds (NaN, for instance) also falls
    through to 'red'.

    Source: https://developer.chrome.com/docs/lighthouse/performance/performance-scoring/
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((.9, 'green'), (.5, 'orange')):
        if val >= threshold:
            return label
    return 'red'

In [20]:
# Bucket each site's median performance score into green/orange/red.
flat_df['performance_color'] = flat_df['performance_median'].map(color_code)

In [21]:
# Bucket each site's median accessibility score into green/orange/red.
flat_df['accessibility_color'] = flat_df['accessibility_median'].map(color_code)

Rank the sites by median performance score

In [22]:
# Rank 1 is the highest median; tied sites share the lowest rank in their group.
flat_df['performance_rank'] = flat_df['performance_median'].rank(
    method="min",
    ascending=False,
)

In [23]:
# Show the ranking, best first, alongside the median it was derived from.
flat_df.sort_values(by="performance_rank").loc[
    :, ['performance_median', 'performance_rank']
]

Unnamed: 0_level_0,performance_median,performance_rank
handle,Unnamed: 1_level_1,Unnamed: 2_level_1
tass_agency,1.000,1.0
insideclimate,1.000,1.0
techmeme,0.975,3.0
gigharbornow,0.960,4.0
lobs,0.955,5.0
...,...,...
abc7breaking,0.030,760.0
BostonHerald,0.020,771.0
JournalStarNews,0.020,771.0
portalimprensa,0.020,771.0


Total up the colors

In [24]:
# Number of sites in each performance color bucket.
flat_df['performance_color'].value_counts()

red       599
orange    166
green       9
Name: performance_color, dtype: int64

In [25]:
# Share of sites in each performance color bucket.
flat_df['performance_color'].value_counts(normalize=True)

red       0.773902
orange    0.214470
green     0.011628
Name: performance_color, dtype: float64

In [26]:
# Number of sites in each accessibility color bucket.
flat_df['accessibility_color'].value_counts()

orange    532
green     238
red         4
Name: accessibility_color, dtype: int64

In [27]:
# Share of sites in each accessibility color bucket.
flat_df['accessibility_color'].value_counts(normalize=True)

orange    0.687339
green     0.307494
red       0.005168
Name: accessibility_color, dtype: float64

In [28]:
# Distribution of the per-site median performance score.
flat_df['performance_median'].describe()

count    774.000000
mean       0.347836
std        0.213956
min        0.010000
25%        0.190000
50%        0.280000
75%        0.473750
max        1.000000
Name: performance_median, dtype: float64

In [29]:
# Distribution of the per-site median accessibility score.
flat_df['accessibility_median'].describe()

count    774.000000
mean       0.844548
std        0.099731
min        0.430000
25%        0.790000
50%        0.860000
75%        0.920000
max        1.000000
Name: accessibility_median, dtype: float64

In [30]:
# Reshape one site's observations from wide (one column per metric) to long
# (one row per observation and metric). `stack` moves the metric columns into
# an unnamed fourth index level, which `reset_index` surfaces as "level_3".
chart_df = (
    qualified_df.loc[qualified_df["handle"] == 'nytimes']
    .set_index(["handle", "file_name", "date"])
    .stack()
    .rename('value')
    .reset_index()
    .rename(columns={'level_3': 'metric'})
)

In [31]:
# Classify each score with `color_code` and store the result as "color".
chart_df['color'] = chart_df['value'].map(color_code)

In [32]:
# Scale values by 100 for display. Not idempotent: re-running this cell
# multiplies the column again.
chart_df['value'] = chart_df['value'].mul(100)

In [33]:
# Turn column names into display labels: capitalize the first letter, swap
# underscores for spaces, and restore the acronym casing of "SEO". The final
# `.replace` is a whole-value replacement, not a substring one.
chart_df['metric'] = (
    chart_df['metric']
    .str.capitalize()
    .str.replace("_", " ")
    .replace("Seo", "SEO")
)

In [34]:
# Preview the long-format frame that feeds the tick chart.
chart_df.head()

Unnamed: 0,handle,file_name,date,metric,value,color
0,nytimes,nytimes-2022-08-11T01:19:38.394745-04:00.light...,2022-08-11,Performance,36.0,red
1,nytimes,nytimes-2022-08-11T01:19:38.394745-04:00.light...,2022-08-11,Accessibility,100.0,green
2,nytimes,nytimes-2022-08-11T01:19:38.394745-04:00.light...,2022-08-11,Best practices,83.0,orange
3,nytimes,nytimes-2022-08-11T01:19:38.394745-04:00.light...,2022-08-11,SEO,98.0,green
4,nytimes,nytimes-2022-08-11T12:55:19.137945-04:00.light...,2022-08-11,Performance,26.0,red


In [35]:
# One tick per observation, one row per metric, colored by score bucket. The
# color scale maps each bucket name onto the color of the same name.
(
    alt.Chart(chart_df)
    .mark_tick(height=20, opacity=0.9)
    .encode(
        x=alt.X('value:Q', axis=alt.Axis(title=None)),
        y=alt.Y('metric:O', title=None),
        color=alt.Color(
            "color:N",
            legend=None,
            scale=alt.Scale(
                domain=["green", "orange", "red"],
                range=["green", "orange", "red"],
            ),
        ),
        tooltip=["metric", "date", "value"],
    )
    .properties(
        title="Lighthouse scores over last 7 days",
        width=500,
        height=175,
    )
    .configure_axisY(labelFontSize=14)
)

In [36]:
def _round(val):
    return np.floor(np.floor(val * 1000)/100)*10

In [37]:
# Spot-check the helper: 0.67 should land in the 60 bucket.
_round(0.67)

60.0

In [38]:
# Assign each site's median performance score to its ten-point bucket.
flat_df['performance_decile'] = flat_df['performance_median'].map(_round)

In [39]:
# Preview the flattened frame, including the derived color, rank and decile columns.
flat_df.head()

Unnamed: 0_level_0,performance_count,performance_median,performance_mean,performance_min,performance_max,performance_std,accessibility_count,accessibility_median,accessibility_mean,accessibility_min,...,best_practices_count,best_practices_median,best_practices_mean,best_practices_min,best_practices_max,best_practices_std,performance_color,accessibility_color,performance_rank,performance_decile
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100Reporters,15,0.16,0.163333,0.14,0.2,0.017593,15,0.89,0.89,0.89,...,15,0.58,0.58,0.58,0.58,0.0,red,orange,638.0,10.0
11AliveNews,15,0.22,0.225333,0.15,0.33,0.046884,15,0.83,0.820667,0.81,...,15,0.75,0.733333,0.58,0.75,0.047157,red,orange,503.0,20.0
12NewsNow,15,0.17,0.217333,0.08,0.36,0.081545,15,0.81,0.812667,0.81,...,15,0.83,0.802667,0.58,0.83,0.074111,red,orange,608.0,10.0
12khari,15,0.16,0.165333,0.04,0.31,0.074053,15,0.78,0.78,0.78,...,15,0.83,0.808667,0.75,0.83,0.036619,red,orange,638.0,10.0
13wmaznews,14,0.24,0.247143,0.14,0.38,0.079752,14,0.83,0.821429,0.81,...,14,0.83,0.784286,0.67,0.83,0.06813,red,orange,461.0,20.0


In [40]:
# Count sites per decile bucket. The output column names are pinned
# explicitly ("index" = bucket, "performance_decile" = number of sites)
# because `value_counts().reset_index()` names them differently on
# pandas >= 2.0, and the cells below rely on these exact names.
histogram_df = (
    flat_df['performance_decile']
    .value_counts()
    .rename_axis("index")
    .reset_index(name="performance_decile")
)

In [41]:
# Bucket labels arrive as floats (10.0, 20.0, ...); store them as integers.
histogram_df = histogram_df.astype({'index': int})

In [42]:
# NOTE(review): scratch cell. Multiplying a list repeats it, so this yields
# 0..10 ten times over (110 items), not 0, 10, ..., 100. If decile edges were
# intended, `[i * 10 for i in range(11)]` would produce them. The value is
# not assigned to any name.
list(range(0, 11))*10

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10]

In [43]:
# Bar chart of how many sites fall into each performance bucket: "index"
# holds the bucket and "performance_decile" holds the site count.
(
    alt.Chart(histogram_df)
    .mark_bar(opacity=0.9)
    .encode(
        x=alt.X("index:Q", axis=alt.Axis(format='.0f'), bin=True, title="Median score"),
        y=alt.Y('performance_decile:Q', title="Number of sites"),
    )
    .properties(
        title="Lighthouse performance scores",
        width=500,
    )
)