# Analyzing download speeds in Kentucky counties using Python

In [1]:
%matplotlib inline

from datetime import datetime

import geopandas as gp
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from shapely.geometry import Point
from adjustText import adjust_text

---

## Download data

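First, download the Ookla open-data performance tiles. The helper functions below build the download URL for a given service type, year, and quarter.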

In [2]:
def quarter_start(year: int, q: int) -> datetime:
    """Return the first day of calendar quarter ``q`` in ``year``.

    Args:
        year: Calendar year.
        q: Quarter number, 1 through 4.

    Returns:
        A ``datetime`` for the 1st of January, April, July or October
        (for quarters 1-4 respectively) at midnight.

    Raises:
        ValueError: If ``q`` is outside the range 1..4.
    """
    in_range = 1 <= q <= 4
    if not in_range:
        raise ValueError("Quarter must be within [1, 2, 3, 4]")

    # Month in which each quarter begins, indexed by (quarter - 1).
    first_months = (1, 4, 7, 10)
    return datetime(year=year, month=first_months[q - 1], day=1)


def get_tile_url(service_type: str, year: int, q: int) -> str:
    """Build the URL of the Ookla performance-tiles shapefile zip for one quarter.

    Args:
        service_type: Inserted verbatim into the ``type=`` path segment and
            the file name (this notebook passes ``"fixed"``).
        year: Calendar year of the dataset.
        q: Quarter number, 1 through 4.

    Returns:
        The full URL as a string. ``%3D`` in the path is a URL-encoded ``=``.

    Raises:
        ValueError: Propagated from ``quarter_start`` when ``q`` is not in 1..4.
    """
    start = quarter_start(year, q)

    # Path layout: <base>/type=<type>/year=<YYYY>/quarter=<q>/<YYYY-MM-DD>_performance_<type>_tiles.zip
    segments = [
        "https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance",
        f"type%3D{service_type}",
        f"year%3D{start:%Y}",
        f"quarter%3D{q}",
        f"{start:%Y-%m-%d}_performance_{service_type}_tiles.zip",
    ]
    return "/".join(segments)

In [3]:
# Download one year of Ookla tiles, quarter by quarter, spatially join them to
# 2020 Census tabulation blocks, and append the result to one compressed CSV
# per county under ../data/.
year = 2022
t = "fixed"  # service type passed to get_tile_url

# 2020 Census tabulation-block shapefile for one state; {state} is the
# two-digit state FIPS code.
county_url = 'https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/TABBLOCK/2020/tl_2020_{state}_tabblock20.zip'

# State FIPS codes to process. Only Maryland (24) is enabled; the full set of
# interest is Alabama (01), DC (11), Georgia (13), Maryland (24), Virginia (51).
states = ['24']  # ['01', '11', '13', '24', '51']

# Columns written to each per-county CSV.
output_columns = ['GEOID20', 'avg_d_mbps', 'avg_u_mbps', 'tests', 'devices', 'avg_lat_ms', 'year', 'q']

# Create the output directory up front so a missing folder does not abort the
# run only after the (slow) downloads have finished.
os.makedirs('../data', exist_ok=True)

# The Census blocks do not change between quarters, so each state's shapefile
# is downloaded and re-projected once and then reused for every quarter.
blocks_by_state = {}

# For every quarter
pbar = tqdm(range(1, 5))
for q in pbar:
    tile_url = get_tile_url(t, year, q)
    tiles = gp.read_file(tile_url)
    tiles['year'] = year
    tiles['q'] = q
    pbar.set_description('Tiles read')

    for state in states:
        pbar.set_description('Parsing for state: %s' % state)
        if state not in blocks_by_state:
            counties = gp.read_file(county_url.format(state=state))
            blocks_by_state[state] = counties.to_crs(4326)
        state_counties = blocks_by_state[state]

        # Keep every (tile, block) pair whose geometries intersect.
        tiles_in_state_counties = gp.sjoin(tiles, state_counties, how="inner", predicate='intersects')
        # convert to Mbps for easier reading
        tiles_in_state_counties['avg_d_mbps'] = tiles_in_state_counties['avg_d_kbps'] / 1000
        tiles_in_state_counties['avg_u_mbps'] = tiles_in_state_counties['avg_u_kbps'] / 1000
        # The first five characters of the block GEOID are used as the county key.
        tiles_in_state_counties['county'] = tiles_in_state_counties['GEOID20'].str[:5]

        for county in tiles_in_state_counties['county'].unique():
            pbar.set_description('Processing county: %s' % county)
            pdf = tiles_in_state_counties[tiles_in_state_counties['county'] == county]
            pdf = pdf.reindex(columns=output_columns)
            save_path = '../data/%s.csv.xz' % county
            # Check if a previous one exists
            if os.path.isfile(save_path):
                # Read GEOID20 back as text and floats with exact round-trip
                # parsing; otherwise the saved rows come back with different
                # dtypes/values than the fresh ones and drop_duplicates()
                # cannot match them, so re-running the cell piles up duplicates.
                old_pdf = pd.read_csv(
                    save_path,
                    dtype={'GEOID20': str},
                    float_precision='round_trip',
                )
                pdf = pd.concat([old_pdf, pdf]).drop_duplicates()  # remove duplicates
            pdf.to_csv(save_path, index=False)

Processing county: 24047: 100%|██████████| 4/4 [1:09:03<00:00, 1035.88s/it]


---