In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
import os
import pandas as pd
from tqdm import tqdm

from aggregators import bucket_and_bin
from config import city2ap

In [13]:
# Input files: one speed/price CSV per ISP.
fn_att = '../data/output/speed_price_att.csv.gz'
fn_centurylink = '../data/output/speed_price_centurylink.csv.gz'
fn_verizon = '../data/output/speed_price_verizon.csv.gz'
fn_earthlink = '../data/output/speed_price_earthlink.csv.gz'

# ISP display name -> path of that ISP's CSV (insertion order is the processing order).
inputs = dict([
    ("AT&T", fn_att),
    ("CenturyLink", fn_centurylink),
    ("Verizon", fn_verizon),
    ("EarthLink", fn_earthlink),
])

# Directory that receives the per-city CSVs; created up front, no error if it already exists.
dir_out = '../data/output/by_city'
os.makedirs(name=dir_out, exist_ok=True)

In [4]:
def filter_df(fn, isp):
    """
    Load one ISP's offers and drop the rows and cities we can't analyze.

    Steps, in order:
      1. read the CSV at ``fn``;
      2. drop no-service offers (``speed_down == 0``);
      3. run ``bucket_and_bin`` (from ``aggregators``) and tag rows with ``isp``;
      4. apply the ISP-specific rules below;
      5. drop the cities listed in ``homogenous_cities``.

    Parameters
    ----------
    fn : str
        Path of the offers CSV, read with ``pd.read_csv``.
    isp : str
        Provider label written to the ``isp`` column. ``'Verizon'`` and
        ``'EarthLink'`` trigger extra filtering; any other value does not.

    Returns
    -------
    pd.DataFrame
        The filtered offers. For Verizon, the New York borough rows are
        relabelled ``'new york city'`` and placed after all other rows.
    """
    df = pd.read_csv(fn)
    df = df[df.speed_down != 0]
    # `assign` returns a new frame, so we never write into a filtered slice.
    df = bucket_and_bin(df).assign(isp=isp)

    if isp == 'Verizon':
        # Prices 40 and 49.99 are relabelled 39.99; only 39.99 offers are kept.
        df = df.assign(price=df.price.replace({40: 39.99, 49.99: 39.99}))
        df = df[df.price == 39.99]

        # Merge the boroughs into a single 'new york city' entry.
        nyc_cities = ['new york', 'brooklyn', 'queens', 'staten island', 'bronx']
        is_nyc = df.major_city.isin(nyc_cities)
        nyc = (
            df[is_nyc]
            # Stable sort keeps the borough-grouped row order that the previous
            # groupby-based implementation produced, so exported CSVs don't change.
            .sort_values('major_city', kind='stable')
            .assign(major_city='new york city')
        )
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        df = pd.concat([df[~is_nyc], nyc])

    elif isp == 'EarthLink':
        df = df[df.contract_provider.isin(['AT&T', 'CenturyLink'])]

    homogenous_cities = {'bridgeport', 'wilmington'}
    df = df[~df.major_city.isin(homogenous_cities)]
    return df

In [6]:
# Columns selected, in this order, when each per-city frame is written to CSV.
cols_to_keep = [
    'address_full',
    'major_city',
    'state',
    'lat',
    'lon',
    'block_group',
    'collection_datetime',
    'provider',
    'price',
    'speed_down',
    'speed_up',
    'speed_unit',
    'technology',
    'package',
    'fastest_speed_down',
    'fastest_speed_price',
    'speed_down_bins',
    'redlining_grade',
    'race_perc_non_white',
    'race_quantile',
    'median_household_income',
    'income_dollars_below_median',
    'income_level',
    'ppl_per_sq_mile',
    'n_providers',
    'internet_perc_broadband',
]

In [8]:
# Lower-cased provider key -> display name.
# NOTE(review): not referenced by any cell shown here — confirm it is still needed.
provider2provider = {
    display_name.lower(): display_name
    for display_name in ('AT&T', 'CenturyLink', 'EarthLink', 'Verizon')
}

In [9]:
# City names collected for removal.
# NOTE(review): not referenced by any cell shown here — confirm where (or whether) it is applied.
to_remove = set((
    'cleveland', 'memphis', 'baltimore', 'providence',
    'new york city', 'philadelphia', 'boston',
))

In [14]:
# Write one CSV of plans per (city, ISP) pair into `dir_out`.
for isp, fn in inputs.items():
    # EarthLink rows carry a `contract_provider` column (filter_df filters on it),
    # so it is exported as well; a new list is built so `cols_to_keep` is not mutated.
    cols_to_keep_ = cols_to_keep + (['contract_provider'] if isp == 'EarthLink' else [])

    df = filter_df(fn, isp=isp)
    for city, _df in tqdm(df.groupby('major_city')):
        # bucket_and_bin is re-run on the single-city subset before export.
        _df = bucket_and_bin(_df)
        fn_city = os.path.join(dir_out, f'{city}_{isp.lower()}_plans.csv')
        _df[cols_to_keep_].to_csv(fn_city, index=False)

100%|██████████| 20/20 [00:11<00:00,  1.78it/s]
100%|██████████| 15/15 [00:06<00:00,  2.46it/s]
  df = df.append(nyc)
100%|██████████| 8/8 [00:07<00:00,  1.04it/s]
100%|██████████| 33/33 [00:12<00:00,  2.54it/s]
