# Extract Progressive auto table

Author: Mo Al Elew

**What notebook does/produces:**

Extracts territory rate setting tables from Auto Club filing PDFs and produces data files for analysis

**Issue/problem being solved:**

- Table is stored in a PDF

**Strategy to solve:**

General pattern:

1. Extract table data from PDF
2. Clean and process the data into a standardized format
3. Run any necessary caclulations to produce location effect figure
4. Match on place name to the numeric identifier

**Sources:**

- data.census.gov
- https://www2.census.gov/geo/tiger/TIGER2020/
- System for Electronic Rates and Forms Filing (SERFF)

Note: demographic data is not matched on since it is not clear what geographies are used by Progressive.

In [1]:
import pandas as pd
import stringcase
import tabula

# Constants

In [2]:
LOC_FACTOR_PDF_FP = "./inputs/202301-prog-michigan-location-factor-table.pdf"
LOC_TO_TERRITORY_PDF_FP = (
    "./inputs/202301-prog-michigan-location-to-territory-table.pdf"
)

BASE_COLUMNS = [
    "company",
    "vehicle_type",
    "factor_circumvented",
    "factor_name",
    "geography_factor",
    "geography_type",
    "geography_factor_id",
    "latitude",
    "longitude",
    "county_fips",
    "tract_fips",
    "block_group_fips",
    "zip",
    "place_name_fips",
]

COMPANY_NAME = "Progressive"
VEHICLE_TYPE = "Auto"
FACTOR_CIRCUMVENTED = "zip"
FACTOR_NAME = "Vehicle Garaging Location Factor"
GEOGRAPHY_FACTOR = "Territory Code"
GEOGRAPHY_TYPE = "place_name"

PROCESSED_EXPORT_FP = "./outputs/progressive_auto.csv"
PROCESSED_NUM_ONLY_EXPORT_FP = "./outputs/progressive_auto_num_only.csv"

In [3]:
# pulled by hand from ./inputs/auto_base_rates.pdf
BASE_RATES = {
    "BI": 73.44,
    "PD": 8.4,
    "LPD": 6.3,
    "PPI": 20.63,
    "COMP": 170.23,
    "COLL": 431.67,
    "LOAN": 16.27,
    "ME": 707.97,
    "WL": 37.5,
    "ACR": 15.93,
    "UM/UIM": 29.5,
    "RENT": 27.39,
    "TOW": 5.41,
    "ACPE": 5.17,
    "COMP-TRLR": 51.48,
    "COLL-TRLR": 61.02,
    "OPS-EXP": 23.26,
}

In [4]:
RATE_PREFIX = "rate_"


def get_rate_columns(df_rate_table):
    return [col for col in df_rate_table.columns if col.startswith(RATE_PREFIX)]


def get_avg_rate_factor(df_rate_table):
    rate_cols = get_rate_columns(df_rate_table)
    return df_rate_table[rate_cols].mean(axis=1)

# Read rate table

In [5]:
col2str = {"dtype": str}
kwargs = {"pandas_options": col2str}
tables = tabula.read_pdf(LOC_FACTOR_PDF_FP, pages="all", stream=True, **kwargs)

DF_LOC_TABLE = pd.concat(tables)
DF_LOC_TABLE

Unnamed: 0,Territory Code,BI,PD/LPD/PPI,COMP,COLL,LOAN,ME/WL/ACR,UM/UIM,RENT,TOW,ACPE,COMP-TRLR,COLL-TRLR
0,1,0.90,0.88,1.47,1.13,1.47,0.91,0.91,1.13,1.13,1.47,1.47,1.13
1,2,1.06,2.23,1.00,0.74,1.00,0.85,0.85,0.74,0.74,1.00,1.00,0.74
2,3,1.01,1.05,1.96,0.74,1.96,0.59,0.59,0.74,0.74,1.96,1.96,0.74
3,4,1.24,0.95,0.51,0.81,0.51,0.77,0.77,0.81,0.81,0.51,0.51,0.81
4,5,1.24,0.96,1.01,1.24,1.01,1.16,1.16,1.24,1.24,1.01,1.01,1.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,389,1.30,0.84,0.56,1.48,0.56,1.00,1.00,1.48,1.48,0.56,0.56,1.48
19,390,1.12,0.95,0.77,1.29,0.77,1.33,1.33,1.29,1.29,0.77,0.77,1.29
20,391,1.14,1.08,2.19,0.73,2.19,0.98,0.98,0.73,0.73,2.19,2.19,0.73
21,392,0.67,0.84,0.55,0.93,0.55,0.49,0.49,0.93,0.93,0.55,0.55,0.93


# Build processed table

In [6]:
DF_LOC_TABLE[DF_LOC_TABLE.columns[1:]] = DF_LOC_TABLE[DF_LOC_TABLE.columns[1:]].astype(
    float
)

In [7]:
df_processed_table = DF_LOC_TABLE.copy()

## Rename columns

In [8]:
def create_rate_label(col_name):
    return RATE_PREFIX + stringcase.snakecase(col_name.lower().replace("/", " "))


def rename_rate_columns(df):
    rate_col_names = df.columns[1:]
    extracted_rate_col_names = [
        create_rate_label(col_name) for col_name in rate_col_names
    ]
    rate_col_renames = dict(zip(rate_col_names, extracted_rate_col_names))
    return df.rename(columns=rate_col_renames)


df_processed_table = rename_rate_columns(df_processed_table)
rate_col_names = list(df_processed_table.columns[1:])
df_processed_table

Unnamed: 0,Territory Code,rate_bi,rate_pd_lpd_ppi,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,rate_acpe,rate_comp_trlr,rate_coll_trlr
0,1,0.90,0.88,1.47,1.13,1.47,0.91,0.91,1.13,1.13,1.47,1.47,1.13
1,2,1.06,2.23,1.00,0.74,1.00,0.85,0.85,0.74,0.74,1.00,1.00,0.74
2,3,1.01,1.05,1.96,0.74,1.96,0.59,0.59,0.74,0.74,1.96,1.96,0.74
3,4,1.24,0.95,0.51,0.81,0.51,0.77,0.77,0.81,0.81,0.51,0.51,0.81
4,5,1.24,0.96,1.01,1.24,1.01,1.16,1.16,1.24,1.24,1.01,1.01,1.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,389,1.30,0.84,0.56,1.48,0.56,1.00,1.00,1.48,1.48,0.56,0.56,1.48
19,390,1.12,0.95,0.77,1.29,0.77,1.33,1.33,1.29,1.29,0.77,0.77,1.29
20,391,1.14,1.08,2.19,0.73,2.19,0.98,0.98,0.73,0.73,2.19,2.19,0.73
21,392,0.67,0.84,0.55,0.93,0.55,0.49,0.49,0.93,0.93,0.55,0.55,0.93


## Calculate base rates

In [9]:
base_rates_relativity_match = {
    create_rate_label(key): value for key, value in BASE_RATES.items()
}
combined_coverage_relatives = set(rate_col_names) - set(
    base_rates_relativity_match.keys()
)
print(
    f"In the territories table these coverage share a single relativity: {combined_coverage_relatives}"
)

In the territories table these coverage share a single relativity: {'rate_pd_lpd_ppi', 'rate_me_wl_acr'}


In [10]:
base_rates_relativity_match["rate_pd_lpd_ppi"] = (
    base_rates_relativity_match["rate_pd"]
    + base_rates_relativity_match["rate_lpd"]
    + base_rates_relativity_match["rate_ppi"]
)
base_rates_relativity_match["rate_me_wl_acr"] = (
    base_rates_relativity_match["rate_me"]
    + base_rates_relativity_match["rate_wl"]
    + base_rates_relativity_match["rate_acr"]
)

In [11]:
for col in df_processed_table.columns[1:]:
    df_processed_table[col] = df_processed_table[col] * base_rates_relativity_match[col]

## Calculate location base rate

The rates are doubled to update from six months to 12 

In [12]:
df_processed_table[get_rate_columns(df_processed_table)] = df_processed_table[
    get_rate_columns(df_processed_table)
].astype(int)
df_processed_table["generic_location_based_premium"] = df_processed_table[
    get_rate_columns(df_processed_table)
].sum(axis=1)
df_processed_table["generic_location_based_premium"] = (
    df_processed_table["generic_location_based_premium"]
    + df_processed_table["generic_location_based_premium"]
)
df_processed_table

Unnamed: 0,Territory Code,rate_bi,rate_pd_lpd_ppi,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,rate_acpe,rate_comp_trlr,rate_coll_trlr,generic_location_based_premium
0,1,66,31,250,487,23,692,26,30,6,7,75,68,3522
1,2,77,78,170,319,16,647,25,20,4,5,51,45,2914
2,3,74,37,333,319,31,449,17,20,4,10,100,45,2878
3,4,91,33,86,349,8,586,22,22,4,2,26,49,2556
4,5,91,33,171,535,16,883,34,33,6,5,51,75,3866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,389,95,29,95,638,9,761,29,40,8,2,28,90,3648
19,390,82,33,131,556,12,1012,39,35,6,3,39,78,4052
20,391,83,38,372,315,35,746,28,19,3,11,112,44,3612
21,392,49,29,93,401,8,373,14,25,5,2,28,56,2166


## Index to median

In [13]:
median_loc_base_rate = df_processed_table["generic_location_based_premium"].median()
df_processed_table["location_effect"] = round(
    df_processed_table["generic_location_based_premium"] / median_loc_base_rate, 2
)

## Geography columns

Rename geography index column 

In [14]:
df_processed_table = df_processed_table.rename(
    columns={"Territory Code": "geography_factor_id"}
)
df_processed_table

Unnamed: 0,geography_factor_id,rate_bi,rate_pd_lpd_ppi,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,rate_acpe,rate_comp_trlr,rate_coll_trlr,generic_location_based_premium,location_effect
0,1,66,31,250,487,23,692,26,30,6,7,75,68,3522,1.15
1,2,77,78,170,319,16,647,25,20,4,5,51,45,2914,0.95
2,3,74,37,333,319,31,449,17,20,4,10,100,45,2878,0.94
3,4,91,33,86,349,8,586,22,22,4,2,26,49,2556,0.83
4,5,91,33,171,535,16,883,34,33,6,5,51,75,3866,1.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,389,95,29,95,638,9,761,29,40,8,2,28,90,3648,1.19
19,390,82,33,131,556,12,1012,39,35,6,3,39,78,4052,1.32
20,391,83,38,372,315,35,746,28,19,3,11,112,44,3612,1.18
21,392,49,29,93,401,8,373,14,25,5,2,28,56,2166,0.71


Set geography values

In [15]:
df_processed_table["place_name_fips"] = df_processed_table["geography_factor_id"].copy()
df_processed_table

Unnamed: 0,geography_factor_id,rate_bi,rate_pd_lpd_ppi,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,rate_acpe,rate_comp_trlr,rate_coll_trlr,generic_location_based_premium,location_effect,place_name_fips
0,1,66,31,250,487,23,692,26,30,6,7,75,68,3522,1.15,1
1,2,77,78,170,319,16,647,25,20,4,5,51,45,2914,0.95,2
2,3,74,37,333,319,31,449,17,20,4,10,100,45,2878,0.94,3
3,4,91,33,86,349,8,586,22,22,4,2,26,49,2556,0.83,4
4,5,91,33,171,535,16,883,34,33,6,5,51,75,3866,1.26,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,389,95,29,95,638,9,761,29,40,8,2,28,90,3648,1.19,389
19,390,82,33,131,556,12,1012,39,35,6,3,39,78,4052,1.32,390
20,391,83,38,372,315,35,746,28,19,3,11,112,44,3612,1.18,391
21,392,49,29,93,401,8,373,14,25,5,2,28,56,2166,0.71,392


## Drop out of state

In [16]:
is_in_state = df_processed_table["geography_factor_id"].str.isnumeric()
assert (~is_in_state).sum() == 1
df_processed_table = df_processed_table[is_in_state].copy()
df_processed_table["geography_factor_id"] = df_processed_table[
    "geography_factor_id"
].astype(int)

## Fill constants

In [17]:
df_processed_table["company"] = COMPANY_NAME
df_processed_table["vehicle_type"] = VEHICLE_TYPE
df_processed_table["factor_circumvented"] = FACTOR_CIRCUMVENTED
df_processed_table["factor_name"] = FACTOR_NAME
df_processed_table["geography_factor"] = GEOGRAPHY_FACTOR
df_processed_table["geography_type"] = GEOGRAPHY_TYPE

In [18]:
def fill_null_columns(df, base_columns):
    base_col_to_fill = list(set(base_columns) - set(df.columns))
    df[base_col_to_fill] = None
    return df


df_processed_table = fill_null_columns(df_processed_table, BASE_COLUMNS)
df_processed_table

Unnamed: 0,geography_factor_id,rate_bi,rate_pd_lpd_ppi,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,...,factor_circumvented,factor_name,geography_factor,geography_type,latitude,longitude,tract_fips,zip,block_group_fips,county_fips
0,1,66,31,250,487,23,692,26,30,6,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
1,2,77,78,170,319,16,647,25,20,4,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
2,3,74,37,333,319,31,449,17,20,4,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
3,4,91,33,86,349,8,586,22,22,4,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
4,5,91,33,171,535,16,883,34,33,6,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,388,63,42,137,297,13,472,18,18,3,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
18,389,95,29,95,638,9,761,29,40,8,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
19,390,82,33,131,556,12,1012,39,35,6,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,
20,391,83,38,372,315,35,746,28,19,3,...,zip,Vehicle Garaging Location Factor,Territory Code,place_name,,,,,,


In [19]:
DF_LOC_TABLE.to_csv("./outputs/rate_table.csv", index=False)

# Territory to location mappings

I removed Camelot as a requirement in the dev environment, so I import the previously extracted tables. I retain the code using Camelot to extract this data commented out in the [Appendix](#Appendix) section.

In [20]:
df_loc_to_territory = pd.read_csv("./outputs/loc_to_territory_mappings.csv")
df_locations_expanded = pd.read_csv("./outputs/loc_to_territory_m1_mappings.csv")

# Merge place name rate table

In [21]:
df_processed_table_merged_place_names = df_processed_table.merge(
    df_locations_expanded,
    left_on="geography_factor_id",
    right_index=True,
    validate="1:m",
)
df_processed_table_merged_place_names.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 21
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   geography_factor_id             392 non-null    int64  
 1   rate_bi                         392 non-null    int64  
 2   rate_pd_lpd_ppi                 392 non-null    int64  
 3   rate_comp                       392 non-null    int64  
 4   rate_coll                       392 non-null    int64  
 5   rate_loan                       392 non-null    int64  
 6   rate_me_wl_acr                  392 non-null    int64  
 7   rate_um_uim                     392 non-null    int64  
 8   rate_rent                       392 non-null    int64  
 9   rate_tow                        392 non-null    int64  
 10  rate_acpe                       392 non-null    int64  
 11  rate_comp_trlr                  392 non-null    int64  
 12  rate_coll_trlr                  392 non-nu

# Export data

In [22]:
df_loc_to_territory.to_csv("./outputs/loc_to_territory_mappings.csv", index=False)
df_locations_expanded.to_csv("./outputs/loc_to_territory_m1_mappings.csv")

In [23]:
expected_col_order = BASE_COLUMNS + rate_col_names
df_export = df_processed_table_merged_place_names[expected_col_order].copy()
df_export.to_csv(PROCESSED_EXPORT_FP, index=False)

df_export_num_only = df_processed_table[expected_col_order].copy()
df_export_num_only.to_csv(PROCESSED_NUM_ONLY_EXPORT_FP, index=False)

df_export_num_only

Unnamed: 0,company,vehicle_type,factor_circumvented,factor_name,geography_factor,geography_type,geography_factor_id,latitude,longitude,county_fips,...,rate_comp,rate_coll,rate_loan,rate_me_wl_acr,rate_um_uim,rate_rent,rate_tow,rate_acpe,rate_comp_trlr,rate_coll_trlr
0,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,1,,,,...,250,487,23,692,26,30,6,7,75,68
1,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,2,,,,...,170,319,16,647,25,20,4,5,51,45
2,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,3,,,,...,333,319,31,449,17,20,4,10,100,45
3,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,4,,,,...,86,349,8,586,22,22,4,2,26,49
4,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,5,,,,...,171,535,16,883,34,33,6,5,51,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,388,,,,...,137,297,13,472,18,18,3,4,41,42
18,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,389,,,,...,95,638,9,761,29,40,8,2,28,90
19,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,390,,,,...,131,556,12,1012,39,35,6,3,39,78
20,Progressive,Auto,zip,Vehicle Garaging Location Factor,Territory Code,place_name,391,,,,...,372,315,35,746,28,19,3,11,112,44


# Appendix

In [24]:
# tables = camelot.read_pdf(LOC_TO_TERRITORY_PDF_FP, pages="all", flavor="stream")
# column_names = list(tables[0].df.loc[1])
# print(f"Extracted column names: {column_names}")
# tables = [table.df[2:] for table in tables]
# df_loc_to_territory = pd.concat(tables).reset_index(drop=True)

# def join_to_succeeding_row(df, index):
#     df.iloc[index + 1][1] = " ".join(
#         [
#             df.iloc[index][1],
#             df.iloc[index + 1][1],
#         ]
#     )
#     df = df.drop(index)
#     return df


# split_rows_index = df_loc_to_territory[df_loc_to_territory[0] == ""].index
# for index in split_rows_index:
#     df_loc_to_territory = join_to_succeeding_row(df_loc_to_territory, index)

# df_loc_to_territory.columns = column_names
# # all caps to represent raw data copy of the extraction
# DF_LOC_TO_TERRITORY = df_loc_to_territory.copy()

# df_loc_to_territory.columns = [col.lower().replace(" ", "_") for col in column_names]
# df_loc_to_territory

# def split_at_comma(val):
#     return val.split(", ")


# df_locations_expanded = df_loc_to_territory.set_index("territory_code")
# df_locations_expanded["location_names"] = df_locations_expanded["location_names"].apply(
#     split_at_comma
# )
# df_locations_expanded = df_locations_expanded.explode("location_names")
# df_locations_expanded

# is_numeric_place_num = df_loc_to_territory["territory_code"].str.isnumeric()
# is_loc_name_dupe = df_locations_expanded["location_names"].duplicated(keep=False)

# assert len(df_loc_to_territory[~is_numeric_place_num]) == 0
# assert len(df_locations_expanded[is_loc_name_dupe]) == 0