# Data dict template

Author: Mo Al Elew

**What notebook does/produces:**

Create a central data sheet containing the summary statistics for all of the insurers

**Issue/problem being solved:**

Current project structure separates each insurer's data into different directories, so it is difficult to summarize differences in rating systems 

**Strategy to solve:**

1. Read in each insurer's data set
2. Filter for relevant column(s)
3. Pull summary stats using Pandas methods
4. Join outputs and export

In [1]:
import geopandas as gpd
import pandas as pd

# Constants

In [2]:
PROCESSED_RATE_TABLE_FPS = [
    "../02_allstate/outputs/allstate_auto.csv",
    "../03_auto_club_group/outputs/autoclub_auto.csv",
    "../04_liberty_mutual/outputs/libertymutual_auto.csv",
    "../05_state_farm/outputs/statefarm_auto.csv",
    "../06_citizens/outputs/citizens_auto_clean.csv",
]

DATA_FPS = {
    "Allstate": "../02_allstate/outputs/allstate_auto_clean.geojson",
    "Auto Club": "../03_auto_club_group/outputs/autoclub_auto_clean.geojson",
    "State Farm": "../05_state_farm/outputs/statefarm_auto_clean_gis.zip",
    "Liberty Mutual": "../04_liberty_mutual/outputs/libertymutual_auto_clean.geojson",
    "Citizens": "../06_citizens/outputs/citizens_auto_clean.geojson",
}

UNPROCESSED_DATA_FPS = {
    "Allstate": "../02_allstate/outputs/auto_rate_table.csv",
    "Auto Club": "../03_auto_club_group/outputs/rate_table.csv",
    "State Farm": "../05_state_farm/outputs/rate_table.csv",
    "Liberty Mutual": "../04_liberty_mutual/outputs/rate_table.csv",
}

RACE_CHART_DATA_FPS = {
    "Allstate": "../02_allstate/outputs/allstate_race_chart_data.csv",
    "Auto Club": "../03_auto_club_group/outputs/auto_club_race_chart_data.csv",
    "Liberty Mutual": "../04_liberty_mutual/outputs/liberty_mutual_race_chart_data.csv",
    "State Farm": "../05_state_farm/outputs/state_farm_race_chart_data.csv",
    "Citizens": "../06_citizens/outputs/citizens_race_chart_data.csv",
}

INCOME_CHART_DATA_FPS = {
    "Allstate": "../02_allstate/outputs/allstate_income_chart_data.csv",
    "Liberty Mutual": "../04_liberty_mutual/outputs/liberty_mutual_income_chart_data.csv",
    "State Farm": "../05_state_farm/outputs/state_farm_income_chart_data.csv",
}

OUTPUT_FP = "./outputs/rate_dict.csv"
DATA_DICT_COLUMN_EXPORT = "./outputs/clean_columns.csv"

# Helper

In [3]:
def build_rate_data_dict(processed_rate_table_fp):
    DICT_COLS = [
        "company",
        "geography_type",
        "premium",
        "count",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    PREMIUM_SUMMARY_COLS = ["location_effect", "generic_location_based_premium"]

    def get_company():
        return df["company"].unique()[0]

    def get_vehicle_type():
        return df["vehicle_type"].unique()[0]

    def get_georgraphy_type():
        return df["geography_type"].unique()[0]

    def get_rate_summaries(*summary_columns):
        df_summary = df[list(summary_columns)].describe()
        df_summary.columns = PREMIUM_SUMMARY_COLS
        return df_summary.transpose()

    df = pd.read_csv(processed_rate_table_fp)
    df_dict_template = pd.DataFrame(
        {"premium": PREMIUM_SUMMARY_COLS},
    )
    if processed_rate_table_fp != "../06_citizens/outputs/citizens_auto_clean.csv":
        df_dict_template["company"] = get_company()
        df_dict_template["geography_type"] = get_georgraphy_type()
    else:
        df_dict_template["company"] = "Citizens"
        df_dict_template["geography_type"] = "Census block group"

    df_dict_template = df_dict_template.merge(
        get_rate_summaries("location_effect", "generic_location_based_premium"),
        right_index=True,
        left_on="premium",
        validate="1:1",
    )

    return df_dict_template[DICT_COLS].copy()

In [4]:
build_rate_data_dict(PROCESSED_RATE_TABLE_FPS[0])

Unnamed: 0,company,geography_type,premium,count,mean,std,min,25%,50%,75%,max
0,Allstate,coordinate,location_effect,78015.0,1.038691,0.209085,0.67,0.91,1.0,1.15,3.31
1,Allstate,coordinate,generic_location_based_premium,78015.0,7234.447903,1456.735086,4694.24,6307.42,6965.24,7992.91,23022.58


In [5]:
data_dict_dfs = [build_rate_data_dict(fp) for fp in PROCESSED_RATE_TABLE_FPS]
df_data_dict = pd.concat(data_dict_dfs)
df_data_dict

Unnamed: 0,company,geography_type,premium,count,mean,std,min,25%,50%,75%,max
0,Allstate,coordinate,location_effect,78015.0,1.038691,0.209085,0.67,0.91,1.0,1.15,3.31
1,Allstate,coordinate,generic_location_based_premium,78015.0,7234.447903,1456.735086,4694.24,6307.42,6965.24,7992.91,23022.58
0,Auto Club,census_tract,location_effect,2813.0,1.12085,0.418771,0.64,0.88,1.0,1.13,2.81
1,Auto Club,census_tract,generic_location_based_premium,2813.0,8322.318877,3107.730512,4788.0,6568.0,7425.0,8386.0,20890.0
0,Liberty Mutual,county,location_effect,83.0,1.02241,0.116792,0.71,0.95,1.0,1.09,1.46
1,Liberty Mutual,county,generic_location_based_premium,83.0,44286.26506,5052.394093,30918.0,41302.0,43319.0,47054.0,63266.0
0,State Farm,coordinate,location_effect,176725.0,1.018653,0.179483,0.67,0.94,1.0,1.07,4.55
1,State Farm,coordinate,generic_location_based_premium,176725.0,6290.063234,1108.224626,4153.92,5781.64,6175.08,6610.69,28071.22
0,Citizens,Census block group,location_effect,8159.0,1.062984,0.254792,0.53,0.89,1.0,1.15,1.84
1,Citizens,Census block group,generic_location_based_premium,8159.0,32521.249673,7795.349505,16112.04,27316.98,30594.62,35230.91,56386.08


In [6]:
is_generic_loc_rate = df_data_dict["premium"] == "generic_location_based_premium"
df_data_dict[is_generic_loc_rate].to_csv("./outputs/loc_rate_summary.csv", index=False)
df_data_dict[is_generic_loc_rate]

Unnamed: 0,company,geography_type,premium,count,mean,std,min,25%,50%,75%,max
1,Allstate,coordinate,generic_location_based_premium,78015.0,7234.447903,1456.735086,4694.24,6307.42,6965.24,7992.91,23022.58
1,Auto Club,census_tract,generic_location_based_premium,2813.0,8322.318877,3107.730512,4788.0,6568.0,7425.0,8386.0,20890.0
1,Liberty Mutual,county,generic_location_based_premium,83.0,44286.26506,5052.394093,30918.0,41302.0,43319.0,47054.0,63266.0
1,State Farm,coordinate,generic_location_based_premium,176725.0,6290.063234,1108.224626,4153.92,5781.64,6175.08,6610.69,28071.22
1,Citizens,Census block group,generic_location_based_premium,8159.0,32521.249673,7795.349505,16112.04,27316.98,30594.62,35230.91,56386.08


# Appendix

Calculate largest gap between min and max effect

In [7]:
df_data_dict = df_data_dict[is_generic_loc_rate].copy()
df_data_dict["max_div_min"] = df_data_dict["max"] / df_data_dict["min"]
df_data_dict.sort_values(["max_div_min"])

Unnamed: 0,company,geography_type,premium,count,mean,std,min,25%,50%,75%,max,max_div_min
1,Liberty Mutual,county,generic_location_based_premium,83.0,44286.26506,5052.394093,30918.0,41302.0,43319.0,47054.0,63266.0,2.046251
1,Citizens,Census block group,generic_location_based_premium,8159.0,32521.249673,7795.349505,16112.04,27316.98,30594.62,35230.91,56386.08,3.499624
1,Auto Club,census_tract,generic_location_based_premium,2813.0,8322.318877,3107.730512,4788.0,6568.0,7425.0,8386.0,20890.0,4.362991
1,Allstate,coordinate,generic_location_based_premium,78015.0,7234.447903,1456.735086,4694.24,6307.42,6965.24,7992.91,23022.58,4.904432
1,State Farm,coordinate,generic_location_based_premium,176725.0,6290.063234,1108.224626,4153.92,5781.64,6175.08,6610.69,28071.22,6.757766


## Column headers export

In [8]:
def read_data_header(fp, read_func=pd.read_csv):
    data = read_func(fp, nrows=1)
    return list(data.columns)


columns = set()
for fp in DATA_FPS.values():
    columns = columns | set(read_data_header(fp, gpd.read_file))

columns

{'bg_black_pct',
 'bg_black_tot',
 'bg_geo_id',
 'bg_median_income',
 'bg_tot_pop',
 'bg_white_pct',
 'bg_white_tot',
 'black_pct',
 'black_tot',
 'density',
 'generic_location_based_premium',
 'geo_id',
 'geo_name',
 'geometry',
 'is_along_8_mile',
 'is_in_detroit',
 'is_north_8_mile',
 'is_south_8_mile',
 'is_zcta_border',
 'latitude',
 'loc_rate_div_min_nn',
 'location_effect',
 'longitude',
 'median_income',
 'nn_min_val',
 'tot_pop',
 'total_pop',
 'white_pct',
 'white_tot'}

In [9]:
pd.Series(list(columns)).sort_values().to_csv(DATA_DICT_COLUMN_EXPORT)