# Data Cleaning

In [1]:
import os

import pandas as pd

In [2]:
# Load the raw readmission export (judging by the file name: California
# all-cause unplanned 30-day readmission rates, 2011-2023). The path is
# relative, so the notebook must be run from the directory containing `data/`.
primary = pd.read_csv(
    filepath_or_buffer="data/allcauseunplanned30-dayhospitalreadmissionratecalifornia2011_2023.csv",
)
# Preview only the first rows rather than the whole table.
primary.head()

Unnamed: 0,Year,Strata,Strata Name,County,Total Admits (ICD-9),30-day Readmits (ICD-9),30-day Readmission Rate (ICD-9),Total Admits (ICD-10),30-day Readmits (ICD-10),30-day Readmission Rate (ICD-10)
0,2011,Overall,Overall,State,1948641.0,272268.0,14%,,,
1,2011,Age,18 to 44 years,State,326070.0,36855.0,11.30%,,,
2,2011,Age,45 to 64 years,State,659801.0,90891.0,13.80%,,,
3,2011,Age,65 years and above,State,962771.0,144522.0,15%,,,
4,2011,Sex,Male,State,901776.0,132417.0,14.70%,,,


In [3]:
# Keep only the un-stratified ("Overall") rows, and discard the statewide
# aggregate so that every remaining row describes a single county.
is_overall_strata = primary["Strata"].eq("Overall")
is_not_statewide = primary["County"].ne("State")
primary = primary.loc[is_overall_strata & is_not_statewide]
primary

Unnamed: 0,Year,Strata,Strata Name,County,Total Admits (ICD-9),30-day Readmits (ICD-9),30-day Readmission Rate (ICD-9),Total Admits (ICD-10),30-day Readmits (ICD-10),30-day Readmission Rate (ICD-10)
17,2011,Overall,Overall,Alameda,75103.0,11377.0,15.10%,,,
18,2011,Overall,Overall,Alpine,13.0,1.0,7.70%,,,
19,2011,Overall,Overall,Amador,2657.0,345.0,13%,,,
20,2011,Overall,Overall,Butte,20422.0,3198.0,15.70%,,,
21,2011,Overall,Overall,Calaveras,3253.0,392.0,12.10%,,,
...,...,...,...,...,...,...,...,...,...,...
976,2023,Overall,Overall,Tulare,,,,17918,2483,13.86%
977,2023,Overall,Overall,Tuolumne,,,,3251,485,14.92%
978,2023,Overall,Overall,Ventura,,,,35807,5244,14.65%
979,2023,Overall,Overall,Yolo,,,,7760,1195,15.40%


In [5]:
# Build the ICD-9 era table (years before 2016): discard the strata columns
# and the ICD-10 measures, then give the ICD-9 measures coding-neutral names.
#
# NOTE: the trailing space inside the "30-day Readmits (...) " keys is
# deliberate. The drop and the rename both take effect in the rendered output,
# so the source header carries that space; do not "tidy" it away.
columns_to_drop = [
    "Strata",
    "Strata Name",
    "Total Admits (ICD-10)",
    "30-day Readmits (ICD-10) ",
    "30-day Readmission Rate (ICD-10)",
]
new_column_names = {
    "Total Admits (ICD-9)": "Total_Admits",
    "30-day Readmits (ICD-9) ": "30_day_Readmits",
    "30-day Readmission Rate (ICD-9)": "30_day_Readmission_Rate",
}

primary_pre2016 = (
    primary.loc[primary["Year"].lt(2016)]
    .drop(columns=columns_to_drop)
    .rename(columns=new_column_names)
)
primary_pre2016

Unnamed: 0,Year,County,Total_Admits,30_day_Readmits,30_day_Readmission_Rate
17,2011,Alameda,75103.0,11377.0,15.10%
18,2011,Alpine,13.0,1.0,7.70%
19,2011,Amador,2657.0,345.0,13%
20,2011,Butte,20422.0,3198.0,15.70%
21,2011,Calaveras,3253.0,392.0,12.10%
...,...,...,...,...,...
370,2015,Tulare,16424.0,2243.0,13.66%
371,2015,Tuolumne,3144.0,410.0,13.04%
372,2015,Ventura,30175.0,3862.0,12.80%
373,2015,Yolo,5852.0,726.0,12.41%


In [6]:

# Build the ICD-10 era table (2016 onwards): the mirror image of the ICD-9
# step. The ICD-9 measures are discarded and the ICD-10 measures are renamed
# to the same coding-neutral column names so both eras can be stacked.
#
# NOTE: the trailing space inside the "30-day Readmits (...) " keys matches
# the source header (the rename visibly takes effect in the output).
columns_to_drop_2016 = [
    "Strata",
    "Strata Name",
    "Total Admits (ICD-9)",
    "30-day Readmits (ICD-9) ",
    "30-day Readmission Rate (ICD-9)",
]
new_column_names_2016 = {
    "Total Admits (ICD-10)": "Total_Admits",
    "30-day Readmits (ICD-10) ": "30_day_Readmits",
    "30-day Readmission Rate (ICD-10)": "30_day_Readmission_Rate",
}

from_2016_onwards = primary["Year"].ge(2016)
primary_after2016 = (
    primary.loc[from_2016_onwards]
    .drop(columns=columns_to_drop_2016)
    .rename(columns=new_column_names_2016)
)
primary_after2016

Unnamed: 0,Year,County,Total_Admits,30_day_Readmits,30_day_Readmission_Rate
392,2016,Alameda,65203,9759,14.97%
393,2016,Alpine,21,1,4.76%
394,2016,Amador,2814,390,13.86%
395,2016,Butte,22675,4041,17.82%
396,2016,Calaveras,2868,400,13.95%
...,...,...,...,...,...
976,2023,Tulare,17918,2483,13.86%
977,2023,Tuolumne,3251,485,14.92%
978,2023,Ventura,35807,5244,14.65%
979,2023,Yolo,7760,1195,15.40%


In [8]:
# Stack the ICD-9 and ICD-10 era tables into one frame with a fresh 0..n-1
# index, then keep only the 2016 and 2020 rows for named counties.
#
# Because only 2016 and 2020 are retained, every row contributed by
# `primary_pre2016` (Year < 2016) is discarded here.
# NOTE(review): 2016 and 2020 presumably line up with the presidential
# election data cleaned later in this notebook; confirm.
#
# The filtering is done with a single boolean mask instead of
# `drop(..., inplace=True)` on an already-filtered frame: in-place mutation of
# a slice can raise SettingWithCopyWarning on pre-3.0 pandas and makes the
# cell harder to reason about. The surviving rows and their index labels are
# the same as before.
cleaned_primary = pd.concat([primary_pre2016, primary_after2016], ignore_index=True)
in_target_years = cleaned_primary["Year"].isin([2016, 2020])
is_named_county = cleaned_primary["County"] != "Unknown County"
cleaned_primary = cleaned_primary[in_target_years & is_named_county]
cleaned_primary

Unnamed: 0,Year,County,Total_Admits,30_day_Readmits,30_day_Readmission_Rate
290,2016,Alameda,65203,9759,14.97%
291,2016,Alpine,21,1,4.76%
292,2016,Amador,2814,390,13.86%
293,2016,Butte,22675,4041,17.82%
294,2016,Calaveras,2868,400,13.95%
...,...,...,...,...,...
576,2020,Tulare,18786,2653,14.12%
577,2020,Tuolumne,3304,474,14.35%
578,2020,Ventura,34820,4945,14.20%
579,2020,Yolo,6945,976,14.05%


In [9]:
# Persist the cleaned readmission table without the index column.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs("data/cleaned_data", exist_ok=True)
cleaned_primary.to_csv("data/cleaned_data/primary.csv", index=False)

In [22]:
# Load the county-level presidential results. The path is relative, so the
# notebook must be run from the directory that contains `data/`.
presidential = pd.read_csv(filepath_or_buffer="data/presidential.csv")
presidential

Unnamed: 0,Year,County,DEM,DEM(%),REO,REP(%),Win
0,2016,Alameda,514842,78.7,95922,14.7,Blue
1,2016,Alpine,334,55.5,217,36.0,Blue
2,2016,Amador,6004,33.9,10485,59.1,Red
3,2016,Butte,41567,43.5,45144,47.2,Red
4,2016,Calaveras,7944,34.3,13511,58.4,Red
...,...,...,...,...,...,...,...
113,2020,Tuolumne,11978,39.4,17689,58.2,Red
114,2020,Ventura,251388,59.5,162207,38.4,Blue
115,2020,Yolo,67598,69.5,27292,28.1,Blue
116,2020,Yuba,11230,37.7,17676,59.3,Red


In [23]:
# Fix a typo in the source header: the Republican vote-count column is
# labelled "REO" in the file and should read "REP".
presidential = presidential.rename({"REO": "REP"}, axis="columns")
presidential

Unnamed: 0,Year,County,DEM,DEM(%),REP,REP(%),Win
0,2016,Alameda,514842,78.7,95922,14.7,Blue
1,2016,Alpine,334,55.5,217,36.0,Blue
2,2016,Amador,6004,33.9,10485,59.1,Red
3,2016,Butte,41567,43.5,45144,47.2,Red
4,2016,Calaveras,7944,34.3,13511,58.4,Red
...,...,...,...,...,...,...,...
113,2020,Tuolumne,11978,39.4,17689,58.2,Red
114,2020,Ventura,251388,59.5,162207,38.4,Blue
115,2020,Yolo,67598,69.5,27292,28.1,Blue
116,2020,Yuba,11230,37.7,17676,59.3,Red


In [24]:
# Manual correction for the 2020 Calaveras row (index label 63): overwrite the
# Democratic share and the winner label.
# NOTE(review): the row is addressed by its positional label, so this relies
# on the row order of data/presidential.csv staying unchanged.
calaveras_2020 = 63
presidential.at[calaveras_2020, "DEM(%)"] = 37.0
presidential.at[calaveras_2020, "Win"] = "Red"
presidential.loc[calaveras_2020]

Year           2020
County    Calaveras
DEM           10046
DEM(%)         37.0
REP           16518
REP(%)         60.8
Win             Red
Name: 63, dtype: object

In [25]:
# Manual correction for the 2020 Humboldt row (index label 70): overwrite the
# Democratic share.
# NOTE(review): addressed by positional label; depends on the CSV row order.
humboldt_2020 = 70
presidential.at[humboldt_2020, "DEM(%)"] = 65.0
presidential.loc[humboldt_2020]

Year          2020
County    Humboldt
DEM          44768
DEM(%)        65.0
REP          21770
REP(%)        31.6
Win           Blue
Name: 70, dtype: object

In [26]:
# Manual correction for the 2020 Los Angeles row (index label 77): overwrite
# the Democratic share.
# NOTE(review): addressed by positional label; depends on the CSV row order.
los_angeles_2020 = 77
presidential.at[los_angeles_2020, "DEM(%)"] = 71.0
presidential.loc[los_angeles_2020]

Year             2020
County    Los Angeles
DEM           3028885
DEM(%)           71.0
REP           1145530
REP(%)           26.9
Win              Blue
Name: 77, dtype: object

In [27]:
# Manual correction for the 2020 Napa row (index label 86): overwrite the
# Democratic share.
# NOTE(review): addressed by positional label; depends on the CSV row order.
napa_2020 = 86
presidential.at[napa_2020, "DEM(%)"] = 69.0
presidential.loc[napa_2020]

Year       2020
County     Napa
DEM       49817
DEM(%)     69.0
REP       20676
REP(%)     28.7
Win        Blue
Name: 86, dtype: object

In [28]:
# Manual correction for the 2020 Riverside row (index label 91): overwrite
# both party shares.
# NOTE(review): addressed by positional label; depends on the CSV row order.
riverside_2020 = 91
presidential.at[riverside_2020, "DEM(%)"] = 53.0
presidential.at[riverside_2020, "REP(%)"] = 45.0
presidential.loc[riverside_2020]

Year           2020
County    Riverside
DEM          528340
DEM(%)         53.0
REP          449144
REP(%)         45.0
Win            Blue
Name: 91, dtype: object

In [29]:
# Manual correction for the 2020 San Joaquin row (index label 97): overwrite
# the Republican share and the winner label.
# NOTE(review): addressed by positional label; depends on the CSV row order.
san_joaquin_2020 = 97
presidential.at[san_joaquin_2020, "REP(%)"] = 42.0
presidential.at[san_joaquin_2020, "Win"] = "Blue"
presidential.loc[san_joaquin_2020]

Year             2020
County    San Joaquin
DEM            161137
DEM(%)           55.9
REP            121098
REP(%)           42.0
Win              Blue
Name: 97, dtype: object

In [30]:
# Manual correction for the 2020 Sonoma row (index label 107): overwrite the
# Republican share and the winner label.
# NOTE(review): addressed by positional label; depends on the CSV row order.
sonoma_2020 = 107
presidential.at[sonoma_2020, "REP(%)"] = 23.0
presidential.at[sonoma_2020, "Win"] = "Blue"
presidential.loc[sonoma_2020]

Year        2020
County    Sonoma
DEM       199938
DEM(%)      74.5
REP        61825
REP(%)      23.0
Win         Blue
Name: 107, dtype: object

In [31]:
# Manual correction for the 2020 Tehama row (index label 110): overwrite the
# Democratic share and the winner label.
# NOTE(review): addressed by positional label; depends on the CSV row order.
tehama_2020 = 110
presidential.at[tehama_2020, "DEM(%)"] = 31.0
presidential.at[tehama_2020, "Win"] = "Red"
presidential.loc[tehama_2020]

Year        2020
County    Tehama
DEM         8911
DEM(%)      31.0
REP        19141
REP(%)      66.6
Win          Red
Name: 110, dtype: object

In [32]:
# Manual correction for the 2020 Tulare row (index label 112): overwrite the
# Democratic share and the winner label.
# NOTE(review): addressed by positional label; depends on the CSV row order.
tulare_2020 = 112
presidential.at[tulare_2020, "DEM(%)"] = 45.0
presidential.at[tulare_2020, "Win"] = "Red"
presidential.loc[tulare_2020]

Year        2020
County    Tulare
DEM        66105
DEM(%)      45.0
REP        77579
REP(%)      52.8
Win          Red
Name: 112, dtype: object

In [33]:
# Persist the corrected presidential results without the index column.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs("data/cleaned_data", exist_ok=True)
presidential.to_csv("data/cleaned_data/presidentialResult.csv", index=False)