In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import re

In [11]:
df = pd.read_csv("../cleaned_data/merged_data.csv")
df.head(5)

Unnamed: 0,County,State,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income,Mean income,Heart Disease Mortality,Sex,ethnicity
0,Park,Colorado,6987,1.6%,2.9%,10.4%,8.0%,7.0%,18.1%,15.5%,21.1%,10.5%,4.8%,76611,91905,135.9,Female,White
1,Walton,Florida,28635,5.9%,3.5%,8.6%,7.9%,12.3%,18.6%,13.8%,17.1%,5.6%,6.7%,67390,86296,126.5,Female,Asian
2,Whitfield,Georgia,36412,5.0%,4.6%,12.5%,12.3%,15.5%,20.4%,11.5%,11.9%,2.6%,3.7%,50055,67316,155.1,Female,Hispanic
3,Ida,Iowa,2972,7.4%,4.1%,9.1%,9.6%,16.6%,16.5%,16.4%,12.4%,4.5%,3.4%,54219,72389,168.0,Overall,Hispanic
4,Teton,Idaho,4290,2.1%,4.6%,7.2%,7.2%,8.5%,21.7%,9.3%,21.1%,15.3%,2.9%,73274,93860,191.9,Female,White


In [12]:

category_counts = df[["County","State","Sex","ethnicity"]].nunique()
print(category_counts)

County       1741
State          48
Sex             3
ethnicity       8
dtype: int64


In [13]:
county_encoder = LabelEncoder()
state_encoder = LabelEncoder()
median_income_scaler = MinMaxScaler()
mean_income_scaler = MinMaxScaler()
disease_scaler = MinMaxScaler()


def train_transform_features(df):
    df = df.copy()
    df["Median income"] = df["Median income"].str.replace(",", "").astype(float)
    df["Mean income"] = df["Mean income"].str.replace(",", "").astype(float)
    df["County_ID"] = county_encoder.fit_transform(df["County"])
    df["State_ID"] = state_encoder.fit_transform(df["State"])
    df["Median income"] = median_income_scaler.fit_transform(df[["Median income"]])
    df["Mean income"] = mean_income_scaler.fit_transform(df[["Mean income"]])
    df["Heart Disease Mortality"] = disease_scaler.fit_transform(df[["Heart Disease Mortality"]])
    
    return df[["Median income", "Mean income", "County_ID", "State_ID", "Heart Disease Mortality"]]

In [14]:
train_df = train_transform_features(df)
train_df.head(5)

Unnamed: 0,Median income,Mean income,County_ID,State_ID,Heart Disease Mortality
0,0.435182,0.389469,1194,4,0.068894
1,0.361307,0.351526,1639,7,0.064129
2,0.222426,0.223132,1679,8,0.078627
3,0.255786,0.25745,754,13,0.085167
4,0.408447,0.402694,1541,10,0.097283


In [15]:
X = train_df.drop(columns=["Heart Disease Mortality"]) 
y = train_df["Heart Disease Mortality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
model = LinearRegression()
model.fit(X_train, y_train)

print("coefficients:", model.coef_)
print("intercept:", model.intercept_)

coefficients: [-5.68919268e-02 -1.03233285e-01 -9.69972871e-07  1.01711564e-04]
intercept: 0.21964051723104663


In [17]:

y_train_pred = model.predict(X_train)


y_test_pred = model.predict(X_test)


train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print("train mse:", train_mse)
print("test mse:", test_mse)

train mse: 0.00583071147348515
test mse: 0.00602250065583848


In [18]:

def restructure_csv_tabular(input_file):
 
    df = pd.read_csv(input_file)
    
   
    columns = df.columns.tolist()
    
   
    transformed_rows = []
    
   
    current_county = None
    current_state = None
    current_household_type = None
    
   
    for i, row in df.iterrows():
        first_column_value = str(row[columns[0]]).strip()
        
        
        county_match = re.match(r'(.+) County, (.+)', first_column_value)
        if county_match:
            current_county = county_match.group(1)
            current_state = county_match.group(2)
            continue
            
        
        if first_column_value in ['Households', 'Families', 'Married-couple families', 'Nonfamily households']:
            current_household_type = first_column_value
            continue
            
        
        if first_column_value == 'Estimate':
            
            new_row = {
                'County': current_county,
                'State': current_state,
                'Household_Type': current_household_type
            }
            
           
            for j in range(1, len(columns)):
               
                clean_col_name = columns[j].replace('Total!!', '').strip()
                new_row[clean_col_name] = row[columns[j]]
                
            transformed_rows.append(new_row)
    
    
    result_df = pd.DataFrame(transformed_rows)
    
    
    return result_df


df_future = restructure_csv_tabular("../raw_data/income_2021.csv")
df_future = df_future[df_future['Household_Type'] == 'Households']
df_future = df_future.reset_index(drop=True)
df_future.head(5)

Unnamed: 0,County,State,Household_Type,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars),PERCENT ALLOCATED,PERCENT ALLOCATED!!Household income in the past 12 months,PERCENT ALLOCATED!!Family income in the past 12 months,PERCENT ALLOCATED!!Nonfamily income in the past 12 months
0,Autauga,Alabama,Households,21856,5.5%,4.8%,10.9%,8.1%,12.1%,17.8%,13.9%,16.2%,6.0%,4.7%,62660,79498,,33.4%,(X),(X)
1,Baldwin,Alabama,Households,87190,4.9%,4.2%,7.0%,9.5%,13.4%,17.0%,14.6%,15.3%,7.0%,7.1%,64346,87709,,34.4%,(X),(X)
2,Barbour,Alabama,Households,9088,12.3%,7.1%,18.4%,10.7%,13.7%,16.2%,7.0%,9.3%,1.9%,3.3%,36422,55066,,48.7%,(X),(X)
3,Bibb,Alabama,Households,7083,7.5%,7.8%,11.6%,10.2%,10.7%,17.5%,16.8%,10.9%,3.7%,3.3%,54277,67396,,36.5%,(X),(X)
4,Blount,Alabama,Households,21300,8.4%,4.2%,9.5%,11.0%,13.0%,18.5%,11.7%,14.1%,5.8%,3.7%,52830,71849,,40.8%,(X),(X)


In [19]:
df_future = df_future[["Median income (dollars)", "Mean income (dollars)", "County", "State"]]
df_future["Median income"] = pd.to_numeric(df_future["Median income (dollars)"].str.replace(",", ""), errors="coerce")
df_future["Mean income"] = pd.to_numeric(df_future["Mean income (dollars)"].str.replace(",", ""), errors="coerce")
df_future.dropna(inplace=True)
df_future = df_future[["Median income","Mean income","County","State"]]
df_future.head(5)

Unnamed: 0,Median income,Mean income,County,State
0,62660.0,79498,Autauga,Alabama
1,64346.0,87709,Baldwin,Alabama
2,36422.0,55066,Barbour,Alabama
3,54277.0,67396,Bibb,Alabama
4,52830.0,71849,Blount,Alabama


In [20]:
def predict_transform_features(df):
    df = df.copy()
    df["County_ID"] = county_encoder.fit_transform(df["County"])
    df["State_ID"] = state_encoder.fit_transform(df["State"])
    df["Median income"] = median_income_scaler.transform(df[["Median income"]])
    df["Mean income"] = mean_income_scaler.transform(df[["Mean income"]])
    
    return df[["Median income", "Mean income", "County_ID", "State_ID"]]
    
df_future_test = predict_transform_features(df_future)
df_future_test.head(5)

Unnamed: 0,Median income,Mean income,County_ID,State_ID
0,0.323412,0.30554,74,0
1,0.33692,0.361084,80,0
2,0.113204,0.140265,90,0
3,0.256251,0.223674,137,0
4,0.244658,0.253797,151,0


In [21]:
y_future_pred = model.predict(df_future_test)

In [22]:
y_future_pred_original = disease_scaler.inverse_transform(y_future_pred.reshape(-1, 1))
df_future["Heart Disease Mortality"] = y_future_pred_original.round(1)
df_future.head(5)

Unnamed: 0,Median income,Mean income,County,State,Heart Disease Mortality
0,62660.0,79498,Autauga,Alabama,334.6
1,64346.0,87709,Baldwin,Alabama,321.8
2,36422.0,55066,Barbour,Alabama,391.8
3,54277.0,67396,Bibb,Alabama,358.7
4,52830.0,71849,Blount,Alabama,353.8


In [23]:
df_future.to_csv("../cleaned_data/pred_future_data.csv", index=False)