# INST414 Final Project Sprint 2

In [None]:
# Importing necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datascience import *
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm


## Data Cleaning

In [None]:
# Columns to read from the raw WIC file; all other columns are skipped at load time.
cols_to_keep = [
    'Data_Value', 'Race/Ethnicity', 'Age(months)', 'Sex',
    'LocationDesc', 'LocationAbbr', 'YearStart', 'YearEnd',
    'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'GeoLocation',
]

# The file location can be overridden with the WIC_DATA_PATH environment
# variable so the notebook runs on other machines; the original local path
# remains the default.
DATA_PATH = os.environ.get("WIC_DATA_PATH", "/Users/virginialee/Downloads/WIC_data.csv")
df = pd.read_csv(DATA_PATH, usecols=cols_to_keep)

# A cell only renders its last expression, so print the shape explicitly and
# leave the preview as the final expression.
print(df.shape)
df.head()

In [None]:
# Give the raw column headers shorter, more intuitive names.
column_aliases = {
    'Data_Value': 'pct_overweight',
    'Race/Ethnicity': 'race',
    'Age(months)': 'age_months',
}
df = df.rename(columns=column_aliases)
df.head()

In [None]:
# Number of missing (NaN) entries in each column.
missing_value = df.isnull().sum()
print(missing_value)

In [None]:
# Percent of rows missing in each column, computed from the data instead of
# hand-copied counts so the figures stay correct if the file changes.
# Values recorded from an earlier run, for reference:
#   data_value / low_confidence_limit / high_confidence_limit / sample_size: 2.9%
#   race/ethnicity: 55.88%   age: 70.59%   sex: 82.35%
missing_pct = (df.isna().mean() * 100).round(2)
missing_pct.sort_values(ascending=False)

In [None]:
# Drop rows that lack the outcome, either confidence limit, or the sample size,
# then remove the Sex column.
required_cols = ['pct_overweight', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size']
clean_df = df.dropna(subset=required_cols).drop(columns=['Sex'])

# Only the last expression of a cell is rendered, so print the shape and end
# with the preview to show both.
print(clean_df.shape)
clean_df.head()

In [None]:
# Keep only the American Indian/Alaska Native and Non-Hispanic White rows.
race_groups = ['American Indian/Alaska Native', 'Non-Hispanic White']

# .copy() makes df_race an independent frame, so columns assigned to it later
# never act on a slice of clean_df.
df_race = clean_df[clean_df['race'].isin(race_groups)].copy()

# Print the shape explicitly; a bare expression that is not last in the cell
# is not displayed.
print(df_race.shape)
df_race.head()

In [None]:
# How many rows are exact duplicates of an earlier row (True) versus not (False).
print(df_race.duplicated().value_counts())

# Capture every copy of each duplicated row before removing them, so they can
# still be inspected after the drop.
duplicate_rows = df_race[df_race.duplicated(keep=False)]

# Drop duplicates, keeping the first occurrence of each row.
df_race = df_race.drop_duplicates()
print(df_race.shape)

# Last expression: render the duplicated rows (previously computed but never shown).
duplicate_rows

In [None]:
# Swap the long race labels for shorter ones; replace() returns a new frame.
race_labels = {
    'American Indian/Alaska Native': 'Indigenous',
    'Non-Hispanic White': 'White',
}
df_race = df_race.replace(race_labels)
df_race.head()

In [None]:
# Encode race as a 0/1 indicator: 1 where race is 'Indigenous', 0 otherwise.
# Any 'Indigenous' column left over from a previous run is removed first.
df_race = df_race.drop(columns='Indigenous', errors='ignore')
df_race['Indigenous'] = df_race['race'].eq('Indigenous').astype(int)
df_race.head()

## Baseline Linear Regression Model

In [None]:
# Baseline model (sklearn): regress pct_overweight on the Indigenous indicator alone.
y = df_race['pct_overweight']
X = df_race[['Indigenous']]
reg = LinearRegression().fit(X, y)
reg

In [None]:
# Collect the fitted slope, intercept, and R-squared on the training data,
# then report them on a single line.
indigenous_coef = reg.coef_[0]
baseline_intercept = reg.intercept_
baseline_r2 = reg.score(X, y)
print("Coefficient for Indigenous:", indigenous_coef, "Intercept:", baseline_intercept, "R-squared:", baseline_r2)

In [None]:
# Refit the baseline with statsmodels to get p-values in the summary table.
# The constant-augmented design matrix gets its own name instead of overwriting
# X: `reg` was fit on the single-column X, so re-running the reg.score(X, y)
# cell after an overwrite would fail on a feature-count mismatch.
X_baseline = sm.add_constant(X)
model = sm.OLS(y, X_baseline).fit()
print(model.summary())

## Primary Model

In [None]:
# Independent variables
# Strip thousands separators from Sample_Size and convert it to a number
# (presumably the raw values are text such as "1,234" - confirm against the CSV).
# astype(str) keeps this cell re-runnable: once the column is numeric, calling
# .str on it directly raises AttributeError. regex=False treats ',' as a literal.
# errors='coerce' turns anything unparseable into NaN.
df_race['Sample_Size'] = pd.to_numeric(
    df_race['Sample_Size'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
df_race['pct_overweight'] = pd.to_numeric(df_race['pct_overweight'], errors='coerce')


In [100]:
# Age covariate: label missing ages as their own 'Missing' level, then one-hot
# encode with the first level dropped as the reference category.
# NOTE(review): the saved output below reports "Df Model: 1", i.e. a single
# regressor besides the constant - confirm whether any age dummy columns
# actually survive drop_first for this subset.
df_race['age_months'] = df_race['age_months'].fillna('Missing')
age_dummy = pd.get_dummies(df_race['age_months'], prefix='age', drop_first=True).astype(int)

# location dummies (currently disabled)
# location_dummy = pd.get_dummies(df_race['LocationAbbr'], prefix='loc', drop_first=True)
# location_dummy = location_dummy.astype(int)

# Dependent variable
y = df_race['pct_overweight']

# Design matrix: Indigenous indicator plus age dummies, with an intercept column.
X = sm.add_constant(pd.concat([df_race['Indigenous'], age_dummy], axis=1))

# Weighted least squares, using each row's Sample_Size as its weight.
model = sm.WLS(y, X, weights=df_race['Sample_Size'])
results = model.fit()

print(results.summary())

                            WLS Regression Results                            
Dep. Variable:         pct_overweight   R-squared:                       0.117
Model:                            WLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     268.2
Date:                Sat, 29 Nov 2025   Prob (F-statistic):           1.08e-56
Time:                        16:51:51   Log-Likelihood:                -6365.1
No. Observations:                2021   AIC:                         1.273e+04
Df Residuals:                    2019   BIC:                         1.275e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.2658      0.063    210.060      0.0