# INST414 Final Project Sprint 2

In [115]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datascience import *
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

## Data Cleaning

In [162]:
# select columns to read and keep
cols_to_keep = ['Data_Value', 'Race/Ethnicity', 'Age(months)', 'Sex', 'LocationDesc', 'LocationAbbr', 'YearStart', 'YearEnd', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'GeoLocation']
df = pd.read_csv("/Users/virginialee/Downloads/WIC_data.csv", usecols=cols_to_keep)

df.head()
df.shape

(12852, 12)

In [163]:
# rename columns to be more intuitive
df.rename(columns={
    'Data_Value': 'pct_overweight',
    'Race/Ethnicity': 'race',
    'Age(months)': 'age_months'
}, inplace=True)
df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,pct_overweight,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,age_months,Sex,race,GeoLocation
0,2008,2008,AL,Alabama,15.3,14.7,15.8,18219,24 - 35,,,"(32.84057112200048, -86.63186076199969)"
1,2008,2008,AL,Alabama,14.9,14.4,15.5,14796,36 - 47,,,"(32.84057112200048, -86.63186076199969)"
2,2008,2008,AL,Alabama,16.4,15.6,17.1,10272,48 - 59,,,"(32.84057112200048, -86.63186076199969)"
3,2008,2008,AL,Alabama,25.0,19.3,30.7,228,,,American Indian/Alaska Native,"(32.84057112200048, -86.63186076199969)"
4,2008,2008,AL,Alabama,8.8,5.4,12.2,273,,,Asian/Pacific Islander,"(32.84057112200048, -86.63186076199969)"


In [164]:
# checking missing values
missing_value = df.isna().sum()
print(missing_value)

YearStart                    0
YearEnd                      0
LocationAbbr                 0
LocationDesc                 0
pct_overweight             379
Low_Confidence_Limit       379
High_Confidence_Limit      379
Sample_Size                379
age_months                9072
Sex                      10584
race                      7182
GeoLocation                  0
dtype: int64


In [165]:
# calculate % missing for key columns
379/12852 # 2.9% missing for data_value
379/12852 # 2.9% missing for low_confidence_limit
379/12852 # 2.9% missing for high_confidence_limit
379/12852 # 2.9% missing for sample_size
7182/12852 # 55.88% missing for race/ethnicity
9072/12852 # 70.59% missing for age
10584/12852 # 82.35% missing for sex

0.8235294117647058

In [166]:
# dropping missing values and unnecessary columns
clean_df = df.dropna(subset=['pct_overweight', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size'])
clean_df = clean_df.drop(columns=['Sex'])
clean_df.head()
clean_df.shape

(12473, 11)

In [167]:
# filter race by American Indian/Alaska Native and Non-Hispanic White
df_race = clean_df[clean_df['race'].isin(['American Indian/Alaska Native', 'Non-Hispanic White'])]
df_race.shape
df_race.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,pct_overweight,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,age_months,race,GeoLocation
3,2008,2008,AL,Alabama,25.0,19.3,30.7,228,,American Indian/Alaska Native,"(32.84057112200048, -86.63186076199969)"
9,2008,2008,AL,Alabama,15.8,15.2,16.3,17833,,Non-Hispanic White,"(32.84057112200048, -86.63186076199969)"
14,2008,2008,AK,Alaska,23.7,22.2,25.2,3190,,American Indian/Alaska Native,"(64.84507995700051, -147.72205903599973)"
20,2008,2008,AK,Alaska,15.5,14.3,16.7,3540,,Non-Hispanic White,"(64.84507995700051, -147.72205903599973)"
25,2008,2008,AZ,Arizona,20.1,17.7,22.4,1101,,American Indian/Alaska Native,"(34.865970280000454, -111.76381127699972)"


In [168]:
# duplicates in data set
print(df_race.duplicated().value_counts()) # how many rows are duplicates
df_race[df_race.duplicated(keep=False)] # displays duplicate rows
# drop duplicates
df_race = df_race.drop_duplicates()
df_race.shape

False    2021
True        8
Name: count, dtype: int64


(2021, 11)

In [169]:
# relabel values to be more intuitive
df_race = df_race.replace({
    'American Indian/Alaska Native': 'Indigenous',
    'Non-Hispanic White': 'White'
}, inplace=False)
df_race.head()

  df_race = df_race.replace({


Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,pct_overweight,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,age_months,race,GeoLocation
3,2008,2008,AL,Alabama,25.0,19.3,30.7,228,,Indigenous,"(32.84057112200048, -86.63186076199969)"
9,2008,2008,AL,Alabama,15.8,15.2,16.3,17833,,White,"(32.84057112200048, -86.63186076199969)"
14,2008,2008,AK,Alaska,23.7,22.2,25.2,3190,,Indigenous,"(64.84507995700051, -147.72205903599973)"
20,2008,2008,AK,Alaska,15.5,14.3,16.7,3540,,White,"(64.84507995700051, -147.72205903599973)"
25,2008,2008,AZ,Arizona,20.1,17.7,22.4,1101,,Indigenous,"(34.865970280000454, -111.76381127699972)"


In [171]:
# turn race into boolean variable
df_race.drop(columns='Indigenous', errors='ignore', inplace=True)  # drop if already exists
df_race['Indigenous'] = (df_race['race'] == 'Indigenous').astype(int)
df_race.head()
df_race.to_csv('cleaned_wic_data.csv', index=False)

## Baseline Linear Regression Model

In [None]:
# Fit the linear regression model using sklearn
X = df_race[['Indigenous']]
y = df_race['pct_overweight']
reg = LinearRegression()
reg.fit(X, y)

In [None]:
reg.coef_  # coefficient for Indigenous variable
reg.intercept_  # intercept
reg.score(X, y)  # R-squared value
print("Coefficient for Indigenous:", reg.coef_[0], "Intercept:", reg.intercept_, "R-squared:", reg.score(X, y))

In [None]:
# using statsmodels to get p-value
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

## Primary Model

In [None]:
# Independent Variables
df_race['Sample_Size'] = df_race['Sample_Size'].str.replace(',', '', regex=True)
df_race['Sample_Size'] = pd.to_numeric(df_race['Sample_Size'], errors='coerce')
df_race['pct_overweight'] = pd.to_numeric(df_race['pct_overweight'], errors='coerce')
X = sm.add_constant(X)

# Dependent Variables
y = df_race['pct_overweight']

# Fit weighted linear regression using sample sizes as weights
model = sm.WLS(y, X, weights=df_race['Sample_Size'])
results = model.fit()

print(results.summary())

                            WLS Regression Results                            
Dep. Variable:         pct_overweight   R-squared:                       0.117
Model:                            WLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     268.2
Date:                Sat, 29 Nov 2025   Prob (F-statistic):           1.08e-56
Time:                        18:48:55   Log-Likelihood:                -6365.1
No. Observations:                2021   AIC:                         1.273e+04
Df Residuals:                    2019   BIC:                         1.275e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.2658      0.063    210.060      0.0

### Robustness Check

In [104]:
# different subsample - years 2014-2020
df_years = df_race[df_race['YearStart'] >= 2014]
df_years.head()

# age variable
df_years['age_months'] = df_years['age_months'].fillna('Missing')
age_dummy = pd.get_dummies(df_years['age_months'], prefix='age', drop_first=True)
age_dummy = age_dummy.astype(int)

X = pd.concat([df_years['Indigenous'], age_dummy], axis=1)
X = sm.add_constant(X)
y = df_years['pct_overweight']

model = sm.WLS(y, X, weights=df_years['Sample_Size'])
results = model.fit()

print(results.summary())

                            WLS Regression Results                            
Dep. Variable:         pct_overweight   R-squared:                       0.104
Model:                            WLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     132.0
Date:                Sat, 29 Nov 2025   Prob (F-statistic):           5.63e-29
Time:                        17:21:44   Log-Likelihood:                -3534.8
No. Observations:                1137   AIC:                             7074.
Df Residuals:                    1135   BIC:                             7084.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.9738      0.083    156.262      0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_years['age_months'] = df_years['age_months'].fillna('Missing')


## Model Evaluation and Diagnostics

In [None]:
# checking for independence of residuals
residuals = results.resid

plt.figure(figsize=(8,6))
plt.plot(residuals)
plt.title('Residuals vs Observation from Weighted Linear Regression Model')
plt.xlabel('Observation')
plt.ylabel('Residuals')
plt.savefig('residuals_weighted_linear_regression.png')

In [112]:
# checking for homoskedasticity
fitted = results.fittedvalues

plt.figure(figsize=(8,6))
plt.scatter(fitted, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.savefig('residuals_vs_fitted_weighted_linear_regression.png')

In [114]:
# checking for normality of residuals
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals from Weighted Linear Regression Model')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('histogram_residuals_weighted_linear_regression.png')

In [158]:
# checking for multicollinearity
df_mc = df_race.copy()

# fill in missing age and create dummies
df_mc['age_months'] = df_mc['age_months'].fillna('Missing')
age_dummy = pd.get_dummies(df_mc['age_months'], prefix='age', drop_first=False)
age_dummy = age_dummy.astype(int)
print(df_mc['age_months'].value_counts())

age_months
Missing    2021
Name: count, dtype: int64


In [157]:
X = pd.concat([df_mc['Indigenous'], age_dummy], axis=1)
X = sm.add_constant(X)

X_numeric = X.drop(columns='const', errors='ignore') 


corr = X_numeric.corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Independent Variables')
plt.savefig('correlation_matrix_predictors.png') 

In [161]:
df_race.to_csv('cleaned_wic_data.csv', index=False)