# DS4PH Term 4 Capstone Project
Group Members: Tenzin Lhaksampa and Archana Balan  
Code to fit a prediction model per maltreatment type

In [3]:
# Import packages
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [12]:
# Load the raw maltreatment counts from the project's Excel workbook
data_path = "./data/maltreatment_data.xlsx"
dat = pd.read_excel(data_path)

# Preview the first rows to check which columns were read
dat.head()

Unnamed: 0,LocationType,Location,LocationCode,Category,TimeFrame,DataFormat,Data
0,State,Alabama,AL,Emotional abuse,2021,Number,19
1,State,Alabama,AL,Medical neglect,2021,Number,79
2,State,Alabama,AL,Neglect,2021,Number,5011
3,State,Alabama,AL,Other/missing maltreatment type,2021,Number,N.R.
4,State,Alabama,AL,Physical abuse,2021,Number,6062


In [5]:
# Preprocess data
#
# NOTE: this cell overwrites `dat` and drops the 'DataFormat' column it
# filters on, so re-running it requires re-running the data-loading cell first.

# only retain numbers (not percentages); copy so the assignment below acts on
# an independent frame rather than on a filtered slice of the raw data
dat = dat[dat['DataFormat'] == 'Number'].copy()

# convert cases reported to a numeric value; entries that are not numbers
# (e.g. 'N.R.') become NaN
dat['Data'] = pd.to_numeric(dat['Data'], errors='coerce')

# drop rows with a missing value in any column
dat = dat.dropna()

# only retain 4 required columns
dat = dat[['Location', 'Category', 'TimeFrame', 'Data']]

# combine Medical neglect and Neglect into a single total per state and year.
# Only 'Data' is summed: summing the whole frame would also aggregate the
# string-valued Category column.
new_mt = (
    dat[dat['Category'].isin(['Medical neglect', 'Neglect'])]
    .groupby(['Location', 'TimeFrame'], as_index=False)['Data']
    .sum()
)
# temporary label so the combined rows survive the category filter below
new_mt['Category'] = 'Total neglect'

# add in the new category
dat = pd.concat([dat, new_mt], ignore_index=True)

# drop the original neglect and medical neglect categories (now combined) and
# the other/missing type category, which has very high missing values
dat = dat[~dat['Category'].isin(['Medical neglect', 'Neglect', 'Other/missing maltreatment type'])].copy()

# rename Total neglect category
dat['Category'] = dat['Category'].replace('Total neglect', 'Neglect')

# drop 'United States' from data to keep only state-wise data
dat = dat[dat['Location'] != 'United States']


# Time lag regression model
Fitting a time lag regression model for each of the 4 maltreatment types.   
Each model will predict the future number of cases based on data from the past 7 years. 

## Training Testing Split
To maintain consistency across models, we fix the training-testing split for all models: 80% of the states are randomly assigned to the training set and the remaining 20% to the testing set. 

In [6]:
# Hold out a fixed share of the states for testing so that every
# per-category model is evaluated on the same unseen states:
# 80% of states go to training, the remaining 20% to testing.
test_size = 0.2

# Distinct state names, in order of first appearance in the data
states = pd.unique(dat['Location'])

# A fixed random_state keeps the partition identical across runs
states_train, states_test = train_test_split(
    states,
    random_state=42,
    test_size=test_size,
)



In [7]:
# List the maltreatment categories that remain after preprocessing
dat['Category'].unique()

array(['Emotional abuse', 'Physical abuse', 'Sexual abuse', 'Neglect'],
      dtype=object)

## Training the models
For each maltreatment type in turn, prepare the feature matrix (cases from 2015-2021) and the target variable (cases from 2022). After each model is trained, compute the mean squared error on the testing set.

In [8]:
# Fitted estimator and test-set MSE per maltreatment type:
# {category: {'model': LinearRegression, 'mse': float}}
models_list = {}

# Define the target year for the model
target_year = 2022

# Define the start year of available data
start_year = 2015

# Years used as lagged predictors, oldest first (start_year .. target_year - 1).
# Selecting them explicitly fixes the number and order of features, which the
# forecasting step has to reproduce when it calls predict().
feature_years = list(range(start_year, target_year))

for mt in dat['Category'].unique():

    print ("Fitting model for " + mt)

    # subset data for type
    mt_data = dat[dat['Category'] == mt]

    # Pivot the data to have years as columns and maltreatment counts as values
    mt_data_pivot = mt_data.pivot(index='Location', columns='TimeFrame', values='Data').reset_index()

    # Drop states with a missing count in any year
    mt_data_pivot = mt_data_pivot.dropna()

    # Row masks for the fixed train/test assignment of states
    is_train = mt_data_pivot['Location'].isin(states_train)
    is_test = mt_data_pivot['Location'].isin(states_test)

    # Feature matrix: one column per lag, named '<category>_lag_<k>' where k is
    # the number of years before the target year
    lag_names = {year: f'{mt}_lag_{target_year - year}' for year in feature_years}
    features = mt_data_pivot[feature_years].rename(columns=lag_names)

    # Target variable: cases reported in the target year
    target = mt_data_pivot[target_year]

    Xtrain, Ytrain = features[is_train], target[is_train]
    Xtest, Ytest = features[is_test], target[is_test]

    model = LinearRegression()
    model.fit(Xtrain, Ytrain)

    # Make predictions on the held-out states
    Ypred = model.predict(Xtest)

    # Calculate Mean Squared Error (MSE) on the held-out states
    mse = mean_squared_error(Ytest, Ypred)

    # Save the fitted model and its MSE in the dictionary
    models_list[mt] = {'model': model, 'mse': mse}


Fitting model for Emotional abuse
Fitting model for Physical abuse
Fitting model for Sexual abuse
Fitting model for Neglect


## Display Mean Squared Errors from each model


In [9]:
# Show the fitted estimator and test-set MSE stored for each maltreatment type
models_list

{'Emotional abuse': {'model': LinearRegression(), 'mse': 139947.48088496178},
 'Physical abuse': {'model': LinearRegression(), 'mse': 290293.4474370572},
 'Sexual abuse': {'model': LinearRegression(), 'mse': 19865.312615081097},
 'Neglect': {'model': LinearRegression(), 'mse': 749155.9167911987}}

# Predict Future Values
For each state, predict the number of cases in 2023 using data from 2016-2022. Then update the feature matrix with the predicted values to predict 2024, and then 2025.

In [10]:
# Wide table (one row per state, one column per year) per maltreatment type,
# extended with the forecast years
predicted_list = {}

# Years to forecast, in chronological order; each forecast is fed back in as
# a lagged feature for the following year
forecast_years = range(2023, 2026)

for mt in dat['Category'].unique():
    mt_model = models_list[mt]['model']

    # Number of lag features the fitted model expects
    n_lags = mt_model.n_features_in_

    # subset data for type
    mt_data = dat[dat['Category'] == mt]

    # Pivot the data to have years as columns and maltreatment counts as values
    mt_data_pivot = mt_data.pivot(index='Location', columns='TimeFrame', values='Data').reset_index()

    # Drop states with a missing count in any year
    mt_data_pivot = mt_data_pivot.dropna()

    # A dedicated loop variable is used here so the notebook-level
    # `target_year` set when training the models is not overwritten
    for forecast_year in forecast_years:
        # Oldest lag first, matching the column order used when fitting
        lags = range(n_lags, 0, -1)
        lag_names = {forecast_year - lag: f'{mt}_lag_{lag}' for lag in lags}

        # set up feature matrix from the n_lags years preceding forecast_year
        Xpred = mt_data_pivot[list(lag_names)].rename(columns=lag_names)

        # Use the trained model to make predictions
        # NOTE(review): predictions are stored as-is; a linear model can
        # return negative counts - confirm whether these should be clipped.
        Ypred = mt_model.predict(Xpred)
        mt_data_pivot[forecast_year] = Ypred

    predicted_list[mt] = mt_data_pivot

# Save data as input for streamlit app

In [11]:
# Tag each category's wide table with its category name. assign() returns a
# new frame, so the tables stored in predicted_list are left unmodified.
final_list = [df.assign(Category=mt) for mt, df in predicted_list.items()]

# Stack all categories into one table and write it out for the Streamlit app
out_df = pd.concat(final_list, ignore_index=True)
out_df.to_csv('./data/maltreatment_predictions.csv', index=False)
