In [214]:
import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer




In [215]:
# read in data 
dat = pd.read_excel('./maltreatment_data.xlsx')

# only retain numbers (not percentages)
dat = dat[dat['DataFormat'] == 'Number']

# convert cases reported to a numeric value
dat['Data'] = pd.to_numeric(dat['Data'], errors='coerce')

# drop NA values
dat = dat.dropna()

# drop  
dat = dat[dat['Category'] != 'Other/missing maltreatment type']



In [216]:
dat['Location'].unique()

array(['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

In [217]:
# list of maltreatment types
mt_list = dat['Category'].unique()

# Build a model for Alabama (test case)
all_predictions = pd.DataFrame()

for st in dat['Location'].unique():
    print(st)
    # subset to specific state
    state_data = dat[dat['Location'] == st]

    # only retain cols required for the model
    state_data = state_data[['Category', 'TimeFrame', 'Data']]


    # rename columns
    state_data = state_data.rename(columns = {'TimeFrame' : 'Year', 'Data' : 'Cases'})

    # Pivot the DataFrame to get features for the model
    state_features = state_data.pivot_table(index='Year', columns='Category', values='Cases', aggfunc='sum').reset_index()

    
    
    # handling missing values within a maltreatment type 
    # if there are more than 3 missing values drop the feature 
    # else impute the values
    for mt in state_data['Category'].unique():
            na_count = state_features[state_features[mt].isna()].shape[0]
            if(na_count>0):
                if(na_count <= 3):
                    imputer = SimpleImputer(strategy='mean')
                    state_features[mt] = imputer.fit_transform(state_features[[mt]])
                else:
                    state_features.drop(mt, axis=1, inplace=True)
                    
    # sort to keep in ascending oreder of years
    state_features = state_features.sort_values(by='Year')
    
    # Compute aggregate number of cases based on year
    state_out = state_data.groupby('Year')['Cases'].sum().reset_index()

    # Create a new column to indicate whether maltreatment cases increased or decreased compared to the previous year
    state_out['Increase'] = (state_out['Cases'].diff() > 0).astype(int)

    # sort to keep in ascending oreder of years
    state_out = state_out.sort_values(by='Year')
    
    if (state_out['Increase'].unique().size < 2):
        print('Skipping state due to single class: '+ st)
    
    else: 
    # feature list for the model
    # Select features and target variable
        feature_list = state_features.columns.to_list()
        feature_list.remove('Year') 

        X = state_features.loc[:, feature_list]
        Y = state_out['Increase']

        # Initialize and train the logistic regression model
        model = LogisticRegression()
        model.fit(X, Y)

        # Predict the probability of increase or decrease in maltreatment cases for future years
        future_years = 5  # Predict for the next 5 years, for example
        latest_year_data =state_features.tail(1).loc[:, feature_list]
        predictions = []

        for i in range(1, future_years + 1):
            next_year_data = latest_year_data.copy()
            next_year_data += np.random.randint(-10, 10, size=(1, len(feature_list)))  # Adding some random noise to simulate changes over time
            prediction = model.predict_proba(next_year_data)
            predictions.append({ 'Location' : st, 
                                'Year': state_features.tail(1)['Year'].iloc[0] + i,
                                'Probability_Increase': prediction[0][1],
                                'Probability_Decrease': prediction[0][0]})

        # Convert predictions to DataFrame
        predictions_df = pd.DataFrame(predictions)
        all_predictions = pd.concat([all_predictions, predictions_df])


United States
Alabama
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
District of Columbia
Florida
Skipping state due to single class: Florida
Georgia


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
Skipping state due to single class: New Jersey
New Mexico
New York
North Carolina
North Dakota


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Ohio
Oklahoma
Oregon
Pennsylvania
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
Utah
Vermont


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Virginia
Washington
West Virginia
Wisconsin
Wyoming
Puerto Rico


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [218]:
all_predictions

Unnamed: 0,Location,Year,Probability_Increase,Probability_Decrease
0,United States,2023,1.882357e-07,1.000000
1,United States,2024,1.941821e-07,1.000000
2,United States,2025,1.715920e-07,1.000000
3,United States,2026,1.627438e-07,1.000000
4,United States,2027,1.592222e-07,1.000000
...,...,...,...,...
0,Puerto Rico,2023,9.291667e-03,0.990708
1,Puerto Rico,2024,1.988823e-02,0.980112
2,Puerto Rico,2025,4.633877e-03,0.995366
3,Puerto Rico,2026,5.327428e-04,0.999467


In [219]:
all_predictions.to_csv('./state_predictions.csv', index=False)
