In [None]:
#Importing Packages

import os
import pandas as pd
from pandas.api.types import is_numeric_dtype
import string
import math
import numpy as np
import sklearn.linear_model
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/NumPy deprecation warnings. (A stray trailing "x" on this
# line previously made the whole cell a SyntaxError.)
warnings.filterwarnings("ignore")

In [None]:
#Retrieving all the datasets necessary
# Every line below is an IPython `!` shell escape, so this cell needs network
# access and the `wget` / `unzip` commands.

# CRDC archives for 2017-18, 2015-16, 2013-14 and 2011-12; each zip is unpacked into the working directory.
# (None of the cells visible in this notebook reads the 2017-18 files.)
!wget https://ocrdata.ed.gov/assets/ocr/docs/2017-18-crdc-data.zip
!unzip 2017-18-crdc-data.zip
!wget https://ocrdata.ed.gov/assets/ocr/docs/2015-16-crdc-data.zip
!unzip 2015-16-crdc-data.zip
!wget https://ocrdata.ed.gov/assets/ocr/docs/2013-14-crdc-data.zip
!unzip 2013-14-crdc-data.zip
!wget https://ocrdata.ed.gov/assets/ocr/docs/2011-12-crdc-data.zip
!unzip 2011-12-crdc-data.zip

# Per-state LEP tables from lep.gov (files named acs2013, acs2014 and acs2012); these are the CSVs read into lep2015 / lep2014 / lep2012 later.
!wget https://www.lep.gov/sites/lep/files/media/document/2020-03/acs2013_lep_allstate.csv
!wget https://www.lep.gov/sites/lep/files/media/document/2020-03/acs2014_lep_allstate.csv
!wget https://www.lep.gov/sites/lep/files/media/document/2020-03/acs2012_lep_allstate.csv
# Social-connectedness TSV from data.humdata.org (not read by any cell visible in this notebook).
!wget https://data.humdata.org/dataset/e9988552-74e4-4ff4-943f-c782ac8bca87/resource/868a2fdb-f5c8-4a98-af7c-cfc8bf0daeb3/download/us-counties-countries-fb-social-connectedness-index-october-2021.tsv


In [None]:
#Preprocessing 2015 CRDC Data
# Reads the 2015-16 school-level file, replaces the coded column names with the
# descriptions from the record layout, and sums every numeric column per state.

enrollment2015 = pd.read_csv("Data Files and Layouts/CRDC 2015-16 School Data.csv", encoding='windows-1252')

# The layout file maps each coded column (Field_Name) to its description.
structure = pd.read_csv('Data Files and Layouts/CRDC 2015-16 School Data Record Layout.csv', encoding='windows-1252')
column_renamer = dict(zip(structure['Field_Name'], structure['Field_Description']))
enrollment2015 = enrollment2015.rename(columns=column_renamer)

states = enrollment2015['District State Name'].unique()

# One row per state, kept in order of first appearance (sort=False) so that the
# final 0..n-1 index equals each state's position in `states`.
# This replaces a row-by-row DataFrame.append loop (append was removed in
# pandas 2.0) that also relied on chained-index assignment.
schools_by_state = enrollment2015.groupby('District State Name', sort=False)
state_totals2015 = schools_by_state.sum(numeric_only=True)
# 'State' carries the state's abbreviation, not its full name.
state_totals2015.insert(0, 'State', schools_by_state['District State Abbreviation'].first())
state_totals2015 = state_totals2015.reset_index(drop=True)

In [None]:
#Visualizing the 2015 CRDC data - the graph below shows the number of LEP students in each state, classified by ethnicity
#(Hispanic students were shown in a separate graph due to the higher magnitude of the population counts)
# NOTE: the 'LEP ... Students' columns plotted here only exist after
# standardize15() has been applied to state_totals2015, which happens in a
# later cell; run that cell first.

labels = state_totals2015['State']
plt.rcParams["figure.figsize"] = [12.00, 5]
# Bar width. It used to be assigned only in a later plotting cell, so this cell
# raised NameError on a fresh kernel; 1 is the value that later cell uses.
width = 1

# First chart: every group except Hispanic, drawn semi-transparently on top of each other.
alpha = 0.5
fig, ax = plt.subplots()
for column in ['LEP Native American Students',
               'LEP Asian Students',
               'LEP Native Hawaiian Students',
               'LEP Black Students',
               'LEP White Students']:
  ax.bar(labels, state_totals2015[column].to_numpy(), width, label=column, alpha = alpha)

ax.set_ylabel('Number of Students')
ax.set_title('CRDC Counts of LEP Students by State (2015)')
ax.legend()

plt.xticks(rotation = 90)
plt.show()

# Second chart: Hispanic students only, fully opaque.
alpha = 1
fig, ax = plt.subplots()
ax.bar(labels, state_totals2015['LEP Hispanic Students'].to_numpy(), width, label='LEP Hispanic Students', alpha = alpha)

ax.set_ylabel('Number of Students')
ax.set_title('CRDC Counts of LEP Students by State (2015)')
ax.legend()

plt.xticks(rotation = 90)
plt.show()

In [None]:
#Preprocessing 2014 & 2012 CRDC Data (the 2013-14 and 2011-12 collections)

enrollment2013 = pd.read_excel('2013-14 CRDC/School/CRDC-collected data file for Schools/03 Enrollment.xlsx')

# Per-state sums of every numeric column, states kept in order of first
# appearance. Replaces a DataFrame.append loop (append was removed in pandas 2.0).
state_totals2013 = (
  enrollment2013.groupby('LEA_STATE', sort=False)
  .sum(numeric_only=True)
  .reset_index()
  .rename(columns={'LEA_STATE': 'State'})
)

enrollment2011 = pd.read_excel('2011-12 Public Use File/School/CRDC -collected data file for Schools/Pt 1-Enrollment/SCH_CRDC-collected data file for Schools_Pt 1-Enrollment_09-1 - Students who are Limited English Proficient.xlsx')

# Columns at positions 7..23 (presumably the count columns - confirm against
# the spreadsheet) are cleaned so they can be cast to float: blank cells and
# cells containing the '‡' marker become NaN.
col_names = enrollment2011.columns
for count_col in col_names[7:24]:
  enrollment2011[count_col] = (
    enrollment2011[count_col]
    .replace(r'^\s*$', np.nan, regex=True)
    .replace(r'‡', np.nan, regex=True)
    .astype(float)
  )

state_totals2011 = (
  enrollment2011.groupby('LEA_STATE', sort=False)
  .sum(numeric_only=True)
  .reset_index()
  .rename(columns={'LEA_STATE': 'State'})
)

# Re-order both tables to follow the 2015 state list, which is what the later
# cells use as the 'State' column of every per-year frame. The original loop
# iterated state_totals2017['State'], but no cell in this notebook defines
# state_totals2017.
state_order = state_totals2015['State']
st13 = pd.concat(
  [state_totals2013.loc[state_totals2013['State'] == state] for state in state_order],
  ignore_index=True,
)
st11 = pd.concat(
  [state_totals2011.loc[state_totals2011['State'] == state] for state in state_order],
  ignore_index=True,
)

In [None]:
# Load the three per-state LEP survey tables.
# NOTE(review): lep2015 is read from the acs2013 file; no acs2015 file is
# downloaded anywhere in this notebook - confirm the naming is intentional.
lep2015, lep2014, lep2012 = (
  pd.read_csv(csv_path, encoding='windows-1252')
  for csv_path in (
    "acs2013_lep_allstate.csv",
    "acs2014_lep_allstate.csv",
    "acs2012_lep_allstate.csv",
  )
)

def rename(lep):
  """Add six 'LEP <group>' columns to ``lep`` in place and return None.

  Each new column is the element-wise sum of the language columns listed for
  that group below. 'Hebrew' is deliberately not part of any group (it was
  commented out of the Asian sum).
  """
  language_groups = {
    'LEP White': [
      'French (incl. Patois, Cajun)', 'French Creole', 'Italian',
      'Portuguese/Portuguese Creole', 'German', 'Yiddish',
      'Other West Germanic langs.', 'Greek', 'Russian', 'Polish',
      'Serbo-Croatian', 'Other Slavic langs.', 'Armenian', 'Hungarian',
      'Other Indo-European langs.',
    ],
    'LEP Hispanic': ['Spanish or Spanish Creole'],
    'LEP Native American': ['Navajo', 'Other Native North American'],
    'LEP Asian': [
      'Gujarati', 'Hindi', 'Urdu', 'Other Indic langs.', 'Chinese',
      'Japanese', 'Korean', 'Mon-Khmer(Cambodian)', 'Hmong', 'Thai',
      'Laotian', 'Vietnamese', 'Other Asian langs.', 'Arabic',
    ],
    'LEP Native Hawaiian': ['Tagalog', 'Other Pacific Island langs.'],
    'LEP Black': ['African langs.'],
  }
  for group, languages in language_groups.items():
    # Left-to-right `+` so NaN handling matches plain column addition.
    total = lep[languages[0]]
    for language in languages[1:]:
      total = total + lep[language]
    lep[group] = total

# Add the grouped 'LEP <group>' columns to each survey table (rename() mutates in place).
rename(lep2015)
rename(lep2014)
rename(lep2012)

# NOTE(review): row label 49 (2012 file) and row label 51 (the other two files)
# are removed without explanation - presumably non-state rows; confirm against
# the CSVs.
# The 2012 table is additionally sorted by 'Location' and renumbered from 0.
# reset_index(drop=True) replaces a hand-written relabel loop that assumed
# exactly 51 remaining rows.
lep2012 = lep2012.drop(index=49).sort_values('Location').reset_index(drop=True)

lep2014 = lep2014.drop(index=51)
lep2015 = lep2015.drop(index=51)

In [None]:
def standardize15(df):
  """Add standardized LEP columns to a 2015-16 frame in place; returns None.

  For each of six groups this adds 'LEP <group> Students' (male + female LEP
  enrollment) and then 'Percent LEP <group> Students' (that same sum divided by
  overall male + female enrollment).

  NOTE(review): the 'Percent' columns here are fractions of overall enrollment,
  while standardize14/standardize12 divide by an LEP total and multiply by 100.
  """
  groups = (
    ('LEP Hispanic Students', 'Percent LEP Hispanic Students',
     "Enrollment of Students who are Limited English Proficient (LEP): Hispanic Male",
     "Enrollment of Students who are Limited English Proficient (LEP): Hispanic Female"),
    ('LEP Native American Students', 'Percent LEP Native American Students',
     "Enrollment of Students who are Limited English Proficient (LEP): American Indian/Alaska Native Male",
     "Enrollment of Students who are Limited English Proficient (LEP): American Indian/Alaska Native Female"),
    ('LEP Asian Students', 'Percent LEP Asian Students',
     "Enrollment of Students who are Limited English Proficient (LEP): Asian Male",
     "Enrollment of Students who are Limited English Proficient (LEP): Asian Female"),
    ('LEP Native Hawaiian Students', 'Percent LEP Native Hawaiian Students',
     "Enrollment of Students who are Limited English Proficient (LEP): Native Hawaiian/Pacific Islander Male",
     "Enrollment of Students who are Limited English Proficient (LEP): Native Hawaiian/Pacific Islander Female"),
    ('LEP Black Students', 'Percent LEP Black Students',
     "Enrollment of Students who are Limited English Proficient (LEP): Black Male",
     "Enrollment of Students who are Limited English Proficient (LEP): Black Female"),
    ('LEP White Students', 'Percent LEP White Students',
     "Enrollment of Students who are Limited English Proficient (LEP): White Male",
     "Enrollment of Students who are Limited English Proficient (LEP): White Female"),
  )
  # Counts first, for every group ...
  for count_col, _, male_col, female_col in groups:
    df[count_col] = df[male_col] + df[female_col]
  # ... then the shares, so the new columns keep the same order as before.
  for _, share_col, male_col, female_col in groups:
    enrolled = df['Overall Student Enrollment: Calculated Male Total'] + df['Overall Student Enrollment: Calculated Female Total']
    df[share_col] = (df[male_col] + df[female_col])/enrolled

# Add the standardized LEP columns to the 2015 state totals in place.
# (This line used to call `standardize`, which is not defined; the function is standardize15.)
standardize15(state_totals2015)

def standardize14(df):
  """Add standardized LEP columns to a 2013-14 frame in place; returns None.

  Adds 'LEP <group> Students' as SCH_LEPENR_<code>_M + SCH_LEPENR_<code>_F for
  each group, then 'Percent LEP <group> Students' as that count divided by
  (TOT_LEPPROGENR_M + TOT_LEPPROGENR_F), times 100.

  Fix: 'LEP White Students' was previously summed from the BL (Black) columns,
  which duplicated the Black count; it now uses the WH columns.
  """
  race_codes = {
    'Hispanic': 'HI',
    'Native American': 'AM',
    'Asian': 'AS',
    'Native Hawaiian': 'HP',
    'Black': 'BL',
    'White': 'WH',
  }
  for group, code in race_codes.items():
    df['LEP ' + group + ' Students'] = df['SCH_LEPENR_' + code + '_M'] + df['SCH_LEPENR_' + code + '_F']

  lep_program_total = df['TOT_LEPPROGENR_M'] + df['TOT_LEPPROGENR_F']
  # Same column order as before: White first, then the remaining groups.
  for group in ('White', 'Hispanic', 'Native American', 'Asian', 'Native Hawaiian', 'Black'):
    df['Percent LEP ' + group + ' Students'] = df['LEP ' + group + ' Students']/lep_program_total * 100

# Add the standardized LEP count/percent columns to st13 (the re-ordered 2013-14 state totals); mutates st13 in place.
standardize14(st13)

def standardize12(df):
  """Add standardized LEP columns to a 2011-12 frame in place; returns None.

  Adds 'LEP <group> Students' as the male + female '*_7_LEP' columns of each
  group, then 'Percent LEP <group> Students' as that count divided by
  (M_TOT_7_LEP + F_TOT_7_LEP), times 100.
  """
  count_sources = (
    ('LEP Hispanic Students', 'M_HIS_7_LEP', 'F_HIS_7_LEP'),
    ('LEP Native American Students', 'M_AME_7_LEP', 'F_AME_7_LEP'),
    ('LEP Asian Students', 'M_ASI_7_LEP', 'F_ASI_7_LEP'),
    ('LEP Native Hawaiian Students', 'M_HI_PAC_7_LEP', 'F_HI_PAC_7_LEP'),
    ('LEP Black Students', 'M_BLA_7_LEP', 'F_BLA_7_LEP'),
    ('LEP White Students', 'M_WHI_7_LEP', 'F_WHI_7_LEP'),
  )
  for count_col, male_col, female_col in count_sources:
    df[count_col] = df[male_col] + df[female_col]

  # White comes first among the percent columns, matching the original column order.
  percent_sources = (
    ('Percent LEP White Students', 'LEP White Students'),
    ('Percent LEP Hispanic Students', 'LEP Hispanic Students'),
    ('Percent LEP Native American Students', 'LEP Native American Students'),
    ('Percent LEP Asian Students', 'LEP Asian Students'),
    ('Percent LEP Native Hawaiian Students', 'LEP Native Hawaiian Students'),
    ('Percent LEP Black Students', 'LEP Black Students'),
  )
  for percent_col, count_col in percent_sources:
    df[percent_col] = df[count_col]/(df['M_TOT_7_LEP'] + df['F_TOT_7_LEP']) * 100
  
# Add the standardized LEP count/percent columns to st11 (the re-ordered 2011-12 state totals); mutates st11 in place.
standardize12(st11)

In [None]:
# Column template for the per-year frames: a year offset, the state, the six
# survey-based group counts, and the six CRDC-based group counts.
columns = [
  'Years Since 2010',
  'State',
  'LEP Hispanic',
  'LEP White',
  'LEP Native American',
  'LEP Asian',
  'LEP Native Hawaiian',
  'LEP Black',
  'LEP Black Students',
  'LEP Hispanic Students',
  'LEP White Students',
  'LEP Native American Students',
  'LEP Asian Students',
  'LEP Native Hawaiian Students',
]

In [None]:
# Three independent, empty frames, one per year; they are filled in further down this cell.
y12, y14, y15 = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

def add_cols(df, columns):
  """Add every name in ``columns`` to ``df`` (in place) as an all-NaN column.

  An existing column with the same name is overwritten. Returns None.
  """
  for col in columns:
    # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
    df[col] = np.nan

# Give every per-year frame the template columns, then key all three by the 2015 state list.
add_cols(y12, columns)
add_cols(y14, columns)
add_cols(y15, columns)

y12['State'] = state_totals2015['State']
y14['State'] = state_totals2015['State']
y15['State'] = state_totals2015['State']
# (An earlier variant that filled 'Years Since 2010' and stacked the three
# years into one long frame was disabled and has been removed.)

# 'Percent LEP Black Students' was missing from this list although all three standardize functions create it.
CRDC_Metrics = ['LEP Hispanic Students', 'LEP White Students', 'LEP Native American Students', 'LEP Asian Students', 'LEP Black Students', 'LEP Native Hawaiian Students', 'Percent LEP Hispanic Students', 'Percent LEP White Students', 'Percent LEP Native American Students', 'Percent LEP Asian Students', 'Percent LEP Native Hawaiian Students', 'Percent LEP Black Students']
# Only the count columns are listed: rename() creates no 'Percent LEP <group>'
# columns on the survey tables and nothing else in this notebook does, so
# requesting them raised KeyError.
Survey_Metrics = ['LEP Hispanic', 'LEP White', 'LEP Native American', 'LEP Asian', 'LEP Black', 'LEP Native Hawaiian']

# NOTE(review): values are copied by index label, so this assumes row i refers
# to the same state in the survey tables, st11/st13 and state_totals2015 - confirm.
for col in Survey_Metrics:
  y12[col] = lep2012[col]
  y14[col] = lep2014[col]
  y15[col] = lep2015[col]

for col in CRDC_Metrics:
  y12[col] = st11[col]
  y14[col] = st13[col]
  y15[col] = state_totals2015[col]

# Model inputs: the y12 and y14 columns side by side (merge suffixes _x and _y).
df = (
  y12.merge(y14, on = 'State')
  .drop(columns = ['Years Since 2010_x', 'Years Since 2010_y', 'State'])
  .reset_index(drop = True)
)

# Model targets: the y15 columns. Built as a new frame so y15 itself is not
# altered (the original aliased y15 and dropped its columns in place).
df_output = y15.drop(columns = ['Years Since 2010', 'State']).reset_index(drop = True)

In [None]:
def r_square(pred, output, index):
  """R-squared of one predicted row against the observed row at position ``index``.

  pred:   1-D array of predictions, one value per column of ``output``.
  output: DataFrame of observed targets.
  index:  integer position of the observed row (used with ``iloc``).

  Returns 1 - SS_res/SS_tot, with both sums taken across the columns of that
  row. SS_tot is measured from the observed values to the column means; the
  previous version used the predictions there, which is not the total sum of
  squares. The result is -inf/NaN when the observed row equals the column means.
  """
  actual = output.iloc[index]
  ss_res = np.sum((pred - actual)**2)
  ss_total = np.sum((actual - output.mean())**2)
  return 1 - ss_res/ss_total

def r_square_per_feature(pred, output, index):
  """Per-column R-squared-style score and signed error for one predicted row.

  pred:   1-D array of predictions, one value per column of ``output``.
  output: DataFrame of observed targets.
  index:  integer position of the observed row (used with ``iloc``).

  Returns a tuple ``(scores, errors)``: ``scores`` is a Series indexed by the
  columns of ``output`` holding 1 - (pred - actual)**2 / (actual - mean)**2,
  and ``errors`` is the array ``pred - actual``. The denominator is measured
  from the observed value (the previous version used the prediction). A column
  whose observed value equals its mean yields inf/NaN.
  """
  out = output.iloc[index].to_numpy()
  errors = pred - out
  ss_res = errors**2
  ss_total = (out - output.mean(axis = 0))**2
  return(1-(ss_res/ss_total), errors)

# Target columns: the six survey group names, and the CRDC names that mirror them ('<name> Students').
Survey_Metrics = [
  'LEP Hispanic',
  'LEP White',
  'LEP Native American',
  'LEP Asian',
  'LEP Native Hawaiian',
  'LEP Black',
]
CRDC_Metrics = [name + ' Students' for name in Survey_Metrics]

# Input columns: the same names as they come out of the merge, first every _x column, then every _y column.
CRDC_input = [name + '_x' for name in CRDC_Metrics] + [name + '_y' for name in CRDC_Metrics]
Survey_input = [name + '_x' for name in Survey_Metrics] + [name + '_y' for name in Survey_Metrics]

def add_polynomial_features(df, degree):
    """Return a new DataFrame holding the polynomial expansion of ``df``.

    df:     DataFrame of numeric input columns.
    degree: maximum polynomial degree passed to PolynomialFeatures.

    The bias column is excluded. Output columns are named after all input
    columns (previously only the first column name was passed, which only
    worked for single-column frames). The result has a fresh 0..n-1 index.
    """
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly.fit_transform(df)
    input_names = [str(col) for col in df.columns]
    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for releases that predate get_feature_names_out().
    if hasattr(poly, 'get_feature_names_out'):
        feature_names = poly.get_feature_names_out(input_names)
    else:
        feature_names = poly.get_feature_names(input_names)
    return pd.DataFrame(poly_features, columns=feature_names)

def renamer(squo, new):
  """Build a dict mapping each entry of ``squo`` to the entry of ``new`` at the same position.

  Extra entries in ``new`` are ignored; a ``new`` shorter than ``squo`` raises
  IndexError. Repeated entries in ``squo`` keep the last mapping.
  """
  return {old_name: new[position] for position, old_name in enumerate(squo)}

# NOTE(review): `model` was used below without being assigned anywhere in this
# notebook. A plain least-squares regressor is assumed because
# sklearn.linear_model is the only estimator module imported - confirm that
# this is the estimator the analysis was originally run with.
model = sklearn.linear_model.LinearRegression()

# The CRDC inputs/targets are renamed to the survey column names so that a
# model fitted on one source can be applied to the other. rename() returns new
# frames, which avoids in-place edits on slices of df / df_output.
CRDC_df = df[CRDC_input].rename(columns = renamer(CRDC_input, Survey_input))
CRDC_output = df_output[CRDC_Metrics].rename(columns = renamer(CRDC_Metrics, Survey_Metrics))

Census_df = df[Survey_input]
Census_output = df_output[Survey_Metrics]


def _leave_one_out(inputs, outputs, state_names):
  """Refit `model` once per row with that row held out, print its scores, and return the mean overall score.

  inputs / outputs: DataFrames with one row per state, in the same order.
  state_names:      Series giving the state label for each row position.
  """
  performance = 0
  row_positions = np.arange(len(inputs))
  for position in row_positions:
    kept = row_positions != position
    model.fit(inputs.loc[kept], outputs.loc[kept])
    # Double brackets keep the held-out row as a one-row DataFrame.
    k_pred = model.predict(inputs.iloc[[position]])
    r_squared = r_square(k_pred[0], outputs, position)
    performance += r_squared
    r_squa, direction = r_square_per_feature(k_pred[0], outputs, position)
    print(state_names.iloc[position])
    print(position)
    print("Overall: " + str(r_squared))
    print("Features: ")
    print(r_squa)
    print("\n* * * * * * * * * * * * * * * * *\n")
  # Divide by the actual number of rows instead of a hard-coded 51.
  return performance/len(inputs)


# State labels come from state_totals2015; the original looked them up in
# state_totals2017, which no cell in this notebook defines.
print("\n---------------------Building CRDC Model---------------------")
performance = _leave_one_out(CRDC_df, CRDC_output, state_totals2015['State'])
print("\n\nCulmulative Performance: " + str(performance))

print("\n---------------------Building Census Model---------------------")
performance = _leave_one_out(Census_df, Census_output, state_totals2015['State'])
print("\n\n\nCulmulative Performance: " + str(performance))

In [None]:
# Repeat the CRDC leave-one-out check with one row removed from the data.
# NOTE(review): row label 2 is excluded without explanation - presumably an
# outlier state; confirm which state this is and why it is dropped.
EXCLUDED_ROW = 2
crdc_modified = CRDC_df.loc[CRDC_df.index != EXCLUDED_ROW]
crdc_output_m = CRDC_output.loc[CRDC_output.index != EXCLUDED_ROW]

performance = 0
# Hold rows out by position. The original masked by index label but predicted
# by position; with label 2 missing, from position 2 onward the "held-out" row
# was still in the training data and the wrong state name was printed.
row_positions = np.arange(len(crdc_modified))
for position in row_positions:
  kept = row_positions != position
  model.fit(crdc_modified.loc[kept], crdc_output_m.loc[kept])
  k_pred = model.predict(crdc_modified.iloc[[position]])
  r_squared = r_square(k_pred[0], crdc_output_m, position)
  performance += r_squared
  r_squa, direction = r_square_per_feature(k_pred[0], crdc_output_m, position)
  # Label of this row in the unmodified frames, used to look up the state.
  # state_totals2015 replaces state_totals2017, which no cell defines.
  original_row = crdc_modified.index[position]
  print(state_totals2015.loc[original_row, 'State'])
  print(original_row)
  print("Overall: " + str(r_squared))
  print("Features: ")
  print(r_squa)
  print("\n* * * * * * * * * * * * * * * * *\n")
# Average over the rows actually evaluated instead of a hard-coded 50.
print("\n\nCulmulative Performance: " + str(performance/len(crdc_modified))+ "\n\n\n")

In [None]:
# Fit on the CRDC data with the excluded row removed, then apply that fit to the survey inputs.
CRDC_Mod = model.fit(crdc_modified, crdc_output_m)
estimation = CRDC_Mod.predict(Census_df)

# Surveyed value minus model estimate, totalled per column (displayed as the cell output).
predictions = Census_output - estimation
predictions.sum(axis=0)

In [None]:
# Column totals of Census_output (the survey target columns), shown as the cell output.
Census_output.sum(axis = 0)

In [None]:
# Same fit as the residual cell above the totals; here the column totals of the estimates themselves are displayed.
CRDC_Mod = model.fit(crdc_modified, crdc_output_m)
estimation = CRDC_Mod.predict(Census_df)
estimation.sum(axis=0)

In [None]:
metrics = ['LEP Hispanic', 'LEP White', 'LEP Native American', 'LEP Asian', 'LEP Native Hawaiian', 'LEP Black']
width = 1
alpha = 0.70

# Size the plotted figure directly. Calling figure(figsize=..., dpi=...) and
# then plt.subplots() created an empty extra figure, and the requested
# size/dpi never reached the one that was drawn on.
fig, ax = plt.subplots(figsize=(10, 10), dpi=80)

# Both series share the same x positions, so the bars overlap; alpha keeps both visible.
ax.bar(metrics, estimation.sum(axis = 0), width, label = 'Predicted Value', alpha = alpha)
ax.bar(metrics, Census_output.sum(axis= 0), width, label = 'Census Data', alpha = alpha)
ax.set_ylabel('Number of People')
ax.set_title('National LEP Populations: CRDC Model Predictions vs. Census Bureau Surveyed')
ax.legend()
plt.xticks(rotation = 60)
plt.show()

In [None]:
import math
# The triple-quoted block below is a bare string literal, not executed code: it
# holds an earlier two-argument version of print_state_data. It refers to
# lost_reps_states / gained_reps_states, which no cell visible in this notebook
# defines.
"""
def print_state_data(sus_states, net_impact):
  keys = sus_states.keys()
  print("\n* * * * * * States that Lost Reps * * * * * *")
  for state in lost_reps_states:
    if state in keys:
      print("\n" + state)
      print(sus_states[state])
      print("Net Impact: " + str(net_impact[state]))
  print("\n* * * * * * States that Gained Reps * * * * * *")
  for state in gained_reps_states:
    if state in keys:
      print("\n" + state)
      print(sus_states[state])
      print("Net Impact: " + str(net_impact[state]))
  print("\n* * * * * * Neither * * * * * *")
  for state in keys:
    if state not in lost_reps_states and state not in gained_reps_states:
      print("\n" + state)
      print(sus_states[state])
      print("Net Impact: " + str(net_impact[state]))
"""
def print_state_data(sus_states, sus_states_two, net_impact):
  """Print, per state, the findings on which both result sets agree in direction.

  sus_states / sus_states_two: dicts of state -> {metric: message}.
  net_impact: accepted but not used.

  Every state of ``sus_states`` gets a header line. A metric is printed (with
  the message from ``sus_states``) only when it is present for that state in
  both dicts and both messages contain 'Overcounted', or both contain
  'Undercounted'.
  """
  directions = ('Overcounted', 'Undercounted')
  for state, findings in sus_states.items():
    print("\n" + state)
    for metric, message in findings.items():
      # Looked up per metric so a state with no findings never needs an entry in sus_states_two.
      counterpart = sus_states_two[state]
      if metric not in counterpart:
        continue
      other_message = counterpart[metric]
      if any(word in message and word in other_message for word in directions):
        print(str(metric) + " : " + str(message))

def _flag_discrepancies(relative_errors, state_names, threshold):
  """Describe, per state, every column whose relative error exceeds ``threshold`` in magnitude.

  relative_errors: DataFrame with one row per state and one column per group.
  state_names:     Series giving the state label for each row position.
  Returns a dict of state -> {column: message}; states with nothing flagged map to {}.
  """
  flagged = {}
  for position in range(len(relative_errors)):
    row = relative_errors.iloc[position]
    discrepancies = {}
    for col in relative_errors.columns:
      val = row[col]
      # abs() replaces math.sqrt(float(val)**2); NaN never passes the test.
      if abs(val) > threshold:
        if val < 0:
          discrepancies[col] = ('Undercounted by ' + str(val * 100) + '%')
        else:
          discrepancies[col] = ('Overcounted by ' + str(val * 100) + '%')
    flagged[state_names.iloc[position]] = discrepancies
  return flagged


error_threshold = .1
# State labels come from state_totals2015; the original looked them up in
# state_totals2017, which no cell in this notebook defines.
state_names = state_totals2015['State']

#print("\n---------------------Predicting Census Data Using CRDC Model---------------------")
CRDC_Model = model.fit(CRDC_df, CRDC_output)
estimation = CRDC_Model.predict(Census_df)
# Relative gap between the surveyed values and the CRDC-fitted estimate.
predictions = (Census_output - estimation)/(estimation)
state_errors = _flag_discrepancies(predictions, state_names, error_threshold)

print("\n---------------------Predicting CRDC Data Using Census Model---------------------")
# This refits the same `model` object, so CRDC_Model no longer holds the CRDC fit after this line.
Census_Model = model.fit(Census_df, Census_output)
estimation = Census_Model.predict(CRDC_df)
predictions = (estimation - CRDC_output)/(CRDC_output)
net = (CRDC_output - estimation).sum(axis = 1)
census_state_errors = _flag_discrepancies(predictions, state_names, error_threshold)

# Net nominal difference per state. print_state_data accepts this argument but does not use it.
impacts = {state_names.iloc[position]: net.iloc[position] for position in range(len(net))}
print_state_data(state_errors, census_state_errors, impacts)

In [None]:
# Column name -> column position in the prediction arrays. Derived from the
# frame instead of a hand-written dict so it cannot drift from the column order.
keys = {col: position for position, col in enumerate(Census_output.columns)}

CRDC_Model = model.fit(CRDC_df, CRDC_output)
cr_estimations = CRDC_Model.predict(Census_df)
cr_errors = (Census_output - cr_estimations)          # nominal gap
cr_predictions = cr_errors/(cr_estimations)           # relative gap

# This refits the same `model` object; cr_estimations was computed before the refit.
Census_Model = model.fit(Census_df, Census_output)
census_estimations = Census_Model.predict(CRDC_df)
# NOTE(review): the discrepancy-flagging cell computes this comparison with the
# opposite sign (estimation - CRDC_output); the same-sign cross-check below
# depends on it - confirm which direction is intended.
census_predictions = CRDC_output - census_estimations

error_threshold = .1
net_impact = dict.fromkeys(keys, 0)    # passes the threshold and the cross-check
net_impact2 = dict.fromkeys(keys, 0)   # passes the threshold only
net_impact3 = dict.fromkeys(keys, 0)   # every row with a positive estimate
impacts = {col: [] for col in keys}    # per-state contribution to net_impact (0 when filtered out)

for index in range(len(cr_predictions)):
  for col in cr_predictions.columns:
    cr_val = cr_predictions.loc[index, col]
    census_val = census_predictions.loc[index, col]
    error = cr_errors.loc[index, col]
    contribution = 0
    if cr_estimations[index][keys[col]] > 0:
      net_impact3[col] += error
      if abs(cr_val) > error_threshold:
        net_impact2[col] += error
        same_direction = (cr_val < 0 and census_val < 0) or (cr_val > 0 and census_val > 0)
        # NOTE(review): relative gaps of 5 (500%) or more are discarded - presumably as outliers; confirm.
        if same_direction and cr_val < 5:
          net_impact[col] += error
          contribution = error
    impacts[col].append(contribution)

print("\n------------------- Raw Predictions -------------------\n")
print(net_impact3)
print("\n------------------- With Error Threshold -------------------\n")
print(net_impact2)
print("\n------------------- Error Threshold & Cross Checked -------------------\n")
print(net_impact)

In [None]:
# Overlay, per state, the values collected in `impacts` for each group.
# `width` is whatever an earlier plotting cell last assigned.
labels = state_totals2015['State']
plt.rcParams["figure.figsize"] = [12.00, 5]
alpha = 0.5
fig, ax = plt.subplots()

for group in ('LEP Native American',
              'LEP Asian',
              'LEP Native Hawaiian',
              'LEP Black',
              'LEP White',
              'LEP Hispanic'):
  ax.bar(labels, impacts[group], width, label=group, alpha=alpha)

ax.set_ylabel('Nominal Difference')
ax.set_title('Nominal Over/Undercounts of LEP Ethnic Groups by State')
ax.legend()

plt.xticks(rotation=90)
plt.show()