In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import researchpy as rp
import pandas as pd
import numpy as np

In [2]:
data_folder = Path('../data/')

fn = Path(data_folder, 'assessments.csv')
df = pd.read_csv(fn)


def _assessment_subset(assessments, keyword):
    """Return the rows of one assessment type, with integer subscores.

    Parameters
    ----------
    assessments : pandas.DataFrame
        Frame with an 'AssessmentName' column and 'CALC_'-prefixed
        subscore columns.
    keyword : str
        Text looked for inside 'AssessmentName' to select the rows.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding only the matching
        rows, without columns that are entirely empty for those rows, and
        with every 'CALC_' column converted with ``int``.

    Raises
    ------
    ValueError
        If a remaining 'CALC_' value cannot be converted by ``int``
        (for example a missing value).
    """
    # na=False: a row whose assessment name is missing is treated as
    # "no match" instead of putting NaN into the boolean mask.
    matches = assessments['AssessmentName'].str.contains(keyword, na=False)

    # Drop any completely empty columns,
    # e.g. columns that are specific to the other survey type.
    # dropna returns a new frame, so the caller's frame is left untouched.
    subset = assessments[matches].dropna(axis=1, how='all')

    # Convert subscores to an int for math purposes.
    for col in [c for c in subset.columns if c.startswith('CALC_')]:
        subset[col] = subset[col].apply(int)

    return subset


individuals = _assessment_subset(df, 'Individuals')
youth = _assessment_subset(df, 'Youth')

## Analysis: Chi-square tests

**NOTE:** For each assessment type and subset of the data, run two chi-square tests on every subscore: one that compares Black and White people, and another that compares Latino and White people. Then, for each subscore/assessment/group combination, compare the result from the full data ("ALL") with the result from the "WITH EXCLUSIONS" subset and check whether the two agree, i.e. both are significant (p < 0.05) or both are not.

In [3]:
# Significance threshold applied to every chi-square p-value below.
ALPHA = 0.05

# One row per (subscore, assessment, subset, group) chi-square test.
chidata = []

for calc_type in ['ALL', 'WITH EXCLUSIONS']:
    for group in ['Black', 'Latino']:
        for assessment in df['AssessmentName'].unique():
            af = df[df['AssessmentName'].eq(assessment)]
            if calc_type == 'WITH EXCLUSIONS':
                # Keep only rows whose recorded total equals the calculated total.
                af = af[af['TOTAL_SCORE'].eq(af['CALC_TOTAL_SCORE'])]
            # Drop columns that are entirely empty for this assessment/subset.
            af = af.dropna(how='all', axis=1)

            subscores = [c for c in af.columns
                         if c.startswith('CALC_') and not c.endswith('_SCORE')]
            subscores.append('GENERAL_SUB_SCORE')

            # Recode race to {White, group}; every other value becomes NaN.
            # This is read from the filtered frame `af` (not the full `df`) and
            # computed once per assessment, since it does not depend on the
            # subscore being tested.
            race = af['Race/Ethnicity']
            chi_re = race.where(race.isin(['White', group])).rename('ChiRE')

            for s in subscores:
                # Only the test summary is used; the crosstab and the
                # expected frequencies returned alongside it are discarded.
                _, test_results, _ = rp.crosstab(chi_re, af[s].rename('ChiScore'),
                                                 test="chi-square",
                                                 expected_freqs=True,
                                                 prop="cell")
                row = [s, assessment, calc_type, group]
                row.extend(test_results['results'])
                chidata.append(row)

chidf = pd.DataFrame(
    chidata,
    columns=['score', 'assessment', 'subset', 'group',
             'pearson chi-square', 'p-value', 'cramers phi'])
chidf['sig'] = chidf['p-value'].lt(ALPHA)

# Pivot to compare like All/With Exclusions subsets
pivoted = chidf[['score', 'assessment', 'group', 'subset', 'sig']].pivot_table(
    values='sig', index=['score', 'assessment', 'group'], columns='subset'
).copy()

# Count how many combinations reach the same significance verdict in both subsets.
pivoted['ALL'].eq(pivoted['WITH EXCLUSIONS']).value_counts()

True    68
dtype: int64

## Relative Comparisons

**NOTE:** Generate subscore charts (as seen in the methodology) for each assessment and subset of the data. Then, for each subscore, check whether the comparison between groups (Black vs. White and Latino vs. White) points the same way in the "ALL" chart as in the "WITH EXCLUSIONS" chart.

In [4]:
# assessment name -> {subset name -> table of subscore rates by race}
subscore_charts = dict()

COMPARED_RACES = ['Black', 'Latino', 'White']

for subset_name in ['ALL', 'WITH EXCLUSIONS']:
    for assessment in df['AssessmentName'].unique():
        is_assessment = df['AssessmentName'].eq(assessment)
        af = df[is_assessment].copy()
        if subset_name == 'WITH EXCLUSIONS':
            # Keep only rows whose recorded total equals the calculated total.
            totals_agree = af['TOTAL_SCORE'].eq(af['CALC_TOTAL_SCORE'])
            af = af[totals_agree].copy()
        # Remove columns that hold no data for this assessment/subset.
        af = af.dropna(how='all', axis=1).copy()

        subscores = [
            name for name in af.columns
            if name.startswith('CALC_') and not name.endswith('_SCORE')
        ] + ['GENERAL_SUB_SCORE']

        # Subscores must be ints before averaging.
        for name in subscores:
            af[name] = af[name].apply(int)

        in_comparison = af['Race/Ethnicity'].isin(COMPARED_RACES)
        by_race = af[in_comparison].groupby('Race/Ethnicity')
        # Rows are subscores, columns are races.
        rates = by_race[subscores].mean().T

        # Mean subscores scaled by 100, rounded to one decimal,
        # ordered by the 'Black' column.
        chart = rates.mul(100).round(1).sort_values(by='Black').copy()
        subscore_charts.setdefault(assessment, {})[subset_name] = chart

In [5]:
# For every assessment and group, flag each subscore where the direction of the
# group-vs-White comparison is the same in the 'ALL' and 'WITH EXCLUSIONS' charts.
comparisons = []
for assessment, charts in subscore_charts.items():
    for group in ['Black', 'Latino']:
        full_chart = charts['ALL']
        excl_chart = charts['WITH EXCLUSIONS']

        # True where the group's rate is at or below the White rate.
        at_or_below_full = full_chart[group].le(full_chart['White'])
        at_or_below_excl = excl_chart[group].le(excl_chart['White'])

        same_direction = at_or_below_full.eq(at_or_below_excl)
        tk = same_direction.rename("Same Comparative Relationship").to_frame()
        tk = tk.assign(group=group, assessment=assessment)
        comparisons.append(tk)

all_comparisons = pd.concat(comparisons)

all_comparisons['Same Comparative Relationship'].value_counts()

True    68
Name: Same Comparative Relationship, dtype: int64