In [15]:
# Importing modules
## helpful packages
import pandas as pd
import numpy as np
import random
import re
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

## nltk imports
import nltk
### uncomment and run these lines if you haven't downloaded relevant nltk add-ons yet
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
from nltk import pos_tag
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## spacy imports
import spacy
### uncomment and run the below line if you haven't loaded the en_core_web_sm library yet
#! python -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

## vectorizer
from sklearn.feature_extraction.text import CountVectorizer

## sentiment
#!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## lda
from gensim import corpora
import gensim

# matplotlib
import matplotlib.pyplot as plt

## Notebook display settings
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Never truncate the text of a DataFrame cell when rendering (None = no width limit).
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the four Excel workbooks from ../files into DataFrames

demo_df = pd.read_excel(io="../files/Dartmouth_Data_Set.xlsx")
FEIS_df = pd.read_excel(io="../files/START_FEIS_Data.xlsx")
time_df = pd.read_excel(io="../files/Dartmouth_Time_Data.xlsx")
dict_df = pd.read_excel(io="../files/Final SIRS_Data_Dictionary_V13.1 October 2020.xlsx")

In [3]:
# Keep only the demographic fields used later in the notebook

demographics = demo_df.loc[:, [
    'Local ID',
    'Region',
    'Date Enrolled in START',
    'Gender',
    'Race',
    'Date of birth',
    'Ethnicity',
    'Level of Intellectual Disability',
    'Psychiatric diagnoses',
    'Medical diagnoses',
    'Other Disabilities',
    'Funding',
]]

In [4]:
# Join demographics to the FEIS survey on the respondent ID (inner join:
# only respondents present in both tables are kept)
merged = demographics.merge(
    FEIS_df,
    how='inner',
    left_on='Local ID',
    right_on='Respondent ID #  (SIRS Local ID)',
)

# Pick the free-text questions plus a few demographics and give them short names
merged_short_answer = merged[[
    'Gender',
    'Race',
    'Local ID',
    'What\nadvice would you give to service planners regarding the mental health service\nneeds of persons with IDD and their families?',
    "Was there any particular service that your\nfamily member needed that was not available?",
    "If yes, please describe the service.",
]].rename(columns={
    'Local ID': 'ID',
    'What\nadvice would you give to service planners regarding the mental health service\nneeds of persons with IDD and their families?': 'Advice',
    "Was there any particular service that your\nfamily member needed that was not available?": 'Missing Service',
    "If yes, please describe the service.": 'Service Needed',
})
merged_short_answer




Unnamed: 0,Gender,Race,ID,Advice,Missing Service,Service Needed
0,Male,Other: Mexican,8008815,,No,
1,Female,"Unknown, not collected",6570649,"“Please be aware of her conditions and diagnosis, so many professionals are unfamiliar with the medical history of Citlalli. It is discouraging when professionals do not know Citlalli, but make recommendations for her. Also, it is discouraging when the professionals do not take the opinions of the family seriously.”",Yes,A counselor was not and has not been made available for the last six months.
2,Female,White,434021,,Yes,In-home behavior support
3,Male,White,6580618,Declined to answer/did not know.,Yes,"""After Trevor’s psychiatrist left the office, the office also stopped taking his insurance and as a result, Trevor went without a psychiatrist for a while. Trevor’s family tried their best to get him in with other psychiatrists, but struggled to find one that would treat Trevor. Through SARC, Trevor was referred to Hope Services and will begin seeing a psychiatrist there on 1.27.21."""
4,Male,"Unknown, not collected",354280,"Listen to the parents, take what parents report seriously, and provide tips, not just call the cops, have options/walk parent through it.",Yes,"At home off hour support on phone or in person/respite, have removed for the night for safety reasons."
...,...,...,...,...,...,...
1092,Male,Black or African American,1013197,,No,
1093,Female,White,1100502,,No,
1094,Female,Black or African American,1132230,,Yes,Wraparound services and continuily of care
1095,Male,White,11128011,,No,


In [None]:
# How many rows have no recorded race?

print(f"Number of entries where patient's race is unknown: {merged_short_answer['Race'].str.contains('Unknown, not collected').sum()}")

# Split by gender
demographics_male = merged_short_answer[merged_short_answer['Gender'].eq('Male')]
demographics_female = merged_short_answer[merged_short_answer['Gender'].eq('Female')]
print(f"Number of males overall in the dataset: {len(demographics_male)}")
print(f"Number of females overall in the dataset: {len(demographics_female)}")

# Split each gender into white / non-white.
# NOTE(review): "non-white" is every row whose Race is not exactly "White", so it
# also contains 'Unknown, not collected' and missing values -- confirm this is intended.
male_white = demographics_male[demographics_male['Race'].eq("White")]
male_nonwhite = demographics_male[demographics_male['Race'].ne("White")]

female_white = demographics_female[demographics_female['Race'].eq("White")]
female_nonwhite = demographics_female[demographics_female['Race'].ne("White")]

print(f"Number of white males in the dataset: {len(male_white)}")
print(f"Number of non-white males in the dataset: {len(male_nonwhite)}")
print(f"Number of white females in the dataset: {len(female_white)}")
print(f"Number of non-white females in the dataset: {len(female_nonwhite)}")

# Response rate for the Advice question, overall and per subgroup

print(f"Number of respondents who filled out Advice column: {merged_short_answer['Advice'].notna().sum()}")

perc_responses_white_males = male_white['Advice'].notna().sum() / len(male_white) * 100
perc_responses_nonwhite_males = male_nonwhite['Advice'].notna().sum() / len(male_nonwhite) * 100
perc_responses_white_females = female_white['Advice'].notna().sum() / len(female_white) * 100
perc_responses_nonwhite_females = female_nonwhite['Advice'].notna().sum() / len(female_nonwhite) * 100

print(f"Percentage of white male respondents who filled out Advice column: {perc_responses_white_males}")
print(f"Percentage of nonwhite male respondents who filled out Advice column: {perc_responses_nonwhite_males}")
print(f"Percentage of white female respondents who filled out Advice column: {perc_responses_white_females}")
print(f"Percentage of nonwhite female respondents who filled out Advice column: {perc_responses_nonwhite_females}")

In [5]:
# Gender subsets of the short-answer table
demographics_male = merged_short_answer[merged_short_answer['Gender'].eq('Male')]
demographics_female = merged_short_answer[merged_short_answer['Gender'].eq('Female')]

# Within each gender: Race exactly "White" versus everything else
male_white = demographics_male[demographics_male['Race'].eq("White")]
male_nonwhite = demographics_male[demographics_male['Race'].ne("White")]

female_white = demographics_female[demographics_female['Race'].eq("White")]
female_nonwhite = demographics_female[demographics_female['Race'].ne("White")]



In [6]:
# Stopword vocabulary: NLTK's English list extended with corpus-specific stems

custom_stopwords = ["need", "servic", "famili", "provid", "care", "would"]

stop_words = stopwords.words('english')
stop_words_new = [*stop_words, *custom_stopwords]

# English Snowball stemmer shared by the preprocessing step
snowball = SnowballStemmer("english")

# Preprocessing the advice column
def process(string):
    """Normalise one free-text response into a space-separated string of stems.

    Pipeline: lowercase -> NLTK word tokenisation -> stopword removal ->
    strip every non-letter character from each token -> Snowball stemming ->
    stopword removal again. The second stopword pass is needed because the
    custom stopwords are stems (e.g. "servic", "famili") and can only match
    after stemming.

    Parameters
    ----------
    string : str
        Raw response text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if nothing survives).
    """
    # Set membership is O(1); the module-level stopword collection is a list.
    stopword_set = set(stop_words_new)

    # Input is lowercased once here, so tokens need no further lowercasing.
    tokens = word_tokenize(string.lower())
    content_tokens = [tok for tok in tokens if tok not in stopword_set]

    letters_only = [re.sub('[^A-Za-z]+', '', tok) for tok in content_tokens]

    # Tokens that were pure punctuation/digits are now "" -- drop them so they
    # do not show up as stray whitespace in the joined output.
    stems = [snowball.stem(tok) for tok in letters_only if tok]

    return " ".join(stem for stem in stems if stem and stem not in stopword_set)

# Document-term matrix
def create_dtm(list_of_strings, metadata):
    """Build a dense document-term matrix with per-document metadata attached.

    Parameters
    ----------
    list_of_strings : iterable of str
        One preprocessed document per element.
    metadata : pandas.DataFrame
        One row per document, in the same order as ``list_of_strings``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, left to right: the former index of ``metadata`` (as produced
        by ``reset_index()``), the metadata columns prefixed with
        ``"metadata_"``, then one count column per vocabulary term.
    """
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old versions that lack get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
    dtm_dense_named = pd.DataFrame(dtm_sparse.toarray(), columns=vocabulary)

    # add_prefix returns a new frame, so the caller's metadata keeps its column names.
    metadata_prefixed = metadata.add_prefix("metadata_")

    # reset_index() gives both frames a 0..n-1 index so the column-wise concat aligns row by row.
    dtm_dense_named_withid = pd.concat([metadata_prefixed.reset_index(),
                                        dtm_dense_named], axis = 1)
    return dtm_dense_named_withid

# Topic model
def top_words_topic_model(df, col, num_topics=3, num_words=40, passes=10, random_state=2):
    """Preprocess a free-text column, count its top stems and fit an LDA model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "ID", "Gender", "Race" and ``col``.
    col : str
        Name of the free-text column to analyse.
    num_topics : int, default 3
        Number of LDA topics.
    num_words : int, default 40
        Number of words reported per topic and number of top stems returned.
    passes : int, default 10
        Number of LDA training passes over the corpus.
    random_state : int, default 2
        Seed passed to the LDA model.

    Returns
    -------
    list
        ``[topics, top_terms, ldamod, corpus, dictionary]`` where ``topics`` is
        the output of ``ldamod.print_topics``, ``top_terms`` is a Series of the
        ``num_words`` most frequent stems (descending), ``ldamod`` is the
        fitted gensim LDA model, ``corpus`` is the bag-of-words corpus and
        ``dictionary`` is the gensim Dictionary built from the tokens.

    Raises
    ------
    ValueError
        If no row has a string response together with ID, Gender and Race.
    """
    # Keep only rows whose response is an actual string
    df = df[df[col].apply(type)==str]
    # Restrict to the examined column plus identifiers and drop incomplete rows;
    # copy so adding a column below never touches the caller's frame.
    subset_col = df[["ID", 'Gender', 'Race', col]].dropna().copy()
    if subset_col.empty:
        raise ValueError(
            "No usable text in column {!r}: every row is missing the response "
            "or one of ID/Gender/Race.".format(col))

    subset_col['processed_text'] = [process(string) for string in subset_col[col]]

    dtm = create_dtm(list_of_strings= subset_col['processed_text'],
                metadata =
                subset_col[["ID", 'Gender', 'Race']])

    # Term columns are everything except the leading former-index column and the
    # "metadata_"-prefixed columns. Matching on the prefix and position (rather
    # than the substring 'metadata' / the name 'index') keeps vocabulary terms
    # that merely contain "metadata" or equal "index".
    is_term_column = [
        not (name.startswith("metadata_") or (position == 0 and name == "index"))
        for position, name in enumerate(dtm.columns)
    ]
    topdtm = dtm.loc[:, is_term_column].sum(axis=0)

    text_raw_tokens = [wordpunct_tokenize(s)
                    for s in
                    subset_col['processed_text']]

    text_raw_dict = corpora.Dictionary(text_raw_tokens)

    corpus_fromdict = [text_raw_dict.doc2bow(s)
                       for s in text_raw_tokens]

    # Fit the LDA model (seeded for reproducibility)
    ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict,
                                    num_topics = num_topics, id2word=text_raw_dict,
                                    passes=passes, alpha = 'auto',
                                    per_word_topics = True, random_state = random_state)

    topics = ldamod.print_topics(num_words = num_words)

    list_to_return = [topics, topdtm.sort_values(ascending=False).head(num_words), ldamod, corpus_fromdict, text_raw_dict]
    return list_to_return
    
# Visualization Function
def visualization(ldamod, corpus, text):
    """Return the pyLDAvis notebook display for a fitted gensim LDA model.

    ``ldamod``, ``corpus`` and ``text`` are forwarded, in that order, to
    ``pyLDAvis.gensim_models.prepare``.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(gensimvis.prepare(ldamod, corpus, text))


In [7]:
# Topic model over all respondents; elements 2-4 of the result are the
# model, corpus and dictionary expected by visualization()
topics = top_words_topic_model(df=merged_short_answer, col='Advice')

visualization(*topics[2:5])


  default_term_info = default_term_info.sort_values(


In [8]:
# White female respondents
FW = top_words_topic_model(df=female_white, col='Advice')
visualization(*FW[2:5])


In [9]:
# White male respondents
MW = top_words_topic_model(df=male_white, col='Advice')
visualization(*MW[2:5])


In [10]:
# Non-white female respondents
FNW = top_words_topic_model(df=female_nonwhite, col='Advice')
visualization(*FNW[2:5])


In [11]:
# Non-white male respondents
MNW = top_words_topic_model(df=male_nonwhite, col='Advice')
visualization(*MNW[2:5])

In [12]:
# Rank tables of the most frequent stems, overall and per subgroup (for Overleaf)

def _topwords_table(top_counts, rank_col):
    """Turn a descending term-frequency Series into a table with the columns
    'Frequency', 'Keywords' (copy of the index) and ``rank_col`` (1 = most frequent)."""
    table = top_counts.to_frame(name='Frequency')
    table['Keywords'] = table.index
    table[rank_col] = np.arange(1, len(table) + 1)
    return table


def _rank_comparison(overall_table, group_tables):
    """Left-join each subgroup's rank onto the overall keyword list.

    Only 'Keywords' and the rank column are taken from every table, so the
    repeated 'Frequency' column never takes part in a merge. (Carrying it
    along produced duplicate 'Frequency_x'/'Frequency_y' columns, which pandas
    warns about and, from 2.0 on, rejects with a MergeError.)
    Keywords absent from a subgroup's top list are shown as "NR" (not ranked).
    """
    comparison = overall_table[['Keywords', 'Rank_Overall']].reset_index(drop=True)
    for rank_col, group_table in group_tables.items():
        comparison = comparison.merge(group_table[['Keywords', rank_col]],
                                      on='Keywords', how='left')
        # The left join leaves NaN where the subgroup has no rank for a keyword.
        comparison[rank_col] = comparison[rank_col].map(
            lambda rank: "NR" if pd.isna(rank) else int(rank))
    return comparison


merged_topwords = _topwords_table(topics[1], 'Rank_Overall')
FW_topwords = _topwords_table(FW[1], 'Rank_FW')
MW_topwords = _topwords_table(MW[1], 'Rank_MW')
FNW_topwords = _topwords_table(FNW[1], 'Rank_FNW')
MNW_topwords = _topwords_table(MNW[1], 'Rank_MNW')

df = _rank_comparison(
    merged_topwords,
    {
        'Rank_FW': FW_topwords,
        'Rank_MW': MW_topwords,
        'Rank_FNW': FNW_topwords,
        'Rank_MNW': MNW_topwords,
    },
)
df
#print(df.to_latex(index=False))

  return merge(


Unnamed: 0,Keywords,Rank_Overall,Rank_FW,Rank_MW,Rank_FNW,Rank_MNW
0,help,1,1,2,7,1
1,support,2,9,1,4,8
2,none,3,7,12,1,4
3,individu,4,NR,4,NR,2
4,answer,5,15,9,3,12
5,avail,6,18,5,NR,11
6,get,7,16,7,12,9
7,time,8,26,13,22,3
8,inform,9,2,20,6,13
9,parent,10,NR,8,19,10


In [13]:
# Hand-entered topic keyword tables for Overleaf, one per group
# (all respondents, white females, white males, non-white females).
# NOTE(review): the entries are typed in manually rather than read from the
# fitted models -- re-check them after any re-run.

# All respondents
all_dict = {
    'Topic 1': ['help', 'support', 'listen', 'individu', 'avail'],
    'Topic 2': ['none', 'answer', 'time', 'understand', 'crisi'],
    'Topic 3': ['inform', 'access', 'help/support', 'mental', 'resourc'],
}
all_df = pd.DataFrame(data=all_dict)

# White females
wf_dict = {
    'Topic 1': ['help', 'find', 'health', 'support', 'make'],
    'Topic 2': ['inform', 'system', 'peopl', 'medic', 'dysregul'],
    'Topic 3': ['none', 'medic', 'time', 'get', 'appoint'],
}
wf_df = pd.DataFrame(data=wf_dict)

# White males
wm_dict = {
    'Topic 1': ['help', 'avail', 'crisi', 'support', 'none'],
    'Topic 2': ['support', 'answer', 'individu', 'work', 'member'],
    'Topic 3': ['earli', 'like', 'peopl', 'time', 'coordin'],
}
wm_df = pd.DataFrame(data=wm_dict)

# Non-white females
nwf_dict = {
    'Topic 1': ['support', 'member', 'profession', 'know', 'discourag'],
    'Topic 2': ['help', 'behavior', 'make', 'listen', 'peopl'],
    'Topic 3': ['none', 'answer', 'infor', 'listen', 'mental'],
}
nwf_df = pd.DataFrame(data=nwf_dict)


In [14]:
# Example responses from white male respondents whose raw text contains "time"
# (substring match, so "times" and "timely" are included as well)
look = male_white[['ID', 'Advice']].dropna()
male_white_sub = look[look['Advice'].str.contains('time', na=False)]
male_white_sub


Unnamed: 0,ID,Advice
57,844088C,Take time to build rapport with him
88,150167,To be available at all times for the individual because she does not want to call 911. She wants a provider that can be reached.
146,8247381,"In my case, I have had a hard time finding a good coordinator to handle my case. I was unfortunate to have a bad SDRC coordinator but then was assigned a new one, Ursula. START, luckily Lubna Waraich came on, I feel like since my coordinators have changed, getting everyone and services I need have been more cohesive, since they are always in touch. I think that this is what people like us need, good providers, or coordinators that care about our family. Otherwise, I would be doing everything on my own, and after awhile that is tiresome, and it is not easy. People need support like these two, or you just want to give up."
264,432163,None at this time
333,48535,Need to be more timely
404,Y384729,There needs to be temporary respite providers to assist with helping in times of crisis. Parental support for respite.
408,406637,Couldn’t think of anything at the time
427,442401,Service providers should take the time to see that there could be more issues other than disability.
432,116600,to give/allow more time for treatment appointments
443,1275560,Understanding of time barriers for single parents raising a child who has a disability
