## Imports

In [8]:
import kaggle
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

#graphics
from mlxtend.plotting import plot_decision_regions  #visualize decisions
import matplotlib.pyplot as plt
from graphviz import Source
from IPython.display import SVG
from sklearn.tree import export_graphviz
import seaborn as sns
# Apply seaborn's default plot styling, then switch to its "colorblind" palette.
sns.set()
sns.set_palette(sns.color_palette("colorblind"))
# IPython magic selecting matplotlib's "notebook" backend.
# NOTE(review): presumably only supported by the classic Notebook front end -- confirm it
# works in the environment this notebook is run in.
%matplotlib notebook

# Lift pandas' display limits: None means every row and every column of a frame is shown.
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Authenticate the Kaggle API client before the dataset download in the next cell.
# NOTE(review): presumably reads credentials from ~/.kaggle/kaggle.json or the KAGGLE_*
# environment variables -- confirm; no credentials are stored in this notebook.
kaggle.api.authenticate()

In [9]:
# Shell escape: download the OSMI 2019 survey dataset with the Kaggle CLI and unzip it (--unzip).
!kaggle datasets download osmihelp/osmi-mental-health-in-tech-survey-2019 --unzip

Downloading osmi-mental-health-in-tech-survey-2019.zip to /home/tanglef/Documents/Universite/m2/datacamp
100%|███████████████████████████████████████| 93.9k/93.9k [00:00<00:00, 285kB/s]
100%|███████████████████████████████████████| 93.9k/93.9k [00:00<00:00, 284kB/s]


## Cleaning the dataset

In [10]:
# Short snake_case names for the survey questions (59 entries). preprocess_df assigns them
# positionally to the columns that remain after it drops the mostly-empty ones, so the order
# matters. The spelling quirks ("ressources", "essources", "reponse", "workid") are kept
# as-is because these exact names become the column labels used downstream.
col_names = ["self_employed", "nb_employees", "type_employer", "tech_role",
             "mh_coverage", "know_options_mh_cov", "talk_with_employer_mh_formally",
             "ressources_available", "anonymity", "ability_to_leave_bc_of_mh",
             "comfort_talk_mh_or_ph_coworkers", "comfort_talk_mh_supervisor",
             "talked_with_employer_your_mh", "comfort_coworkers_talk",
             "talked_with_coworkers_your_mh", "coworker_talked_someone_mh",
             "importance_ph_for_employer", "importance_mh_for_employer", "not_first_job",
             "previous_job_in_tech", "previous_job_mh_cov", "previous_job_know_options",
             "previous_job_talk_with_employer_mh_formally", "previous_job_essources_available",
             "previous_job_anonymity", "previous_job_more_comfort_employer_talk_mh_ph",
             "previous_job_talk_mh_supervisor", "previous_job_talked_with_employer_your_mh",
             "previous_job_comfort_coworkers_talk", "previous_job_talked_with_coworkers_your_mh",
             "previous_job_coworker_talked_someone_mh", "previous_job_importance_ph_for_employer",
             "previous_job_importance_mh_for_employer", "current_mh_disorder", "past_mh_disorder",
             "professional_treatment", "family_history_mh", "freq_interference_mh_work_treated",
             "freq_interference_mh_work_not_treated", "observations_prevented_you_reveal_your_mh",
             "share_with_friends_family_mh", "bring_up_your_ph_interview", "why",
             "bring_up_your_mh_interview", "why2", "openly_workid_with_your_mh_issue",
             "team_reponse_if_knew_your_mh_issue", "observed_unsupportive_response_to_mh_issue",
             "observed_supportive_reponse_to_mh_issue", "overall_tech_mh_support", "improvements",
             "talk_more", "age", "gender", "live_country", "live_state", "race", "work_country",
             "work_state"]

# Raw free-text gender answers that preprocess_df rewrites to the single label 'Male'.
# Matching is exact (case and trailing spaces included), hence the near-duplicates.
male = ["Male", 'male', 'm', 'M', 'Identify as male', 'Male ', 'Masculine', 'Cishet male',
        'Man', "Cis Male", 'masculino', 'Make', "CIS Male"]

# Raw free-text gender answers that preprocess_df rewrites to the single label 'Female'.
female = ['Female', 'female', 'f', 'F', 'woman',
          'cis woman', 'Female ', 'Female (cis)', "Woman"]

# Raw free-text gender answers that preprocess_df rewrites to 'Genderqueer/NonBinary/Trans'.
# NOTE(review): this list also holds "None" and "Let's keep it simple and say \"male\"" --
# confirm that grouping them here is intended.
genderqueer = ["Let's keep it simple and say \"male\"", "Non-binary", "Non binary",
               "None", "agender", "Questioning", "Agender trans woman", "Trans man",
               "Trans non-binary/genderfluid", "Non-binary and gender fluid"]

# Columns removed by preprocess_df. "self_employed" and "not_first_job" are first used there
# as row filters and only dropped afterwards.
drop_col = ["self_employed", "not_first_job", "why", "why2", "improvements"]

In [11]:
def preprocess_df(df):
    """Clean the raw survey frame and impute its missing answers.

    Steps, in order:
      1. drop every column in which fewer than half of the rows are filled in;
      2. rename the remaining columns with the module-level ``col_names``;
      3. remove self-employed respondents, respondents in their first job and
         the rows whose gender was entered as ``'43'``;
      4. collapse the free-text gender answers into three labels using the
         module-level ``male`` / ``female`` / ``genderqueer`` lists;
      5. drop the columns listed in the module-level ``drop_col``;
      6. impute missing values: column mean for float64/int64 columns, most
         frequent answer for every other column (after conversion to ``str``).

    The frame passed in is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey results as read from the CSV file.

    Returns
    -------
    df : pandas.DataFrame
        The cleaned and imputed frame.
    need_onehot : list of str
        Names of the non-numeric columns, i.e. those that still have to be
        one-hot encoded.

    Raises
    ------
    ValueError
        If the number of columns left after step 1 differs from
        ``len(col_names)``.
    """
    # remove columns with a majority of nan; without inplace=True this returns
    # a new frame, so the caller's frame is left as it was
    df = df.dropna(thresh=int(df.shape[0] / 2), axis=1)  # 82 -> 59 columns

    # change column names (checked first so that a mismatch is easy to diagnose)
    if df.shape[1] != len(col_names):
        raise ValueError(
            "Expected {} columns after dropping the mostly-empty ones, got {}. "
            "Check that `col_names` still holds the full list of survey column "
            "names and that the input is the unprocessed survey frame."
            .format(len(col_names), df.shape[1])
        )
    df.columns = col_names

    # remove self-employed, first jobs and the erroneous gender entry;
    # missing answers compare as False, so those rows are kept
    keep = (
        ~df["self_employed"].eq(True)
        & ~df["not_first_job"].eq(False)
        & ~df["gender"].eq("43")
    )
    # .copy() gives an independent frame that can be assigned to safely below
    df = df.loc[keep].copy()

    # gender categories: assign the result back instead of calling
    # replace(inplace=True) on a column selection, which newer pandas versions
    # warn about and which does not update the frame under Copy-on-Write
    gender = df["gender"]
    for answers, label in (
        (male, "Male"),
        (female, "Female"),
        (genderqueer, "Genderqueer/NonBinary/Trans"),
    ):
        gender = gender.replace(to_replace=answers, value=label)
    df["gender"] = gender

    # drop columns not needed
    df = df.drop(columns=drop_col)

    # replace nan values with mean or most answered
    need_onehot = []
    for col in df.columns:
        if df[col].dtype == 'float64' or df[col].dtype == 'int64':
            imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        else:
            need_onehot.append(col)
            # astype(str) turns missing values into the literal string 'nan',
            # which is what the imputer is told to treat as missing
            imputer = SimpleImputer(missing_values='nan', strategy='most_frequent')
            df[col] = df[col].astype(str)
        df[[col]] = imputer.fit_transform(df[[col]])

    return df, need_onehot

In [12]:
# Load the survey CSV fetched by the Kaggle download cell, report its size, then clean it.
# The freshly loaded frame gets its own name so the loading step and the cleaning step
# are easy to tell apart.
CSV_PATH = "OSMI 2019 Mental Health in Tech Survey Results - OSMI Mental Health in Tech Survey 2019.csv"
df_raw = pd.read_csv(CSV_PATH)
print("The original survey was on n={} people with p={} questions".format(df_raw.shape[0], df_raw.shape[1]))
df, need_onehot = preprocess_df(df_raw)
n, p = df.shape  # respondents and questions kept after cleaning
print("We have the results for n={} people over p={} questions.".format(n, p))
df.head()

The original survey was on n=352 peoplee with p=82 questions
We have the results for n=252 people over p=54 questions.


Unnamed: 0,nb_employees,type_employer,tech_role,mh_coverage,know_options_mh_cov,talk_with_employer_mh_formally,ressources_available,anonymity,ability_to_leave_bc_of_mh,comfort_talk_mh_or_ph_coworkers,comfort_talk_mh_supervisor,talked_with_employer_your_mh,comfort_coworkers_talk,talked_with_coworkers_your_mh,coworker_talked_someone_mh,importance_ph_for_employer,importance_mh_for_employer,previous_job_in_tech,previous_job_mh_cov,previous_job_know_options,previous_job_talk_with_employer_mh_formally,previous_job_essources_available,previous_job_anonymity,previous_job_more_comfort_employer_talk_mh_ph,previous_job_talk_mh_supervisor,previous_job_talked_with_employer_your_mh,previous_job_comfort_coworkers_talk,previous_job_talked_with_coworkers_your_mh,previous_job_coworker_talked_someone_mh,previous_job_importance_ph_for_employer,previous_job_importance_mh_for_employer,current_mh_disorder,past_mh_disorder,professional_treatment,family_history_mh,freq_interference_mh_work_treated,freq_interference_mh_work_not_treated,observations_prevented_you_reveal_your_mh,share_with_friends_family_mh,bring_up_your_ph_interview,bring_up_your_mh_interview,openly_workid_with_your_mh_issue,team_reponse_if_knew_your_mh_issue,observed_unsupportive_response_to_mh_issue,observed_supportive_reponse_to_mh_issue,overall_tech_mh_support,talk_more,age,gender,live_country,live_state,race,work_country,work_state
0,26-100,True,True,I don't know,No,Yes,Yes,I don't know,Very easy,Physical health,Yes,False,Yes,True,True,5.0,5.0,False,I don't know,N/A (was not aware),Some did,"Yes, they all did",I don't know,Physical health,Some of my previous supervisors,False,At some of my previous employers,False,True,8.0,5.0,Don't Know,No,False,No,Not applicable to me,Not applicable to me,No,8.0,Yes,Maybe,False,8.0,No,Maybe/Not sure,4.0,False,25.0,Male,United States of America,Nebraska,White,United States of America,Nebraska
1,26-100,True,True,Yes,No,No,Yes,Yes,I don't know,Physical health,Maybe,False,Maybe,True,True,5.0,7.0,False,"Yes, they all did",I was aware of some,Some did,Some did,"Yes, always",Physical health,Some of my previous supervisors,False,At some of my previous employers,True,True,8.0,5.0,Possibly,Possibly,False,Yes,Sometimes,Often,No,7.0,No,No,False,7.0,"Yes, I observed","Yes, I experienced",1.0,False,51.0,Male,United States of America,Nebraska,White,United States of America,Nebraska
2,26-100,True,True,I don't know,No,No,I don't know,I don't know,Somewhat difficult,Same level of comfort for each,No,False,Maybe,True,True,5.0,4.0,True,I don't know,N/A (was not aware),None did,None did,I don't know,Physical health,"No, none of my previous supervisors",False,"No, at none of my previous employers",False,False,10.0,2.0,No,No,False,I don't know,Not applicable to me,Not applicable to me,No,4.0,Maybe,No,False,4.0,Maybe/Not sure,"Yes, I observed",2.0,True,27.0,Male,United States of America,Illinois,White,United States of America,Illinois
3,100-500,True,True,I don't know,No,Yes,Yes,Yes,Very easy,Physical health,Yes,False,Maybe,False,True,1.0,3.0,True,I don't know,N/A (was not aware),Some did,Some did,"Yes, always",Physical health,Some of my previous supervisors,False,At some of my previous employers,False,True,3.0,1.0,No,No,False,Yes,Not applicable to me,Not applicable to me,No,3.0,No,No,False,8.0,No,"Yes, I observed",3.0,False,37.0,Male,United States of America,Nebraska,White,United States of America,Nebraska
4,26-100,True,True,I don't know,No,I don't know,I don't know,I don't know,I don't know,Physical health,No,False,No,False,False,4.0,4.0,True,I don't know,N/A (was not aware),None did,None did,I don't know,Physical health,"No, none of my previous supervisors",False,"No, at none of my previous employers",False,False,4.0,4.0,No,No,False,No,Not applicable to me,Not applicable to me,No,1.0,No,No,False,5.0,No,No,3.0,False,46.0,Male,United States of America,Nebraska,White,United States of America,Nebraska


In [13]:
# NOTE(review): this rebinds the module-level `col_names` (the full list of survey column
# names read by preprocess_df) to the columns of the cleaned frame. Re-running the cleaning
# cell after this one therefore needs the constants cell to be run again first.
col_names = df.columns


def colorize(sth):
    """Return ``sth`` as a string, highlighted when it is one of the column names.

    Column names are wrapped in ANSI escape codes so they stand out in the
    printed listing; any other value is returned as plain text, so the result
    can always be concatenated with other strings.
    """
    text = str(sth)
    if sth in col_names:
        return '\x1b[6;37;40m' + text + '\x1b[0m'
    return text


# Print every column name as a highlighted header followed by its distinct values.
for name in col_names:
    print("\n" + colorize(name) + "----------------------\n ==========================")
    print(df[name].unique())


[6;37;40mnb_employees[0m----------------------
['26-100' '100-500' '6-25' 'More than 1000' '500-1000' '1-5']

[6;37;40mtype_employer[0m----------------------
['True' 'False']

[6;37;40mtech_role[0m----------------------
['True' 'False']

[6;37;40mmh_coverage[0m----------------------
["I don't know" 'Yes' 'Not eligible for coverage / NA' 'No']

[6;37;40mknow_options_mh_cov[0m----------------------
['No' 'Yes']

[6;37;40mtalk_with_employer_mh_formally[0m----------------------
['Yes' 'No' "I don't know"]

[6;37;40mressources_available[0m----------------------
['Yes' "I don't know" 'No']

[6;37;40manonymity[0m----------------------
["I don't know" 'Yes' 'No']

[6;37;40mability_to_leave_bc_of_mh[0m----------------------
['Very easy' "I don't know" 'Somewhat difficult' 'Somewhat easy'
 'Neither easy nor difficult' 'Difficult']

[6;37;40mcomfort_talk_mh_or_ph_coworkers[0m----------------------
['Physical health' 'Same level of comfort for each' 'Mental health']

[6;37;40

In [14]:
# One-hot encode the non-numeric columns. drop_first=True removes one level per question to
# avoid redundancy. dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.0+ would otherwise produce booleans).
df_dumm = pd.get_dummies(df, drop_first=True, columns=need_onehot, dtype=int)
print("We have the results for n={} people over p={} questions in {} one-hot columns.".format(n, p, df_dumm.shape[1]))
df_dumm.head() # numeric columns are now the first ones !

We have the results for n=252 people over p=54 questions in 216 one-hot columns.


Unnamed: 0,importance_ph_for_employer,importance_mh_for_employer,previous_job_importance_ph_for_employer,previous_job_importance_mh_for_employer,share_with_friends_family_mh,team_reponse_if_knew_your_mh_issue,overall_tech_mh_support,age,nb_employees_100-500,nb_employees_26-100,nb_employees_500-1000,nb_employees_6-25,nb_employees_More than 1000,type_employer_True,tech_role_True,mh_coverage_No,mh_coverage_Not eligible for coverage / NA,mh_coverage_Yes,know_options_mh_cov_Yes,talk_with_employer_mh_formally_No,talk_with_employer_mh_formally_Yes,ressources_available_No,ressources_available_Yes,anonymity_No,anonymity_Yes,ability_to_leave_bc_of_mh_I don't know,ability_to_leave_bc_of_mh_Neither easy nor difficult,ability_to_leave_bc_of_mh_Somewhat difficult,ability_to_leave_bc_of_mh_Somewhat easy,ability_to_leave_bc_of_mh_Very easy,comfort_talk_mh_or_ph_coworkers_Physical health,comfort_talk_mh_or_ph_coworkers_Same level of comfort for each,comfort_talk_mh_supervisor_No,comfort_talk_mh_supervisor_Yes,talked_with_employer_your_mh_True,comfort_coworkers_talk_No,comfort_coworkers_talk_Yes,talked_with_coworkers_your_mh_True,coworker_talked_someone_mh_True,previous_job_in_tech_True,"previous_job_mh_cov_No, none did",previous_job_mh_cov_Some did,"previous_job_mh_cov_Yes, they all did",previous_job_know_options_N/A (none offered),previous_job_know_options_N/A (was not aware),"previous_job_know_options_No, I only became aware later","previous_job_know_options_Yes, I was aware of all of them",previous_job_talk_with_employer_mh_formally_None did,previous_job_talk_with_employer_mh_formally_Some did,"previous_job_talk_with_employer_mh_formally_Yes, they all did",previous_job_essources_available_Some did,"previous_job_essources_available_Yes, they all did",previous_job_anonymity_No,previous_job_anonymity_Sometimes,"previous_job_anonymity_Yes, always",previous_job_more_comfort_employer_talk_mh_ph_Physical health,previous_job_more_comfort_employer_talk_mh_ph_Same level of comfort for 
each,"previous_job_talk_mh_supervisor_No, none of my previous supervisors",previous_job_talk_mh_supervisor_Some of my previous supervisors,"previous_job_talk_mh_supervisor_Yes, all of my previous supervisors",previous_job_talked_with_employer_your_mh_True,"previous_job_comfort_coworkers_talk_No, at none of my previous employers","previous_job_comfort_coworkers_talk_Yes, at all of my previous employers",previous_job_talked_with_coworkers_your_mh_True,previous_job_coworker_talked_someone_mh_True,current_mh_disorder_No,current_mh_disorder_Possibly,current_mh_disorder_Yes,past_mh_disorder_No,past_mh_disorder_Possibly,past_mh_disorder_Yes,professional_treatment_True,family_history_mh_No,family_history_mh_Yes,freq_interference_mh_work_treated_Not applicable to me,freq_interference_mh_work_treated_Often,freq_interference_mh_work_treated_Rarely,freq_interference_mh_work_treated_Sometimes,freq_interference_mh_work_not_treated_Not applicable to me,freq_interference_mh_work_not_treated_Often,freq_interference_mh_work_not_treated_Rarely,freq_interference_mh_work_not_treated_Sometimes,observations_prevented_you_reveal_your_mh_No,observations_prevented_you_reveal_your_mh_Yes,bring_up_your_ph_interview_No,bring_up_your_ph_interview_Yes,bring_up_your_mh_interview_No,bring_up_your_mh_interview_Yes,openly_workid_with_your_mh_issue_True,observed_unsupportive_response_to_mh_issue_No,"observed_unsupportive_response_to_mh_issue_Yes, I experienced","observed_unsupportive_response_to_mh_issue_Yes, I observed",observed_supportive_reponse_to_mh_issue_No,"observed_supportive_reponse_to_mh_issue_Yes, I experienced","observed_supportive_reponse_to_mh_issue_Yes, I observed",talk_more_True,gender_Genderqueer/NonBinary/Trans,gender_Male,live_country_Canada,live_country_Croatia,live_country_Estonia,live_country_France,live_country_Germany,live_country_Greece,live_country_Hong 
Kong,live_country_India,live_country_Ireland,live_country_Israel,live_country_Japan,live_country_Mexico,live_country_Netherlands,live_country_New Zealand,live_country_Norway,live_country_Pakistan,live_country_Poland,live_country_Portugal,live_country_South Africa,live_country_Spain,live_country_Switzerland,live_country_Turkey,live_country_United Kingdom,live_country_United States of America,live_state_Arizona,live_state_California,live_state_Colorado,live_state_Connecticut,live_state_District of Columbia,live_state_Florida,live_state_Georgia,live_state_Illinois,live_state_Indiana,live_state_Kansas,live_state_Louisiana,live_state_Maryland,live_state_Massachusetts,live_state_Michigan,live_state_Minnesota,live_state_Missouri,live_state_Nebraska,live_state_New Jersey,live_state_New York,live_state_North Carolina,live_state_Ohio,live_state_Oregon,live_state_Pennsylvania,live_state_South Carolina,live_state_Tennessee,live_state_Texas,live_state_Utah,live_state_Vermont,live_state_Virginia,live_state_Washington,live_state_Wisconsin,live_state_Wyoming,race_Black or African American,race_Caucasian,race_Hispanic,race_I prefer not to answer,race_More than one of the above,race_White,work_country_Canada,work_country_Croatia,work_country_Estonia,work_country_France,work_country_Germany,work_country_Greece,work_country_Hong Kong,work_country_India,work_country_Ireland,work_country_Israel,work_country_Japan,work_country_Mexico,work_country_Netherlands,work_country_New Zealand,work_country_Norway,work_country_Other,work_country_Pakistan,work_country_Poland,work_country_Portugal,work_country_South Africa,work_country_Spain,work_country_Switzerland,work_country_Turkey,work_country_United Kingdom,work_country_United States of 
America,work_state_Arizona,work_state_California,work_state_Colorado,work_state_Connecticut,work_state_Florida,work_state_Georgia,work_state_Illinois,work_state_Indiana,work_state_Kansas,work_state_Louisiana,work_state_Maryland,work_state_Massachusetts,work_state_Michigan,work_state_Minnesota,work_state_Missouri,work_state_Nebraska,work_state_New Jersey,work_state_New York,work_state_North Carolina,work_state_Ohio,work_state_Oregon,work_state_Pennsylvania,work_state_South Carolina,work_state_Tennessee,work_state_Texas,work_state_Utah,work_state_Vermont,work_state_Virginia,work_state_Washington,work_state_Wisconsin,work_state_Wyoming
0,5.0,5.0,8.0,5.0,8.0,8.0,4.0,25.0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5.0,7.0,8.0,5.0,7.0,7.0,1.0,51.0,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5.0,4.0,10.0,2.0,4.0,4.0,2.0,27.0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.0,3.0,3.0,1.0,3.0,8.0,3.0,37.0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4.0,4.0,4.0,4.0,1.0,5.0,3.0,46.0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
