## Editing the dataset

The dataset is edited as follows in order to reduce redundancy and normalize the data.

1. Remove all rows with country not United States

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw CSV into a DataFrame and display it
dataset_path = 'dataset.csv'
data = pd.read_csv(dataset_path)
data

FileNotFoundError: File b'dataset.csv' does not exist

In [None]:
# Keep only respondents from the United States. The filter has to run while
# the 'country' column is still present; both spellings are accepted because
# the notebook text uses both. Rows with a missing country are dropped here
# too (NaN never matches), just as dropna() removed them before.
us_labels = ['United States of America', 'United States']
data = data[data['country'].isin(us_labels)]

# Removing columns not used in the analysis; 'country' is constant after the
# filter above, so it is dropped together with them
unused_columns = ['self_employed', 'no_employees', 'tech_company', 'age',
                  'gender', 'remote_work', 'phys_health_consequence']
data = data.drop(columns=unused_columns + ['country'])

# Removing rows with NaN values in any remaining column
data = data.dropna()

data

In [None]:
# Shuffle the rows (index labels are kept) with a fixed seed so that the
# rows picked per state, and everything derived from them, are the same on
# every run of the notebook.
SHUFFLE_SEED = 42
data_shuffled = data.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Code to normalize the data such that number of values from each state is the same
# Loops over the dataframe to find the minimum number of values from any state

In [None]:
# Tally how many rows each state contributes to the shuffled frame
no_values_states = {}
for state in data_shuffled['state']:
    no_values_states[state] = no_values_states.get(state, 0) + 1

# Smallest number of rows contributed by any single state
min_values = min(no_values_states.values())

print(no_values_states)
print(min_values)

In [None]:
# Keep only the states with at least 10 rows; every kept state is mapped to 10
select_states = {
    name: 10
    for name, count in no_values_states.items()
    if count >= 10
}

print(select_states)

In [None]:
# Walks the shuffled rows and collects the index labels to discard: every row
# whose state is not in select_states, plus every row of a selected state
# once that state's quota has been used up.
# The quotas are counted down on a copy, so select_states is left intact and
# re-running this cell produces the same delete_index instead of marking
# every row for deletion.
remaining_quota = dict(select_states)
delete_index = []
for index, state in data_shuffled['state'].items():
    if remaining_quota.get(state, 0) > 0:
        # Row is kept; one fewer slot left for this state
        remaining_quota[state] -= 1
    else:
        delete_index.append(index)

print(delete_index)

In [None]:
# Remove the rows whose index labels were collected in delete_index
data_normal = data_shuffled.drop(index=delete_index)

## Data Visualization

1. Bar Graph
This is a stacked bar graph. There are three bars for every state

2. Scatter Plot
Three scatter plots relating the number of people with a mental disorder to the factors `benefits`, `wellness_program`, and `seek_help`.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-state tallies of the 'mental_disorder_current' answers for the stacked
# representation: one dictionary counts 'Yes', the other counts 'No'.
mental_disorder_yes = {}
mental_disorder_no = {}

# Route each answer to its dictionary; any other answer is not counted
tally_for_answer = {'Yes': mental_disorder_yes, 'No': mental_disorder_no}

for state, answer in zip(data['state'], data['mental_disorder_current']):
    tally = tally_for_answer.get(answer)
    if tally is not None:
        tally[state] = tally.get(state, 0) + 1

print(mental_disorder_yes)
print(mental_disorder_no)

In [None]:
# state -> [number of 'Yes' answers, number of 'No' answers].
# A state gets an entry as soon as it is seen, even if none of its answers
# is 'Yes' or 'No'.
mental_disorder = {}

for state, answer in zip(data['state'], data['mental_disorder_current']):
    counts = mental_disorder.setdefault(state, [0, 0])
    if answer == 'Yes':
        counts[0] += 1
    elif answer == 'No':
        counts[1] += 1

# Parallel lists, in the dictionary's insertion order, used for plotting
state_list = list(mental_disorder)
yes_list = [pair[0] for pair in mental_disorder.values()]
no_list = [pair[1] for pair in mental_disorder.values()]



In [None]:
import matplotlib.pyplot as plt

# Stacked bar chart: one bar per state, 'Yes' counts at the bottom and 'No'
# counts stacked on top of them.
num_bars = len(mental_disorder)

ind = np.arange(num_bars)
width = 0.3

# The figure must be created (and sized) BEFORE drawing. Calling plt.figure()
# after the bars are drawn opens a second, empty figure and leaves the chart
# at its default size. The width grows with the number of states so the tick
# labels stay legible.
fig, ax = plt.subplots(figsize=(max(10, 0.4 * num_bars), 6))

ax.bar(ind, yes_list, width, label='Yes')
ax.bar(ind, no_list, width, bottom=yes_list, label='No')

ax.set_title('Number of people suffering from mental disorder per state')
ax.set_ylabel('Mental disorders')
ax.set_xticks(ind)
# Vertical labels keep long state names from overlapping
ax.set_xticklabels(state_list, rotation=90)
ax.set_yticks(np.arange(0, 101, 10))
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Stacked-bar demo on fixed sample data: five groups, men's scores at the
# bottom and women's scores stacked on top, each series with error bars.
group_labels = ('G1', 'G2', 'G3', 'G4', 'G5')
N = len(group_labels)

men_scores, men_err = (20, 35, 30, 35, 27), (2, 3, 4, 1, 2)
women_scores, women_err = (25, 32, 34, 20, 25), (3, 5, 2, 3, 3)

ind = np.arange(N)  # x position of each group
width = 0.35        # bar width

p1 = plt.bar(ind, men_scores, width, yerr=men_err)
p2 = plt.bar(ind, women_scores, width, bottom=men_scores, yerr=women_err)

plt.title('Scores by group and gender')
plt.ylabel('Scores')
plt.xticks(ind, group_labels)
plt.yticks(np.arange(0, 81, 10))
plt.legend((p1[0], p2[0]), ('Men', 'Women'))

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Stacked-bar demo on fixed sample data: the chart is built with matplotlib
# on an explicit figure/axes pair, then handed to `tls.mpl_to_plotly` and
# shown with `py.iplot`.
# NOTE(review): `tls` and `py` are not imported anywhere in the visible
# notebook, so this cell raises NameError as written. Presumably they are
# plotly's tools / plotting modules -- confirm and add the import.
mpl_fig = plt.figure()
ax = mpl_fig.add_subplot(111)

N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
# NOTE: menStd / womenStd are defined but not used by any call in this cell
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

# Men's bars at the bottom, women's bars stacked on top via `bottom=`
p1 = ax.bar(ind, menMeans, width, color=(0.2588,0.4433,1.0))
p2 = ax.bar(ind, womenMeans, width, color=(1.0,0.5,0.62),
             bottom=menMeans)
ax.set_ylabel('Scores')
ax.set_xlabel('Groups')
ax.set_title('Scores by group and gender')

# x ticks are placed half a bar width to the right of each bar position
ax.set_xticks(ind + width/2.)
ax.set_yticks(np.arange(0, 81, 10))
ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))

# Hand the matplotlib figure to `tls.mpl_to_plotly` (see NOTE at the top)
plotly_fig = tls.mpl_to_plotly( mpl_fig )

# For Legend
# Turn the legend on and name the first two traces
plotly_fig["layout"]["showlegend"] = True
plotly_fig["data"][0]["name"] = "Men"
plotly_fig["data"][1]["name"] = "Women"
py.iplot(plotly_fig, filename='stacked-bar-chart')

In [None]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Columns taken from the balanced dataset; the target is predicted from all
# of the others.
target_column = 'mental_health_consequence'
model_columns = ['benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'coworkers', 'supervisor', 'mental_vs_physical', 'obs_consequence', 'family_history', 'mental_disorder_past', 'mental_disorder_current', 'mental_health_diagnosis', 'work_interfere', 'state']
feature_columns = [name for name in model_columns if name != target_column]

# A DataFrame is not callable: columns are selected with [[...]]. The copy
# keeps the encoding below from writing into data_normal.
inc_data = data_normal[model_columns].copy()

# convert all features to categorical integer values
# (the encoder is re-fitted for every column)
enc = LabelEncoder()
for column in inc_data.columns:
    inc_data[column] = enc.fit_transform(inc_data[column])

# target is stored in y
y = inc_data[target_column]

# X contains all other features, which we will use to predict target
X = inc_data[feature_columns]

# train/test split; a fixed random_state makes the reported accuracy
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, y_train)

# make predictions on test set
pred_logit = logit.predict(X_test)

# measure accuracy (last expression, so the notebook displays it)
accuracy_score(y_true=y_test, y_pred=pred_logit)