In [138]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

In [139]:
df = pd.read_csv('./4_1_kagle_dataset/3_survey/1_data/survey.csv')

In [140]:
#データの確認
pd.set_option('display.max_columns', 50)
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [141]:
#欠損値の確認
#null率を調べる
def nullInvestigation(df):
    for i in df.columns:
        if df[i].isnull().sum() > 0:
            print("--- " + i +" ---")
            print("NULL数:" + str(df[i].isnull().sum()) 
                  +"　　　　NULL率:" + str("{:.3}".format((df[i].isnull().sum()/len(df)).round(3)*100)) + "%" 
                  +"    データの種類数:" + str(df[i].value_counts().count()))
            print("")
nullInvestigation(df)

--- state ---
NULL数:515　　　　NULL率:40.9%    データの種類数:45

--- self_employed ---
NULL数:18　　　　NULL率:1.4%    データの種類数:2

--- work_interfere ---
NULL数:264　　　　NULL率:21.0%    データの種類数:4

--- comments ---
NULL数:1095　　　　NULL率:87.0%    データの種類数:160



In [142]:
#Timestamp,Country,state,commentsは関係ないと仮定して削除
df = df.drop(["Timestamp","Country","state","comments"], axis=1)

In [143]:
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Create lists by data tpe
intFeatures = ['Age']
stringFeatures = ['Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere',
                 'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',
                 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',
                 'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',
                 'seek_help']
floatFeatures = []

# Clean the NaN's
for feature in df:
    if feature in intFeatures:
        df[feature] = df[feature].fillna(defaultInt)
    elif feature in stringFeatures:
        df[feature] = df[feature].fillna(defaultString)
    elif feature in floatFeatures:
        df[feature] = df[feature].fillna(defaultFloat)
    else:
        print('Error: Feature %s not recognized.' % feature)
df.head(5)

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [144]:
#Genderを小文字に統一しユニーク値を確認する
gender = df['Gender'].str.lower()
print(gender.unique())

['female' 'm' 'male' 'male-ish' 'maile' 'trans-female' 'cis female' 'f'
 'something kinda male?' 'cis male' 'woman' 'mal' 'male (cis)'
 'queer/she/they' 'non-binary' 'femake' 'make' 'nah' 'all' 'enby' 'fluid'
 'genderqueer' 'female ' 'androgyne' 'agender' 'cis-female/femme'
 'guy (-ish) ^_^' 'male leaning androgynous' 'male ' 'man' 'trans woman'
 'msle' 'neuter' 'female (trans)' 'queer' 'female (cis)' 'mail'
 'a little about you' 'malr' 'p' 'femail' 'cis man'
 'ostensibly male, unsure what that really means']


In [145]:
male_str = ['m','male','male-ish','maile','cis male','mal','male (cis)','make','guy (-ish) ^_^','male ','man','msle','mail','malr','cis man']
female_str = ['f','female','cis female','woman','femake','female','cis-female/femme','female (cis)','femail',]
for (row, col) in df.iterrows():
    if str.lower(col.Gender) in male_str:
        df['Gender'].replace(to_replace=col.Gender, value='m', inplace=True)
    elif str.lower(col.Gender) in female_str:
        df['Gender'].replace(to_replace=col.Gender, value='f', inplace=True)
    else:
        df['Gender'].replace(to_replace=col.Gender, value='unknown', inplace=True)

In [146]:
df['Gender'].unique()

array(['f', 'm', 'unknown'], dtype=object)

In [147]:
df['self_employed'] = df['self_employed'].replace([defaultString], 'No')
print(df['self_employed'].unique())
df['work_interfere'] = df['work_interfere'].replace([defaultString], 'Don\'t know' )
print(df['work_interfere'].unique())

['No' 'Yes']
['Often' 'Rarely' 'Never' 'Sometimes' "Don't know"]


In [148]:
#LabelEncoder
for feature in df:
    le = preprocessing.LabelEncoder()
    le.fit(df[feature])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    df[feature] = le.transform(df[feature])

df.head()
# 元に戻すときは、
#for feature in df:
#    le = preprocessing.LabelEncoder()
#    le.inverse_trainsform(df[feature])

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,25,0,0,0,1,2,4,0,1,2,1,1,2,2,2,1,1,1,2,1,0,2,0
1,32,1,0,0,0,3,5,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
2,20,1,0,0,0,3,4,0,1,1,0,1,1,0,1,1,1,2,2,2,2,1,0
3,19,1,0,1,1,2,2,0,1,1,2,1,1,1,1,2,2,1,0,0,0,1,1
4,19,1,0,0,0,1,1,1,1,2,0,0,0,0,0,1,1,1,2,2,2,0,0


In [149]:
df.corr().style.background_gradient().format('{:.2f}')

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
Age,1.0,0.06,0.07,0.01,0.08,0.04,0.03,0.14,-0.05,0.15,0.11,0.1,0.12,0.02,-0.02,0.02,-0.05,-0.02,-0.01,0.06,-0.02,-0.02,0.07
Gender,0.06,1.0,0.05,-0.12,-0.15,-0.09,0.02,0.0,0.07,-0.09,-0.09,0.01,-0.0,-0.02,0.05,0.04,0.04,0.06,0.07,-0.03,-0.0,-0.0,-0.03
self_employed,0.07,0.05,1.0,0.01,0.02,0.04,-0.34,0.32,0.08,-0.05,0.05,0.01,0.04,0.11,0.18,0.03,0.03,0.08,0.04,-0.01,-0.02,0.14,0.08
family_history,0.01,-0.12,0.01,1.0,0.38,0.32,-0.05,0.01,-0.05,0.13,0.11,0.07,0.05,0.06,0.02,0.03,0.0,-0.0,0.0,0.04,0.04,0.04,0.12
treatment,0.08,-0.15,0.02,0.38,1.0,0.61,-0.05,0.03,-0.03,0.23,0.24,0.09,0.09,0.14,0.06,0.03,-0.01,0.07,-0.04,0.1,0.05,0.06,0.16
work_interfere,0.04,-0.09,0.04,0.32,0.61,1.0,-0.06,0.03,0.01,0.13,0.16,0.09,0.09,0.06,0.05,0.06,-0.02,0.0,-0.1,0.1,-0.02,0.05,0.13
no_employees,0.03,0.02,-0.34,-0.05,-0.05,-0.06,1.0,-0.21,-0.11,0.12,-0.01,0.09,0.06,-0.01,-0.1,-0.01,-0.08,-0.09,-0.05,0.01,0.03,-0.03,-0.02
remote_work,0.14,0.0,0.32,0.01,0.03,0.03,-0.21,1.0,0.13,-0.06,0.01,-0.07,-0.03,-0.0,0.1,0.05,-0.01,0.08,0.03,-0.03,-0.01,0.04,-0.04
tech_company,-0.05,0.07,0.08,-0.05,-0.03,0.01,-0.11,0.13,1.0,-0.05,-0.03,-0.12,-0.07,-0.05,0.05,0.0,0.07,0.08,0.05,-0.04,-0.03,0.03,-0.06
benefits,0.15,-0.09,-0.05,0.13,0.23,0.13,0.12,-0.06,-0.05,1.0,0.44,0.32,0.38,0.34,0.07,-0.01,-0.03,-0.01,0.03,0.04,0.03,0.14,0.07


In [150]:
feature_cols = ['Age', 'Gender', 'self_employed','family_history', 'benefits', 'care_options', 'anonymity', 'leave', 'work_interfere']
X = df[feature_cols]
y = df.treatment

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [152]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [153]:
#精度
print (lr.score(X_test, y_test))

0.791005291005291


In [154]:
print (lr.intercept_)
print (lr.coef_)

[-2.76795059]
[[ 0.01292961 -0.58677654 -0.10984497  1.17946346  0.17261403  0.24035071
   0.08653148  0.01082936  0.90808718]]


In [155]:
w_0 = lr.intercept_[0]
w_1 = lr.coef_[0,0]
w_2 = lr.coef_[0,1]

In [156]:
#plt.plot([-2,2], map(lambda x: (-w_1 * x - w_0)/w_2, [-2,2]))

In [157]:
y_pred = lr.predict(X)
y_pred

array([1, 0, 0, ..., 1, 0, 1])

In [158]:
param_grid = {'alpha':[0.00000001,0.05,0.06,0.07,0.09,0.1]}
cv = GridSearchCV(Lasso(),param_grid=param_grid,cv=10)
cv.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-08, 0.05, 0.06, 0.07, 0.09, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [159]:
cv.best_params_

{'alpha': 1e-08}

In [160]:
cv.best_estimator_

Lasso(alpha=1e-08, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [161]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = cv.best_estimator_.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("MSE=%s"%round(mse,3) )
print("RMSE=%s"%round(np.sqrt(mse), 3) )
print("MAE=%s"%round(mae,3) )

MSE=0.147
RMSE=0.384
MAE=0.297


In [162]:
cv.grid_scores_



[mean: 0.43121, std: 0.07693, params: {'alpha': 1e-08},
 mean: 0.38159, std: 0.06275, params: {'alpha': 0.05},
 mean: 0.37928, std: 0.06187, params: {'alpha': 0.06},
 mean: 0.37732, std: 0.06069, params: {'alpha': 0.07},
 mean: 0.37209, std: 0.05830, params: {'alpha': 0.09},
 mean: 0.36899, std: 0.05713, params: {'alpha': 0.1}]

|値|説明文|データ型|
|:--|--:|:--:|
|Timestamp|アンケートの提出時間|Numeric|
|Age|回答者の年齢|Numeric|
|Gender|被告人の性別|String|
|Country|被告国|String|
|state|あなたがアメリカ合衆国に住んでいる場合、あなたはどの州または地域に住んでいますか？|String|
|self_employed|あなたは自営業ですか？|String|
|family_history|精神病の家族歴はありますか？|String|
|treatment|あなたは精神的健康状態の治療を求めましたか？
|work_interfere|あなたが精神的健康状態にある場合、それはあなたの仕事を妨害すると感じますか？
|no_employees|あなたの会社または組織の従業員は何人ですか？|String|
|remote_work|時間の少なくとも50％はリモートで（オフィスの外で）働いていますか？|String|
|tech_company|雇用主は主にハイテク企業/組織ですか？|String|
|benefits|あなたの雇用主は精神保健給付を提供していますか？|String|
|care_options|あなたの雇用主が提供するメンタルヘルスケアのオプションを知っていますか？
|wellness_program|あなたの雇用主は従業員ウェルネスプログラムの一環としてメンタルヘルスについて議論しましたか？
|seek_help|あなたの雇用主は、メンタルヘルスの問題と、助けを求める方法についてもっと学ぶためのリソースを提供していますか？
|anonymity|メンタルヘルスや物質乱用の治療リソースを利用することを選択した場合、あなたの匿名性は保護されますか？
|leave|あなたが精神的健康状態のために休暇を取ることはどれくらい簡単ですか？|String|
|mental_health_consequence|あなたの雇用主との精神衛生問題について議論することは、否定的な結果をもたらすと思いますか？
|phys_health_consequence|あなたの雇用主との健康上の問題について議論することは、否定的な結果につながると思いますか？
|coworkers|あなたの同僚との精神衛生上の問題について話し合いたいですか？|String|
|supervisor|あなたの直属の上司と精神衛生上の問題について話し合うことはできますか？|String|
|mental_health_interview|インタビューで潜在的な雇用主との精神衛生上の問題を提起しますか？
|phys_health_interview|インタビューで潜在的な雇用主との身体的健康問題を提起しますか？
|mental_vs_physical|あなたの雇用主が精神的健康を肉体的健康と同じくらい重視していると感じますか？
|obs_consequence|あなたの職場で精神的健康状態の同僚のために否定的な結果が聞かれましたか、または観察されましたか？
|comments|追加の注釈またはコメント|String|