ダミー変数に多重共線性が発生しているので、変数の1つを取り除いて分析する

In [2]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame
from sklearn.linear_model import LogisticRegression
# 可視化モジュール
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 小数第３まで表示
%precision 3

'%.3f'

In [3]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Pclassによる生存率の違いを調べる

In [7]:
Pclass_groupby = pd.concat([df_train.groupby('Pclass')['Survived'].sum()/df_train.groupby('Pclass')['Survived'].count(),
                           df_train.groupby('Pclass')['Survived'].count()],axis = 1)
Pclass_groupby.columns = ['Survived_rate', 'num_of_passenger']
Pclass_groupby

Unnamed: 0_level_0,Survived_rate,num_of_passenger
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.62963,216
2,0.472826,184
3,0.242363,491


使わない項目を落とす

In [8]:
df_train =  df_train.drop(["Name","Ticket","Cabin"],axis = 1)

In [9]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


ダミー変数を作成し、元のデータに結合する

In [10]:
dummy_train = pd.get_dummies(df_train[['Sex','Embarked']])

In [11]:
df_train_dummy = pd.concat([df_train.drop(['Sex','Embarked'],axis = 1),dummy_train],axis =1)
df_train_dummy

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.0,0,0,8.0500,0,1,0,0,1
5,6,0,3,,0,0,8.4583,0,1,0,1,0
6,7,0,1,54.0,0,0,51.8625,0,1,0,0,1
7,8,0,3,2.0,3,1,21.0750,0,1,0,0,1
8,9,1,3,27.0,0,2,11.1333,1,0,0,0,1
9,10,1,2,14.0,1,0,30.0708,1,0,1,0,0


ダミー変数の1つを取り除く

In [13]:
df_train_dummy =  df_train_dummy.drop(["Sex_male","Embarked_S"],axis = 1)
df_train_dummy

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q
0,1,0,3,22.0,1,0,7.2500,0,0,0
1,2,1,1,38.0,1,0,71.2833,1,1,0
2,3,1,3,26.0,0,0,7.9250,1,0,0
3,4,1,1,35.0,1,0,53.1000,1,0,0
4,5,0,3,35.0,0,0,8.0500,0,0,0
5,6,0,3,,0,0,8.4583,0,0,1
6,7,0,1,54.0,0,0,51.8625,0,0,0
7,8,0,3,2.0,3,1,21.0750,0,0,0
8,9,1,3,27.0,0,2,11.1333,1,0,0
9,10,1,2,14.0,1,0,30.0708,1,1,0


欠損値の加工

In [14]:
df_train_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Sex_female     891 non-null uint8
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
dtypes: float64(2), int64(5), uint8(3)
memory usage: 51.4 KB


In [15]:
df_train_dummy['Age'] = df_train_dummy['Age'].fillna(df_train_dummy['Age'].mean())

In [16]:
df_train_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Sex_female     891 non-null uint8
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
dtypes: float64(2), int64(5), uint8(3)
memory usage: 51.4 KB


テストデータについても同様に加工

In [17]:
df_test =  df_test.drop(["Name","Ticket","Cabin"],axis = 1)
dummy_test = pd.get_dummies(df_test[['Sex','Embarked']])
df_test_dummy = pd.concat([df_test.drop(['Sex','Embarked'],axis = 1),dummy_test],axis =1)
df_test_dummy['Age'] = df_test_dummy['Age'].fillna(df_test_dummy['Age'].mean())
df_test_dummy['Fare'] = df_test_dummy['Fare'].fillna(df_test_dummy['Fare'].mean())
df_test_dummy = df_test_dummy.drop('PassengerId',axis = 1)
df_test_dummy =  df_test_dummy.drop(["Sex_male","Embarked_S"],axis = 1)

モデル作成

In [18]:
X_train = df_train_dummy.drop(['Survived','PassengerId'],axis = 1)
y_train = df_train_dummy.Survived

log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

モデル評価

In [19]:
bench_mark = df_train_dummy.Survived.sum()/df_train_dummy.Survived.count()

print("bench_mark :",bench_mark)
print("training score :",log_reg.score(X_train,y_train))

bench_mark : 0.3838383838383838
training score : 0.8013468013468014


In [20]:
df_test_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass        418 non-null int64
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          418 non-null float64
Sex_female    418 non-null uint8
Embarked_C    418 non-null uint8
Embarked_Q    418 non-null uint8
dtypes: float64(2), int64(3), uint8(3)
memory usage: 17.6 KB


In [21]:
survived_predict = log_reg.predict(df_test_dummy)
survived_predict[:5]

array([0, 0, 0, 0, 1], dtype=int64)

In [22]:
df_survived_predict = pd.DataFrame({'Survived':survived_predict})

In [23]:
submission_file = pd.concat([df_test.PassengerId, df_survived_predict.Survived],axis=1)
submission_file.columns = ['PassengerId','Survived']
submission_file.to_csv('submission.csv',index=False)

In [24]:
submission_file

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [25]:
feature_coef = pd.concat([pd.DataFrame(X_train.columns), pd.DataFrame(log_reg.coef_[0, :])], axis=1)
feature_coef.columns = ['feature name', 'coefficient']
feature_coef['abs_coefficient'] = abs(feature_coef.coefficient)
feature_coef.sort_values(by='abs_coefficient', ascending=False).drop('abs_coefficient', axis=1)

Unnamed: 0,feature name,coefficient
5,Sex_female,2.616173
0,Pclass,-0.968746
6,Embarked_C,0.411649
7,Embarked_Q,0.321381
2,SibSp,-0.307715
3,Parch,-0.088644
1,Age,-0.034265
4,Fare,0.003022
