In [None]:
# https://noumenon-th.net/programming/2016/04/27/logisticregression/
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
%matplotlib inline

In [None]:
# Read the training and test passenger tables from the local input directory.
TRAIN_CSV = './input/train.csv'
TEST_CSV = './input/test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
train_df.describe()

In [None]:
# Same summary statistics for the test set, for comparison with the training set.
test_df.describe()

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Preview the first rows of the raw test data.
test_df.head()

In [None]:
# Remove the columns that are not used as model inputs (same list for both frames).
unused_columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# Preview the training frame to confirm which columns remain.
train_df.head()

In [None]:
# Preview the test frame to confirm which columns remain.
test_df.head()

## 欠損値の補完

In [None]:
# Fill missing ages in the training set with the rounded mean age of the
# passenger's sex.
age_train_mean = train_df.groupby('Sex').Age.mean()

# Look up the rounded group mean for every row via its Sex value, then fill
# only the rows whose Age is missing.  The result is assigned back to the
# column rather than calling fillna(inplace=True) on train_df.Age: that
# chained in-place form operates on a temporary Series under pandas
# Copy-on-Write and would leave train_df unchanged.
# Rows whose Sex is neither 'male' nor 'female' get no fill value and keep NaN.
train_df['Age'] = train_df['Age'].fillna(train_df['Sex'].map(age_train_mean.round()))


In [None]:
# Fill missing ages in the test set with the rounded mean age of the
# passenger's sex, computed from the test set itself.
age_test_mean = test_df.groupby('Sex').Age.mean()

# Vectorized lookup of the rounded group mean per row, assigned back to the
# column.  Calling fillna(inplace=True) on test_df.Age would act on a
# temporary Series under pandas Copy-on-Write and leave test_df unchanged.
# Rows whose Sex is neither 'male' nor 'female' get no fill value and keep NaN.
test_df['Age'] = test_df['Age'].fillna(test_df['Sex'].map(age_test_mean.round()))

## ダミー変数

In [None]:
# Cross-tabulation: passenger counts for every Sex / Survived combination.
sex_ct = pd.crosstab(index=train_df['Sex'], columns=train_df['Survived'])
sex_ct

In [None]:
# Add a Female indicator column to both frames: 'female' -> 1, 'male' -> 0.
sex_to_flag = {'male': 0, 'female': 1}
for frame in (train_df, test_df):
    frame['Female'] = frame['Sex'].map(sex_to_flag).astype(int)

train_df.head()

In [None]:
# Cross-tabulation: passenger counts for every Pclass / Survived combination.
pclass_ct = pd.crosstab(index=train_df['Pclass'], columns=train_df['Survived'])
pclass_ct

In [None]:
# One-hot encode Pclass into Class_<value> indicator columns for both frames.
pclass_train_df, pclass_test_df = (
    pd.get_dummies(frame['Pclass'], prefix='Class')
    for frame in (train_df, test_df)
)

pclass_train_df.head()

In [None]:
# Drop the Class_3 indicator (it becomes the implicit baseline) and attach the
# remaining Class_1 / Class_2 columns to each frame.
pclass_train_df = pclass_train_df.drop(columns=['Class_3'])
train_df = train_df.join(pclass_train_df)

pclass_test_df = pclass_test_df.drop(columns=['Class_3'])
test_df = test_df.join(pclass_test_df)

In [None]:
# Training data: preview the frame including the newly added feature columns.
train_df.head()

In [None]:
# Test data: preview the frame including the newly added feature columns.
test_df.head()

## モデル生成と予測

In [None]:
# Feature matrix: every column except the identifier, the target, and the raw
# Pclass / Sex columns (already represented by the indicator columns).
non_feature_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex']
X = train_df.drop(columns=non_feature_columns)

# Target vector.
y = train_df['Survived']

# Create a logistic-regression classifier with default settings ...
clf = LogisticRegression()

# ... and fit it on the training data.
clf.fit(X, y)

In [None]:
# Mean accuracy of the fitted model on the training data it was fitted on.
clf.score(X,y)

# Disabled alternative that computes the same accuracy in two explicit steps.
# Step 1: survival labels predicted by the model for the training rows.
#predict_y = clf.predict(X)
 
# Step 2: fraction of predictions that match the actual labels.
#accuracy_score(y, predict_y)

In [None]:
# Table of each feature name next to its fitted coefficient.
# Building the frame from a dict keeps the coefficient column numeric and
# gives both columns descriptive labels; transposing a two-row frame of mixed
# values would instead leave everything as dtype object under the integer
# headers 0 and 1.
coeff_df = DataFrame({'feature': X.columns, 'coefficient': clf.coef_[0]})
coeff_df

In [None]:
# Predict survival for the test passengers.
# The feature matrix is selected with the training column list so it always
# contains exactly the columns the model was fitted on, in the same order;
# a missing feature raises a KeyError here instead of a shape or feature-name
# mismatch inside predict().
X1 = test_df[X.columns]
test_predict = clf.predict(X1)

In [None]:
# Pair every test PassengerId with its predicted label, write the table to
# disk without the index, and preview the first rows.
submission = test_df[['PassengerId']].assign(Survived=test_predict)
submission.to_csv('submission.csv', index=False)
submission.head()