In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition files: the training set, the test set, and the
# provided sample submission (gender_submission.csv).
train_df, test_df, sample_sub = map(
    pd.read_csv,
    (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    ),
)

In [None]:
def display_pd(data, name='Train_df'):
    """Print a quick overview of a DataFrame: shape, column dtypes, and head.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.
    name : str, default 'Train_df'
        Label printed in front of the shape. The default reproduces the
        previously hard-coded label, so existing calls print exactly what
        they did before; pass another label when summarising a different frame.
    """
    print(f'{name}_shape : {data.shape}\n')
    print(f'{data.dtypes} \n')
    # Rich notebook rendering of the first rows.
    display(data.head())

In [None]:
# Quick overview of the training data: shape, dtypes, and the first rows.
display_pd(data=train_df)

In [None]:
# Write the provided sample submission unchanged as a baseline file; a later
# cell overwrites submission.csv with the model's predictions.
sample_sub.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# PassengerId and Pclass are stored as numbers but are not quantities, so cast
# them to str; this keeps them out of the numeric summary displayed below.
non_numeric_cols = ('PassengerId', 'Pclass')
train_df = train_df.astype(dict.fromkeys(non_numeric_cols, str))

display(train_df.describe())

In [None]:
# Summarise the non-numeric (categorical) columns.
categorical_summary = train_df.describe(exclude='number')
display(categorical_summary)

In [None]:
def analysis_pd(data):
    """Cast the ID-like columns to str and display two summary tables.

    ``PassengerId`` and ``Pclass`` are converted to ``str`` on a new frame
    (the caller's ``data`` object is not modified). A ``describe()`` table is
    then displayed for the numeric columns, followed by one for the
    non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        The converted frame, so the caller can keep the new dtypes.
    """
    str_columns = ('PassengerId', 'Pclass')
    converted = data.astype({column: str for column in str_columns})

    # (heading, keyword arguments for describe) for each table, in display order.
    sections = (
        ('--statistics--', {}),
        ('--categorical--', {'exclude': 'number'}),
    )
    for heading, describe_kwargs in sections:
        print(heading)
        display(converted.describe(**describe_kwargs))

    return converted


In [None]:
# Show the numeric and categorical summaries for the training set and keep the
# frame with str-typed PassengerId / Pclass.
train_df = analysis_pd(data=train_df)

In [None]:
# Same summaries and dtype casts for the test set.
test_df = analysis_pd(data=test_df)

In [None]:
# Stack train on top of test in a single frame with a fresh 0..n-1 index.
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [None]:
# Mark where each row came from: 0 = train, 1 = test.
# The train rows come first in the concatenated frame and .loc slices by index
# label, so this relies on all_df having a 0..n-1 index.
n_train_rows = train_df.shape[0]
all_df['Test_Flag'] = 0
all_df.loc[n_train_rows:, 'Test_Flag'] = 1

In [None]:
# Summaries for the combined train + test frame.
all_df = analysis_pd(data=all_df)

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# For each categorical column, draw a Venn diagram of the distinct values that
# appear in train versus test.
venn_columns = ['Sex', 'Ticket', 'Embarked']
fig, axes = plt.subplots(nrows=1, ncols=len(venn_columns), figsize=(8, 8))

for ax, col_name in zip(axes.ravel(), venn_columns):
    # Collect the unique values of the column in each frame as sets.
    train_values = set(train_df[col_name].unique())
    test_values = set(test_df[col_name].unique())
    venn2(
        subsets=(train_values, test_values),
        set_labels=('Train', 'Test'),
        ax=ax,
    )
    ax.set_title(col_name)

In [None]:
import seaborn as sns

# Class balance of the target column in the training set.
sns.countplot(data=train_df, x='Survived')

In [None]:
# Age histogram, one panel per Test_Flag value (0 = train, 1 = test).
fig = sns.FacetGrid(
    data=all_df,
    col='Test_Flag',
    hue='Test_Flag',
    height=4,
)
fig.map(sns.histplot, 'Age', kde=False, bins=30)

In [None]:
# Fare histogram, one panel per Test_Flag value (0 = train, 1 = test).
fig = sns.FacetGrid(
    data=all_df,
    col='Test_Flag',
    hue='Test_Flag',
    height=4,
)
fig.map(sns.histplot, 'Fare', kde=False, bins=30)

In [None]:
# Count plots of SibSp and Parch, split by Test_Flag (0 = train, 1 = test).
# Both columns go through the same loop so the two figures get identical
# legend settings (previously only the Parch plot set the legend explicitly).
for count_col in ('SibSp', 'Parch'):
    sns.countplot(data=all_df, x=count_col, hue='Test_Flag')
    plt.legend(title='Test_Flag', loc='upper right')
    plt.show()

In [None]:
# Pairwise correlations between the target and the numeric features, drawn
# on a fixed -1..1 colour scale with the values annotated.
corr_columns = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
corr_matrix = train_df[corr_columns].corr()
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)

In [None]:
# Passenger counts per class, split by survival.
sns.countplot(data=train_df, x='Pclass', hue='Survived')
plt.show()

In [None]:
# Age histogram in the training set, one panel per Survived value.
fig = sns.FacetGrid(
    data=train_df,
    col='Survived',
    hue='Survived',
    height=4,
)
fig.map(sns.histplot, 'Age', kde=False, bins=30)

In [None]:
# Fare histogram in the training set, one panel per Survived value.
fig = sns.FacetGrid(
    data=train_df,
    col='Survived',
    hue='Survived',
    height=4,
)
fig.map(sns.histplot, 'Fare', kde=False, bins=25)

In [None]:
# Count plots of SibSp and Parch in the training set, split by survival.
# Each column gets its own figure with the same legend settings.
for count_col in ('SibSp', 'Parch'):
    sns.countplot(data=train_df, x=count_col, hue='Survived')
    plt.legend(title='Survived', loc='upper right')
    plt.show()

In [None]:
# Fill missing Age and Fare values with the column median.
# NOTE(review): the medians are taken over all_df, i.e. train and test rows
# together — confirm that using test rows here is intended.
for numeric_col in ('Age', 'Fare'):
    column_median = all_df[numeric_col].median()
    all_df[numeric_col] = all_df[numeric_col].fillna(column_median)

# Missing Embarked values become the literal string 'NaN', i.e. their own category.
all_df['Embarked'] = all_df['Embarked'].fillna('NaN')

In [None]:
# Bin Age and Fare into 4 quantile-based intervals each, producing
# categorical band columns.
band_sources = (('AgeBand', 'Age'), ('FareBand', 'Fare'))
for band_col, source_col in band_sources:
    all_df[band_col] = pd.qcut(all_df[source_col], 4)

In [None]:
# One-hot encode Sex and Pclass.
onehot_cols = ["Sex", "Pclass"]
all_df = pd.get_dummies(data=all_df, columns=onehot_cols)

In [None]:
# One-hot encode the binned AgeBand / FareBand columns and Embarked.
band_and_port_cols = ['AgeBand', 'FareBand', 'Embarked']
all_df = pd.get_dummies(data=all_df, columns=band_and_port_cols)

In [None]:
from sklearn.model_selection import train_test_split

# Split the preprocessed all_df back into its train and test rows.
train = all_df.loc[all_df['Test_Flag'] == 0]
test = all_df.loc[all_df['Test_Flag'] == 1].reset_index(drop=True)
target = train['Survived']

# Columns that are not used as model features in this run.
drop_col = [
    'PassengerId', 'Age',
    'Ticket', 'Fare', 'Cabin',
    'Test_Flag', 'Name', 'Survived',
]
train = train.drop(columns=drop_col)
test = test.drop(columns=drop_col)

# Hold out 20% of the training rows as a validation set.
# Note: shuffle=True makes the split random, so random_state is fixed;
# without it the results would not be reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define a logistic regression with default settings and fit it on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predict on the same training split and print the accuracy.
y_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred)
print(train_accuracy)

In [None]:
# Predict on the held-out validation split and print the accuracy.
y_pred_val = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print(val_accuracy)

In [None]:
# Predict on the test features.
test_pred = model.predict(test)

# Put the predictions into the submission layout as integer 0/1 labels.
# Assumes the rows of `test` are in the same order as `sample_sub` — TODO confirm.
is_predicted_survivor = test_pred >= 0.5
sample_sub["Survived"] = np.where(is_predicted_survivor, 1, 0)
display(sample_sub.head(10))

# Write the submission file.
sample_sub.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Family members aboard = siblings/spouses + parents/children + the passenger themself.
relatives_aboard = train_df['SibSp'] + train_df['Parch']
train_df['FamilySize'] = relatives_aboard + 1

In [None]:
# Passenger counts per FamilySize, split by survival.
sns.countplot(data=train_df, x='FamilySize', hue='Survived')
plt.legend(title='Survived', loc='upper right')
plt.show()

In [None]:
# Share of training passengers per FamilySize, largest share first.
family_size_share = train_df['FamilySize'].value_counts(normalize=True, ascending=False)
display(family_size_share)