In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Let pandas render up to 900 rows before truncating displayed frames.
pd.options.display.max_rows = 900

In [None]:
# Load the training set, the test set and the sample submission (revised version).
data_train, data_test, data_gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [None]:
# Preview the first ten training rows.
data_train.head(n=10)

In [None]:
# Preview the first five test rows.
data_test.head(n=5)

In [None]:
# Preview the sample submission file.
data_gender_submission.head(n=5)

In [None]:
# Summary statistics of the training set (describe() with its default settings).
train_summary = data_train.describe()
train_summary

In [None]:
# Fare distribution, colored by survival outcome.
# multiple='stack' stacks the two groups so neither is hidden behind the other.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=data_train,
    x='Fare',
    hue='Survived',
    bins=30,
    multiple='stack',
    ax=ax,
)
ax.set_title('Fare Distribution by Survival')
plt.show()

In [None]:
# import pandas_profiling
import ydata_profiling

In [None]:
# data_train.profile_report()

In [None]:
# Passenger counts per ticket class, split by survival.
sns.countplot(data=data_train, x='Pclass', hue='Survived')

In [None]:
# Passenger counts per sex, split by survival.
sns.countplot(data=data_train, x='Sex', hue='Survived')

In [None]:
# Passenger counts per distinct age value, split by survival.
sns.countplot(data=data_train, x='Age', hue='Survived')

In [None]:
# Overlaid age histograms: one semi-transparent series per survival outcome (0, then 1).
for outcome in (0, 1):
    ages_for_outcome = data_train.loc[data_train['Survived'] == outcome, 'Age'].dropna()
    plt.hist(ages_for_outcome, bins=30, alpha=0.5, label=str(outcome))
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend(title='Survived')

In [None]:
# Passenger counts per SibSp value, split by survival.
sns.countplot(data=data_train, x='SibSp', hue='Survived')

In [None]:
# Look at the first rows of the test set again.
data_test.head(n=5)

In [None]:
# Look at the first rows of the sample submission again.
data_gender_submission.head(n=5)

In [None]:
# Overlaid fare histograms (fares 0-250, 20 bins), one semi-transparent series
# per survival outcome: 0 = did not survive, 1 = survived.
plt.hist(data_train.loc[data_train['Survived'] == 0, 'Fare'].dropna(), range=(0, 250), bins=20, alpha=0.5, label='0')
plt.hist(data_train.loc[data_train['Survived'] == 1, 'Fare'].dropna(), range=(0, 250), bins=20, alpha=0.5, label='1')
# The plotted variable is Fare, so label the x axis accordingly (it was labelled 'Age').
plt.xlabel('Fare')
plt.ylabel('Count')
plt.legend(title='Survived')

In [None]:
# Stack the training and test rows into one frame so that feature engineering
# is applied to both at once. Each part keeps its own original row index.
frames_to_stack = [data_train, data_test]
data_all = pd.concat(frames_to_stack, sort=False)
data_all

In [None]:
# Count the missing values in each feature.
# The test rows have no 'Survived' value, so those rows are counted as missing there.
data_all.isna().sum()

In [None]:
# Encode passenger sex numerically: 'Sex' becomes male=0 / female=1, and the
# indicator columns 'Sex_male' / 'Sex_female' are derived from the same codes.
# An explicit mapping is used instead of Series.replace, which relies on silent
# downcasting that pandas has deprecated. The identity entries (0 -> 0, 1 -> 1)
# keep the result correct if this cell is re-run after 'Sex' is already encoded
# (with replace, a re-run would silently copy the 0/1 codes into 'Sex_male',
# inverting it).
sex_code = data_all['Sex'].map({'male': 0, 'female': 1, 0: 0, 1: 1})
data_all['Sex_male'] = 1 - sex_code
data_all['Sex_female'] = sex_code
data_all['Sex'] = sex_code
data_all.head()

In [None]:
# Build a family-size feature: parents/children + siblings/spouses + the passenger.
n_train = len(data_train)
family_size = data_all['Parch'] + data_all['SibSp'] + 1
data_all['Families'] = family_size
# The first n_train rows of the combined frame are the training rows, the rest are test rows.
data_train['Families'] = family_size.iloc[:n_train]
data_test['Families'] = family_size.iloc[n_train:]
sns.countplot(data=data_train, x='Families', hue='Survived')

In [None]:
# A family size of 1 is by far the most common, so expose it as its own flag:
# Alone = 1 when the passenger travels without family, otherwise 0.
travels_alone = data_all['Families'] == 1
data_all['Alone'] = travels_alone.astype(int)
data_all.head()

In [None]:
# Impute missing ages with ONE integer drawn at random between (mean - std) and
# (mean + std); that same value is written into every missing row.
ave = data_all['Age'].mean()
std = data_all['Age'].std()
# The seed is fixed, so the drawn value is the same on every run.
np.random.seed(0)
age_fill_value = np.random.randint(ave - std, ave + std)
data_all.loc[:, 'Age'] = data_all['Age'].fillna(age_fill_value)

data_all.head()
# Ages range widely from the very young to the elderly, so filling with the
# median is another option:
# data_all['Age'].fillna(data_all['Age'].median(), inplace=True)

In [None]:
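# Alternative to the previous cell (kept commented out for reference): missing ages are again
# filled from the mean ± standard deviation range, but a separate random value is drawn for each missing row.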

# ave = data_all['Age'].mean()
# std = data_all['Age'].std()
# #「平均±標準偏差」間の値からランダムで選ばれた値で補完
# np.random.seed(42)
# mask = data_all['Age'].isnull()
# n_missing = mask.sum()
# rand_values = np.random.randint(ave - std, ave + std, size=n_missing)
# data_all.loc[mask, 'Age'] = rand_values

# data_all.head()


In [None]:
# Flag the rows whose Age is still missing and report the flags and their count.
mask = data_all['Age'].isna()
n_missing = mask.sum()
print(mask, n_missing, sep='\n')

In [None]:
# Fill missing ports of embarkation with 'S', then encode them as S=0, C=1, Q=2.
# The column is assigned with data_all['Embarked'] = ... rather than
# data_all.loc[:, 'Embarked'] = ...: in pandas 2.x the .loc[:, col] form writes
# the values into the existing object-dtype column, so the integer dtype
# produced by astype(int) would be discarded.
embarked_filled = data_all['Embarked'].fillna('S')
data_all['Embarked'] = embarked_filled.map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [None]:
# Drop, for now, the features that look least likely to influence the prediction.
drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data_all = data_all.drop(columns=drop_columns)

In [None]:
# Preview the combined frame after dropping columns.
data_all.head(n=5)

In [None]:
# Re-check the per-column missing-value counts.
data_all.isna().sum()

In [None]:
# Fill missing fares with the mean fare of the combined data.
fare_mean = data_all['Fare'].mean()
data_all.loc[:, 'Fare'] = data_all['Fare'].fillna(fare_mean)


In [None]:
# Encode 'Sex' as male=0 / female=1 (a no-op when the column is already numeric).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data_all['Sex']: an inplace method on a selected
# column is chained assignment, which pandas 2.x warns about and which does not
# update data_all when Copy-on-Write is enabled.
data_all['Sex'] = data_all['Sex'].replace(['male', 'female'], [0, 1])
data_all.head()

In [None]:
# Count each Embarked value, including missing (NaN) entries.
embarked_counts = data_all['Embarked'].value_counts(dropna=False)
embarked_counts

In [None]:
# Fill missing Embarked values with 'S' and encode S=0, C=1, Q=2 (a no-op when
# the column is already filled and numeric).
# The result is assigned back to the column instead of calling fillna/replace
# with inplace=True on data_all['Embarked']: an inplace method on a selected
# column is chained assignment, which pandas 2.x warns about and which does not
# update data_all when Copy-on-Write is enabled.
data_all['Embarked'] = (
    data_all['Embarked']
    .fillna('S')
    .replace(['S', 'C', 'Q'], [0, 1, 2])
)
data_all.head()

In [None]:
# Re-check the per-column missing-value counts.
data_all.isna().sum()

In [None]:
# Split the combined frame back into training rows and test rows.
# The row count is taken from the current data_train, and the first that many
# rows of data_all become the new data_train.
n_train_rows = len(data_train)
data_train = data_all.iloc[:n_train_rows]
data_test = data_all.iloc[n_train_rows:]

In [None]:
# Separate the target ('Survived') from the feature columns before fitting the model.
y_data_train = data_train['Survived']
X_data_train = data_train.drop(columns='Survived')
X_data_test = data_test.drop(columns='Survived')

In [None]:
# Preview the test feature matrix.
X_data_test.head(n=5)

In [None]:
# Import the random-forest classifier and configure the model:
# 100 trees, each limited to depth 2, with a fixed random seed.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

In [None]:
# Fit the classifier on the training features and labels.
clf.fit(X=X_data_train, y=y_data_train)

In [None]:
# Predict survival for the test features with the fitted model.
y_data_pred = clf.predict(X=X_data_test)

In [None]:
# Display the predictions returned by clf.predict for the test features.
y_data_pred

In [None]:
# Build the submission from the sample submission file, replacing 'Survived'
# with the integer predictions. assign() returns a new frame, so the loaded
# sample submission itself is left untouched.
predicted_labels = y_data_pred.astype(int)
submit = data_gender_submission.assign(Survived=predicted_labels)
submit.head()

In [None]:
import os
# Show what is currently in the Kaggle working directory.
working_dir_entries = os.listdir("/kaggle/working")
print(working_dir_entries)

In [None]:
# Write the submission file to the current directory, without the row index.
submit.to_csv(path_or_buf='engineering_submit.csv', index=False)