In [2]:
#pandas & numpy
import pandas as pd
import numpy as np

#Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

#the classification algorithms
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#for parameter optimization
from sklearn.model_selection import GridSearchCV

#for evaluation 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import metrics

#ignore all warnings
import warnings
warnings.filterwarnings("ignore")

# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
data = pd.read_csv('titanicAssignment.csv')


In [7]:
# Missing-value check
def deficiency_table(df):
    """Summarise the missing values of ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: "欠損数" (the number
        of null entries) and "%" (that count as a percentage of ``len(df)``).
    """
    missing_counts = df.isnull().sum()
    missing_share = 100 * missing_counts / len(df)

    # Side-by-side concat of two unnamed Series yields columns 0 and 1,
    # which are then given their display labels.
    summary = pd.concat([missing_counts, missing_share], axis=1)
    summary.columns = ["欠損数", "%"]
    return summary

# Show the per-column missing-value summary for the loaded dataset.
deficiency_table(data)

Unnamed: 0,欠損数,%
pclass,0,0.0
survived,0,0.0
sex,0,0.0
age,0,0.0
sibsp,0,0.0
parch,0,0.0
fare,0,0.0
embarked,0,0.0


In [3]:
# Load the data (reloaded here so this cell can be re-run on its own)
data = pd.read_csv("titanicAssignment.csv")

# One-hot encode the categorical columns: sex, embarked
ohe = OneHotEncoder(categories="auto")
feature_arr = ohe.fit_transform(data[["sex", "embarked"]]).toarray()

# Build the input (X) and output (Y) frames.
# The dummy columns are labelled with the raw category values, in the order
# the encoder emits them (all "sex" categories, then all "embarked" ones).
feature_labels = list(ohe.categories_[0]) + list(ohe.categories_[1])
Y = data["survived"]
# Rebind instead of dropping in place: same resulting `data`, but no hidden
# mutation of a frame object that may have been displayed earlier.
data = data.drop(columns=["survived", "sex", "embarked"])
# Give the dummy frame the same index as `data` so the column-wise concat
# aligns row by row.
X = pd.concat(
    [data, pd.DataFrame(feature_arr, columns=feature_labels, index=data.index)],
    axis=1,
)

# NOTE(review): both scalers below are fitted on the full dataset, before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the transform. Consider fitting on the training split only.

# Min-max normalisation
# Whole-column assignment (X[cols] = ...) replaces the columns and lets their
# dtype become float. Writing floats into integer columns through
# X.loc[:, cols] = ... relies on silent upcasting, which pandas has deprecated.
minMaxScaler = MinMaxScaler()
X[["pclass"]] = minMaxScaler.fit_transform(X[["pclass"]])

# Standardisation (zero mean, unit variance)
stdScaler = StandardScaler()
std_cols = ["age", "sibsp", "parch", "fare"]
X[std_cols] = stdScaler.fit_transform(X[std_cols])


In [4]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Ground-truth labels as single-column frames named "True", re-indexed from 0
# so they line up positionally with freshly built prediction frames.
Y_train_df = (
    Y_train.to_frame()
    .rename(columns={"survived": "True"})
    .reset_index(drop=True)
)
Y_test_df = (
    Y_test.to_frame()
    .rename(columns={"survived": "True"})
    .reset_index(drop=True)
)


In [5]:
# Model definition
# SVM
svmCLF = SVC(C=0.1, kernel="linear")

# # Cross-validation
# # SVM
# scores = cross_val_score(svmCLF, X_train, Y_train, scoring="f1_macro", cv=5)
# print("SVM Score")
# print(scores)
# print("SVM F1 Macro: %.2f%% (%.2f%%)" % (scores.mean() * 100, scores.std() * 100))

# Training
svmCLF.fit(X_train, Y_train)

# Predictions on the training data.
# Column labels are passed as a list: a set is unordered and is rejected by
# the DataFrame constructor in newer pandas releases.
svm_pred = pd.DataFrame(svmCLF.predict(X_train), columns=["svm"])
# Y_train_df and svm_pred both carry a 0..n-1 index, so the concat pairs each
# prediction with its ground-truth label.
output = pd.concat([Y_train_df, svm_pred], axis=1)
svm_true = output[output["True"] == output["svm"]]
print("Train Data Accuracy: %.2f%%" % (len(svm_true) / len(output) * 100))

# Predictions on the test data
svm_pred = pd.DataFrame(svmCLF.predict(X_test), columns=["svm"])
output = pd.concat([Y_test_df, svm_pred], axis=1)
svm_true = output[output["True"] == output["svm"]]
print("Test Data Accuracy: %.2f%%" % (len(svm_true) / len(output) * 100))

Train Data Accuracy: 77.35%
Test Data Accuracy: 77.69%


In [17]:
# Model definition
# Naive Bayes
NBCLF = GaussianNB()
# # Cross-validation
# # Naive Bayes
# scores = cross_val_score(NBCLF, X_train,Y_train, scoring="f1_macro", cv=5)
# print("NB Score")
# print(scores)
# print("NB F1 Macro: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

# Training
NBCLF.fit(X_train, Y_train)

# Predictions on the training data.
# Column labels are passed as a list: a set is unordered and is rejected by
# the DataFrame constructor in newer pandas releases.
nb_pred = pd.DataFrame(NBCLF.predict(X_train), columns=["nb"])
# Y_train_df and nb_pred both carry a 0..n-1 index, so the concat pairs each
# prediction with its ground-truth label.
output = pd.concat([Y_train_df, nb_pred], axis=1)
nb_true = output[output["True"] == output["nb"]]
print("Train Data Accuracy: %.2f%%" % (len(nb_true) / len(output) * 100))

# Predictions on the test data
nb_pred = pd.DataFrame(NBCLF.predict(X_test), columns=["nb"])
output = pd.concat([Y_test_df, nb_pred], axis=1)
nb_true = output[output["True"] == output["nb"]]
print("Test Data Accuracy: %.2f%%" % (len(nb_true) / len(output) * 100))

Train Data Accuracy: 76.53%
Test Data Accuracy: 76.86%


In [36]:
def RFC(max_depth, n_estimators, random_state=None):
    """Train a random forest and report its accuracy on both data splits.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` variables.

    Parameters
    ----------
    max_depth : int
        Maximum tree depth passed to ``RandomForestClassifier``.
    n_estimators : int
        Number of trees passed to ``RandomForestClassifier``.
    random_state : int or None, default None
        Seed forwarded to the classifier. ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    tuple of (float, float)
        ``(train_acc, test_acc)``, each an accuracy percentage in [0, 100].
    """
    # Model definition
    # Random Forest
    RFCLF = RandomForestClassifier(
        max_depth=max_depth, n_estimators=n_estimators, random_state=random_state
    )
    # # Cross-validation
    # # Random Forest
    # scores = cross_val_score(RFCLF,X_train,Y_train, scoring="f1_macro", cv=5)
    # print("RF Score")
    # print(scores)
    # print("RF F1 Macro: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

    RFCLF.fit(X_train, Y_train)

    # score() is the classifier's mean accuracy, i.e. the fraction of
    # predictions equal to the true label, so no intermediate comparison
    # frame is needed.
    train_acc = float(RFCLF.score(X_train, Y_train)) * 100
    test_acc = float(RFCLF.score(X_test, Y_test)) * 100

    return train_acc, test_acc

# Sweep the same candidate values for both hyper-parameters and log one line
# per combination: "max_depth=..., n_estimators=..., <train %>, <test %>".
param_values = [3, 5, 8, 13, 21, 34, 55, 89, 100]
with open("RFC_result.txt", "w") as f:
    for max_depth in param_values:
        for n_estimators in param_values:
            # Fixed seed so the written results can be reproduced on re-run.
            train_acc, test_acc = RFC(max_depth, n_estimators, random_state=42)
            f.write("max_depth={}, n_estimators={}, {:.2f}, {:.2f}\n".format(max_depth, n_estimators, train_acc, test_acc))
