### 基本ライブラリの読み込み

In [None]:
# ライブラリの読み込み
import numpy as np
import pandas as pd

### trainデータ

In [None]:
# 1-1 Load the training data from a tab-separated file
train_df = pd.read_csv("train.tsv", sep="\t")
train_df.head()

In [None]:
# 1-2 Keep only the feature columns of the training data
# (everything from column position 2 onward; position 1 is used as the target
# later on, position 0 is presumably a row id -- TODO confirm)
trainX = train_df.iloc[:, 2:]
trainX.head()

In [None]:
# 1-3 Check the number of rows and columns
trainX.shape

In [None]:
# 1-4 Check the basic information of the feature frame
trainX.info()

In [None]:
# 1-5 Count missing values per column
trainX.isnull().sum()

In [None]:
# 1-6 Count how often each category occurs in the "stalk-root" column
trainX["stalk-root"].value_counts()
# Note: the column contains "?" entries; keep that in mind for the next steps

In [None]:
# 1-7 Look at the first rows of the "stalk-root" column
trainX["stalk-root"].head()

In [None]:
# 2-1 Convert the single-letter category codes of the features to numbers
import string

# Lookup table for the conversion: 'a' -> 2, 'b' -> 3, ..., 'z' -> 27.
class_mapping_x = {
    letter: code
    for code, letter in enumerate(string.ascii_lowercase, start=2)
}

# Feature columns that hold single-letter category codes.
categorical_columns = (
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
)

# Work on a copy so the assignments below do not write into a slice of train_df.
trainX = trainX.copy()

# Values that are not in the lookup table (such as the "?" entries seen in
# "stalk-root") become NaN here.
for column in categorical_columns:
    # Plain column assignment replaces the column outright, so the result has a
    # numeric dtype; `.loc[:, column] = ...` may write in place and keep the
    # original object dtype on recent pandas versions.
    trainX[column] = trainX[column].map(class_mapping_x)

In [None]:
# 2-2 Check the result of the conversion
display(trainX.head())

In [None]:
# 2-3 Check whether any missing values remain
trainX.isnull().sum()

In [None]:
# 2-3 Count how often each value occurs in the "stalk-root" column
trainX["stalk-root"].value_counts()

In [None]:
# 2-4 Look at the first rows of the "stalk-root" column
trainX["stalk-root"].head()

In [None]:
# 2-5 Replace missing values with the per-column mean
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22. Prefer its
# replacement `SimpleImputer` and fall back to the old class only on
# installations that predate it.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer

    # Legacy API: NaN is spelled 'NaN' and axis=0 means "per column".
    imp = Imputer(missing_values='NaN',
                  strategy='mean',
                  axis=0)
else:
    # SimpleImputer always works per column, so it has no `axis` argument.
    imp = SimpleImputer(missing_values=np.nan,
                        strategy='mean')

# Learn the mean of every feature column
imp.fit(trainX)

# Apply the fitted imputer to fill the missing values of trainX.
# transform() returns a bare array, so the column and row labels are put back;
# keeping the row index lets later joins with y align row by row.
trainX_new_columns = trainX.columns.values
trainX2 = pd.DataFrame(imp.transform(trainX),
                       columns=trainX_new_columns,
                       index=trainX.index)

# Show the result
trainX2.head()

In [None]:
# 2-6 Count the missing values left in the "stalk-root" column
trainX2["stalk-root"].isnull().sum()

In [None]:
# 2-7 Count how often each value occurs in the "stalk-root" column
trainX2["stalk-root"].value_counts()

In [None]:
# 3-1 Extract only the target variable (the column at position 1) into y
# The list indexer [1] keeps y as a one-column DataFrame.
y = train_df.iloc[:, [1]]
y.head()

In [None]:
# 3-2 Convert the string class labels to numbers
# Lookup table from the label letters to the numeric classes.
class_mapping_y = {'p': 1, 'e': 0}
y = y.copy()

# Replace the 'Y' column with its numeric version. Plain column assignment
# yields an integer column; `.loc[:, 'Y'] = ...` may write in place and keep
# the object dtype, which scikit-learn classifiers reject as an unknown label
# type.
y['Y'] = y['Y'].map(class_mapping_y)

# Show the imputed features next to the encoded target.
display(trainX2.join(y).head())

### testデータ

In [None]:
# 4-1 Load the test data from a tab-separated file
test_df = pd.read_csv("test.tsv" ,sep="\t")
test_df.head()

In [None]:
# 4-2 Keep only the feature columns of the test data
# (everything from column position 1 onward; position 0 is presumably a row id
# -- TODO confirm)
testX = test_df.iloc[:, 1:]
testX.head()

In [None]:
# 4-3 Check the number of rows and columns
testX.shape

In [None]:
# 4-4 Check the basic information of the feature frame
testX.info()

In [None]:
# 4-5 Count missing values per column
testX.isnull().sum()

In [None]:
# 4-6 Count how often each category occurs in the "stalk-root" column
testX["stalk-root"].value_counts()
# Note: the column contains "?" entries; keep that in mind for the next steps

In [None]:
# 4-7 Look at the first rows of the "stalk-root" column
testX["stalk-root"].head()

In [None]:
# 5-1 Convert the single-letter category codes of the features to numbers
import string

# Lookup table for the conversion: 'a' -> 2, 'b' -> 3, ..., 'z' -> 27.
class_mapping_x = {
    letter: code
    for code, letter in enumerate(string.ascii_lowercase, start=2)
}

# Feature columns that hold single-letter category codes.
categorical_columns = (
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
)

# Work on a copy so the assignments below do not write into a slice of test_df.
testX = testX.copy()

# Values that are not in the lookup table (such as the "?" entries seen in
# "stalk-root") become NaN here.
for column in categorical_columns:
    # Plain column assignment replaces the column outright, so the result has a
    # numeric dtype; `.loc[:, column] = ...` may write in place and keep the
    # original object dtype on recent pandas versions.
    testX[column] = testX[column].map(class_mapping_x)

In [None]:
# 5-2 Check the result of the conversion
display(testX.head())

In [None]:
# 5-3 Check whether any missing values remain
testX.isnull().sum()

In [None]:
# 5-4 Count how often each value occurs in the "stalk-root" column
testX["stalk-root"].value_counts()

In [None]:
# 5-5 Look at the first rows of the "stalk-root" column
testX["stalk-root"].head()

In [None]:
# 5-6 Replace missing values using the imputer fitted on the training data

# `imp` is deliberately NOT refitted here: calling imp.fit(testX) would throw
# away the column means learned from trainX and fill the test set with
# statistics the models never saw during training.
testX_new_columns = testX.columns.values

# transform() returns a bare array, so the column and row labels are put back.
testX2 = pd.DataFrame(imp.transform(testX),
                      columns=testX_new_columns,
                      index=testX.index)

# Show the result
testX2.head()

In [None]:
# 5-7 Count the missing values left in the "stalk-root" column
testX2["stalk-root"].isnull().sum()

In [None]:
# 5-8 Count how often each value occurs in the "stalk-root" column
testX2["stalk-root"].value_counts()

### ホールドアウト検証

In [None]:
# 6-1 Split the training data into a fitting part and a held-out part
from sklearn.model_selection import train_test_split

# Holdout: 30% of the rows are set aside for evaluation; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    trainX2,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# 6-2 Reshape: flatten the one-column label frames into 1-D arrays
y_train, y_test = (labels.values.ravel() for labels in (y_train, y_test))

In [None]:
# 6-3 import libraries
from sklearn.preprocessing import StandardScaler # 《標準化》スタンダードスケラー
from sklearn.neighbors import KNeighborsClassifier # 《分類》K近傍法
from sklearn.linear_model import LogisticRegression # 《分類》ロジスティック回帰
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier #  《分類》ランダム森、勾配ブースティング
from sklearn.neural_network import MLPClassifier # 《分類》多層ニューラルネットワーク
from sklearn.pipeline import Pipeline # 《一本化》パイプライン

In [None]:
# 6-4 Build one pipeline per candidate classifier
def _with_scaling(estimator):
    """Return a pipeline that standardizes the features before `estimator`."""
    return Pipeline([('scl', StandardScaler()),
                     ('est', estimator)])


pipe_KNC = _with_scaling(KNeighborsClassifier())
pipe_LR = _with_scaling(LogisticRegression(random_state=1))
pipe_RF = _with_scaling(RandomForestClassifier(random_state=1))
pipe_GB = _with_scaling(GradientBoostingClassifier(random_state=1))
pipe_MLP = _with_scaling(
    MLPClassifier(hidden_layer_sizes=(5, 3), max_iter=500, random_state=1)
)

In [None]:
# 6-5 Fit every pipeline on the training part of the holdout split
pipe_KNC.fit(X_train, y_train)
pipe_LR.fit(X_train, y_train)
pipe_RF.fit(X_train, y_train)
pipe_GB.fit(X_train, y_train)
pipe_MLP.fit(X_train, y_train)

In [39]:
#6-6 Compare the fitted pipelines by F1 score
from sklearn.metrics import f1_score

# Label printed for each pipeline, paired with the fitted pipeline itself.
fitted_pipelines = (
    ('pipe_KNC', pipe_KNC),
    ('pipe_LR', pipe_LR),
    ('pipe_RF', pipe_RF),
    ('pipe_GB', pipe_GB),
    ('pipe_MLP', pipe_MLP),
)

# Both parts of the holdout split are scored, the training part first.
evaluation_sets = (
    ('Train', X_train, y_train),
    ('Test', X_test, y_test),
)

# Prints one "<pipeline>_<split>:<score>" line per combination, e.g.
# "pipe_KNC_Train:1.000".
for pipe_name, pipe in fitted_pipelines:
    for split_name, features, labels in evaluation_sets:
        score = f1_score(labels, pipe.predict(features))
        print('%s_%s:%.3f' % (pipe_name, split_name, score))

pipe_KNC_Train:1.000
pipe_KNC_Test:1.000
pipe_LR_Train:0.959
pipe_LR_Test:0.962
pipe_RF_Train:1.000
pipe_RF_Test:1.000
pipe_GB_Train:1.000
pipe_GB_Test:1.000
pipe_MLP_Train:1.000
pipe_MLP_Test:1.000


In [None]:
# 6-7 Choose the algorithm
# 'RF' (random forest) is adopted

# Store the values computed by predict in the variable pred
pred = pipe_RF.predict(testX2)
pred

### 提出用データ作成

In [None]:
# 7-1 Load the submission template csv (the file is read without a header row)
sample = pd.read_csv("sample_submit.csv",header=None)
sample.head()

In [None]:
# 7-2 Check the number of rows and columns
sample.shape

In [None]:
# 7-3 Convert pred, which is stored as an array, into a DataFrame
pred_df = pd.DataFrame(pred)
pred_df.head()

In [None]:
# 7-4 Convert the numeric predictions back to label letters
# The submission file has to contain 'e' and 'p', so the numeric classes are
# mapped back to those strings.
class_mapping_y2 = {1: 'p', 0: 'e'}
pred_df = pred_df.copy()

# Replace column 0 outright. Writing strings into the numeric column through
# `iloc[:, 0] = ...` is an in-place, incompatible-dtype assignment that recent
# pandas versions deprecate.
pred_df[0] = pred_df[0].map(class_mapping_y2)

display(pred_df.head())

In [None]:
# 7-5 Put the predictions into column 1 of the submission frame
# NOTE(review): this relies on sample and pred_df sharing the same row index --
# confirm that both have the same number of rows.
sample[1] = pred_df
sample.head()

### csvファイルの書き出し

In [None]:
# 7-6 Write the submission data as csv, without the index and header rows
# `index` and `header` are passed as the booleans the to_csv API documents,
# rather than relying on None happening to be falsy.
sample.to_csv("submit0.csv", index=False, header=False)