In [13]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

titanic_path = "C:/Users/weiso131/Desktop/sklearn/titanic_disaster/titanic/train.csv"

titanic_df = pd.read_csv(titanic_path)

titanic_df = titanic_df.sample(n=len(titanic_df))

In [14]:
def data_pre_process(keep : list, df : pd.DataFrame):
    new_df = df[keep]
    new_df = new_df.dropna()
    for i in new_df.index:
        if (new_df["Sex"].get(i) == "female"): 
            new_df.at[i ,"Sex"] = 0
        else: 
            new_df.at[i ,"Sex"] = 1

    return new_df

In [15]:
keep = ["Survived", "Pclass", "Sex", "SibSp", "Parch",  "Fare"]
keep_Experimental = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Age"]
titanic_steady_df = data_pre_process(keep, titanic_df)
titanic_Experimental_df = data_pre_process(keep_Experimental, titanic_df)


In [16]:
X_steady = titanic_steady_df.drop("Survived", axis=1).values
y_steady = titanic_steady_df["Survived"].values



X_Experimental = titanic_Experimental_df.drop("Survived", axis=1).values
y_Experimental = titanic_Experimental_df["Survived"].values

In [17]:
def cut_block(X : list, y : list, cut=5):
    """
    return X_block, y_block
    """
    n_20 = int(len(X) / cut)
    X_block = []
    y_block = []
    for i in range(cut):
        X_block.append(X[i*n_20:(i+1)*n_20])
        y_block.append(y[i*n_20:(i+1)*n_20])
    return X_block, y_block

def assemble_block(X_block : list, y_block : list, test_i : int, cut=5):
    """
    return:
    x_train, y_train, x_test, y_test
    """
    x_test, y_test = X_block[test_i], y_block[test_i]
    x_train = []
    y_train = []
    for i in range(cut):
        if (i == test_i): continue
        x_train.extend(X_block[i])
        y_train.extend(y_block[i])

    return x_train, y_train, x_test, y_test


In [18]:
def train(x_train : list, y_train : list):
    """
    return rf_clf
    """
    # 創建隨機森林分類器實例
    rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)

    # 訓練分類器
    rf_clf.fit(x_train, y_train)

    

    return rf_clf
def test(x_train : list, y_train : list, x_test : list, y_test : list, rf_clf):
    prediction_train = rf_clf.predict(x_train)
    print(f"train_acc: {accuracy_score(prediction_train, y_train)}")
    prediction_test = rf_clf.predict(x_test)
    print(f"test_acc: {accuracy_score(prediction_test, y_test)}")

    return accuracy_score(prediction_train, y_train), accuracy_score(prediction_test, y_test)


In [19]:
cut = 10

X_steady_block, y_steady_block = cut_block(X_steady, y_steady, cut=cut)
X_Experimental_block, y_Experimental_block = cut_block(X_Experimental, y_Experimental, cut=cut)  



avg_steady_train = 0
avg_steady_test = 0
avg_Experimental_train = 0
avg_Experimental_test = 0



for i in range(cut):
    x_steady_train, y_steady_train, x_steady_test, y_steady_test = \
        assemble_block(X_steady_block, y_steady_block, i, cut=cut)
    
    x_Experimental_train, y_Experimental_train, x_Experimental_test, y_Experimental_test = \
        assemble_block(X_Experimental_block, y_Experimental_block, i, cut=cut)

    
    rf_clf_steady = train(x_steady_train, y_steady_train)
    print("steady:")
    result = test(x_steady_train, y_steady_train, x_steady_test, y_steady_test, rf_clf_steady)

    avg_steady_train += result[0]
    avg_steady_test += result[1]

    
    rf_clf_Experimental = train(x_Experimental_train, y_Experimental_train)
    print("Experimental:")
    result = test(x_Experimental_train, y_Experimental_train, x_Experimental_test, y_Experimental_test, rf_clf_Experimental)

    avg_Experimental_train += result[0]
    avg_Experimental_test += result[1]

steady:
train_acc: 0.9238451935081149
test_acc: 0.8314606741573034
Experimental:
train_acc: 0.9358372456964006
test_acc: 0.7746478873239436
steady:
train_acc: 0.9300873907615481
test_acc: 0.7078651685393258
Experimental:
train_acc: 0.9342723004694836
test_acc: 0.8028169014084507
steady:
train_acc: 0.9225967540574282
test_acc: 0.797752808988764
Experimental:
train_acc: 0.9358372456964006
test_acc: 0.7746478873239436
steady:
train_acc: 0.920099875156055
test_acc: 0.8764044943820225
Experimental:
train_acc: 0.9327073552425665
test_acc: 0.8169014084507042
steady:
train_acc: 0.9263420724094882
test_acc: 0.7640449438202247
Experimental:
train_acc: 0.9358372456964006
test_acc: 0.7746478873239436
steady:
train_acc: 0.9225967540574282
test_acc: 0.8426966292134831
Experimental:
train_acc: 0.9342723004694836
test_acc: 0.8732394366197183
steady:
train_acc: 0.9213483146067416
test_acc: 0.7865168539325843
Experimental:
train_acc: 0.9327073552425665
test_acc: 0.8450704225352113
steady:
train_acc: 0.9

In [20]:
print(f"avg steady train:{avg_steady_train / cut}, avg steady test:{avg_steady_test / cut}")
print(f"avg Experimental train:{avg_Experimental_train / cut}, avg Experimental test:{avg_Experimental_test / cut}")

avg steady train:0.9243445692883896, avg steady test:0.7910112359550562
avg Experimental train:0.9353677621283257, avg Experimental test:0.7929577464788733


In [24]:
keep_normal = ["Survived", "Pclass", "Sex", "SibSp", "Parch",  "Fare", "Age"]
keep_lost_fare = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Age"]
keep_lost_age = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Fare"]

titanic_normal_df = data_pre_process(keep_normal, titanic_df)
titanic_lost_fare_df = data_pre_process(keep_lost_fare, titanic_df)
titanic_lost_age_df = data_pre_process(keep_lost_age, titanic_df)

X_normal = titanic_normal_df.drop("Survived", axis=1).values
y_normal = titanic_normal_df["Survived"].values

X_lost_fare = titanic_lost_fare_df.drop("Survived", axis=1).values
y_lost_fare = titanic_lost_fare_df["Survived"].values

X_lost_age = titanic_lost_age_df.drop("Survived", axis=1).values
y_lost_age = titanic_lost_age_df["Survived"].values



In [28]:
final_normal_model = train(X_normal, y_normal)
final_lost_fare_model = train(X_lost_fare, y_lost_fare)
final_lost_age_model = train(X_lost_age, y_lost_age)

In [31]:
test_path = "C:/Users/weiso131/Desktop/sklearn/titanic_disaster/titanic/test.csv"


keep_test_normal = [ "Pclass", "Sex", "SibSp", "Parch",  "Fare", "Age"]
keep_test_lost_fare = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
keep_test_lost_age = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]


titanic_test_df = pd.read_csv(test_path)

titanic_test_normal = data_pre_process(keep_test_normal, titanic_test_df).dropna()
titanic_test_lost_fare = data_pre_process(keep_test_lost_fare, titanic_test_df).dropna()
titanic_test_lost_age = data_pre_process(keep_test_lost_age, titanic_test_df).dropna()





In [32]:
prediction_submission_normal = final_normal_model.predict(titanic_test_normal.values)
prediction_submission_lost_fare = final_lost_fare_model.predict(titanic_test_lost_fare.values)
prediction_submission_lost_age = final_lost_age_model.predict(titanic_test_lost_age.values)


In [39]:
all_prediction = np.zeros(418)
all_predict_count = np.zeros(418)

for i in range(len(titanic_test_normal)):
    index = titanic_test_normal.index[i]
    all_prediction[index] += prediction_submission_normal[i] * 2
    all_predict_count[index] += 2

for i in range(len(titanic_test_lost_fare)):
    index = titanic_test_lost_fare.index[i]
    all_prediction[index] += prediction_submission_lost_fare[i]
    all_predict_count[index] += 1

for i in range(len(titanic_test_lost_age)):
    index = titanic_test_lost_age.index[i]
    all_prediction[index] += prediction_submission_lost_age[i]
    all_predict_count[index] += 1

all_prediction = all_prediction / all_predict_count

print(all_prediction)
print(all_predict_count)

[0.   0.25 0.75 0.75 0.25 0.   0.   0.   0.75 0.   0.   0.   1.   0.
 1.   1.   0.   0.5  0.   0.25 0.   0.75 1.   0.   1.   0.   1.   0.5
 0.75 0.   0.   0.   1.   0.   0.75 0.   0.   0.   0.   1.   0.25 1.
 0.   1.   1.   0.   0.25 0.   1.   0.75 0.75 0.   1.   1.   0.   0.
 0.   0.   0.   1.   0.   0.   0.   1.   0.75 1.   0.75 0.   0.   1.
 0.75 0.   0.25 0.75 1.   0.25 0.   1.   0.   0.75 1.   0.   0.25 0.
 0.   0.   1.   0.5  1.   1.   0.   0.   0.75 0.   0.5  0.   1.   0.5
 0.   0.   1.   0.   0.   0.   0.5  0.   0.   0.   0.   0.   0.   0.
 1.   1.   1.   0.   0.   1.   0.5  1.   1.   0.   1.   0.   0.   0.25
 0.   1.   0.   0.   0.75 0.25 0.   0.   0.   0.   0.   0.   0.25 0.
 0.   1.   0.   0.   1.   0.   0.   0.   1.   0.25 1.   0.   0.   0.75
 0.   0.   1.   0.75 1.   1.   1.   1.   1.   0.   0.   0.25 0.25 0.
 1.   0.25 0.   0.25 0.   0.   0.   1.   1.   0.25 1.   1.   0.   0.25
 1.   0.   1.   0.   1.   0.   0.   0.   0.   0.   0.75 0.25 1.   0.
 0.75 0.   0.   0.   1.   

In [40]:
submission_path = "C:/Users/weiso131/Desktop/sklearn/titanic_disaster/titanic/submission.csv"

submission_df = pd.read_csv(submission_path)

for i in range(len(submission_df)):
    index = submission_df.index[i]
    
    if (all_prediction[i] < 0.5): all_prediction[i] = 0
    else: all_prediction[i] = 1

    submission_df.at[index, "Survived"] = int(all_prediction[i])



In [43]:
print(submission_df)
submission_df.to_csv("submission_weiso.csv", index = False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         1
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
