In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from Modeling import *
from MyProcess import *

In [2]:
# Load the Kaggle train/test CSVs and stack them row-wise into one combined frame.
df_train_raw, df_test_raw = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
df_all_raw = pd.concat([df_train_raw, df_test_raw])

# Work on copies so the *_raw frames are not the objects handed to preprocessing.
df_train, df_test, df_all = (
    raw.copy() for raw in (df_train_raw, df_test_raw, df_all_raw)
)

print(df_train.shape)
df_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show the test split's dimensions and its first five rows; the recorded shape
# is (418, 11), one column fewer than train because 'Survived' is absent.
print(df_test.shape)
df_test.head(5)

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Per-column missing-value counts: test split first, then train,
# separated by one blank line.
print(df_test_raw.isna().sum(), df_train_raw.isna().sum(), sep='\n\n')

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
# Build one myProcessor per split and run its preprocess_df().
# The test split is processed with _remove=False; in the recorded run it keeps
# all 418 rows, while the train split shrinks from 891 to 712 rows.
# NOTE(review): 891 - 712 = 179 matches the Age (177) + Embarked (2) null
# counts, yet Age is not among the output columns — confirm that dropping rows
# with a missing Age is intended.
# NOTE(review): the recorded Fare values appear to have been standardized with
# different statistics for train and test (two separate processor instances) —
# presumably each instance fits its own scaler; confirm, and reuse the train
# statistics for the test split if so.
process_basic_df_train = myProcessor(df_train)
process_basic_df_test = myProcessor(df_test)
preprocessed_basic_df_train =  process_basic_df_train.preprocess_df()
preprocessed_basic_df_test =  process_basic_df_test.preprocess_df(_remove=False)

# Show the resulting training frame and its shape.
print(preprocessed_basic_df_train.shape)
preprocessed_basic_df_train

(712, 17)


Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,1,0,1,0,0.693558,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1,1,0,1,0,0.350080,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
886,0,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,1,0,0,0,-0.086274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,1,1,1,0,0,-0.086274,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# df_test.shape -> (418, 11)
# After preprocessing the row count must still be 418, since every test
# passenger needs a prediction; the recorded shape is (418, 16).
print(preprocessed_basic_df_test.shape)
preprocessed_basic_df_test.head(5)

(418, 16)


Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,3,1,0,0,-0.497811,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3,0,1,0,-0.51266,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2,1,0,0,-0.464532,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,1,0,0,-0.482888,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,3,0,1,1,-0.417971,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Split the preprocessed training frame into the feature matrix and the
# 'Survived' target.
basic_X = preprocessed_basic_df_train.drop('Survived', axis='columns')
basic_y = preprocessed_basic_df_train.loc[:, 'Survived']

print(basic_X.shape)
basic_X

(712, 16)


Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,0,1,0,0.693558,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1,0,1,0,0.350080,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
886,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,0,0,0,-0.086274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,1,1,0,0,-0.086274,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# List the feature labels. In the recorded output the labels 1, 2 and 3 are
# integers while every other label is a string (mixed-type column index).
basic_X.columns

Index(['Pclass',    'Sex',  'SibSp',  'Parch',   'Fare',      'C',      'Q',
            'S',        1,        2,        3, 'Master',   'Miss',     'Mr',
          'Mrs',   'else'],
      dtype='object')

In [9]:
# Shapes of the model inputs next to the raw frames they were derived from,
# printed one per line.
print(
    *(frame.shape for frame in (basic_X, preprocessed_basic_df_test, df_train_raw, df_test_raw)),
    sep='\n',
)

(712, 16)
(418, 16)
(891, 12)
(418, 11)


In [10]:
# Display the training feature matrix (pandas truncates the middle rows).
basic_X

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,0,1,0,0.693558,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1,0,1,0,0.350080,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
886,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,0,0,0,-0.086274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,1,1,0,0,-0.086274,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Display the preprocessed test features (pandas truncates the middle rows).
preprocessed_basic_df_test

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,3,1,0,0,-0.497811,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3,0,1,0,-0.512660,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2,1,0,0,-0.464532,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,1,0,0,-0.482888,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,3,0,1,1,-0.417971,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,1,0,0,-0.493856,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
414,1,0,0,0,1.312180,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
415,3,1,0,0,-0.508183,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
416,3,1,0,0,-0.493856,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# Wrap the training features and labels in MyModelingProcess, a helper defined
# outside this notebook (presumably in the star-imported Modeling module).
basic_df_train_modeling = MyModelingProcess(_X = basic_X, _y= basic_y)

In [13]:
# Run the DecisionTree step; in the recorded run it saves the model to a file
# named "DecisionTree" and prints the best parameters and their score.
basic_df_train_modeling.DecisionTree()

모델 DecisionTree이 DecisionTree 파일로 저장되었습니다.
DecisionTree Best parameters:
{'classifier__criterion': 'gini', 'classifier__max_depth': 15, 'classifier__min_samples_split': 10}
0.8244755244755245


In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the DecisionTree step — confirm in Modeling.
basic_df_train_modeling.DecisionTree_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the
# DecisionTree step — confirm in Modeling.
basic_df_train_modeling.DecisionTree_sbs()

In [15]:
# Run the LogisticRegression step; in the recorded run it saves the model to a
# file named "LogisticRegression" and prints the grid-search result and score.
# NOTE(review): the recorded run reports "150 fits failed out of a total of
# 450" — presumably the parameter grid contains invalid penalty/solver
# combinations; confirm in Modeling and prune the grid.
basic_df_train_modeling.LogisticRegression()



모델 LogisticRegression이 LogisticRegression 파일로 저장되었습니다.
LogisticRegression grid search result: 
{'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
0.8033881611346401



150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yujin/anaconda

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the LogisticRegression step — confirm in Modeling.
basic_df_train_modeling.LogisticRegression_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the
# LogisticRegression step — confirm in Modeling.
basic_df_train_modeling.LogisticRegression_sbs()

In [16]:
# Run the RandomForest step; in the recorded run it saves the model to a file
# named "RandomForest" and prints the best parameters and their score.
basic_df_train_modeling.RandomForest()

모델 RandomForest이 RandomForest 파일로 저장되었습니다.
Best parameters with RandomForest:
{'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
0.8189205161036146


In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the RandomForest step — confirm in Modeling.
basic_df_train_modeling.RandomForest_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the
# RandomForest step — confirm in Modeling.
basic_df_train_modeling.RandomForest_sbs()

In [17]:
# Run the SVM step; in the recorded run it saves the model to a file named
# "SVC_grid" (the name it is later reloaded under) and prints the best
# parameters and their score.
basic_df_train_modeling.SVM()



모델 SVC_grid이 SVC_grid 파일로 저장되었습니다.
Best parameters with SVN:
{'classifier__C': 10, 'classifier__kernel': 'linear', 'classifier__max_iter': 2000}
0.8202107751403526




In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the SVM step — confirm in Modeling.
basic_df_train_modeling.SVM_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the SVM
# step — confirm in Modeling.
basic_df_train_modeling.SVM_sbs()

In [18]:
# Run the kernel-SVM step; in the recorded run the model is registered as
# "SVM_kernel", saved to a file named "SVC_kernel_grid", and its best
# parameters and score are printed.
basic_df_train_modeling.SVM_kernel()

모델 SVM_kernel이 SVC_kernel_grid 파일로 저장되었습니다.
SVM_kernel Best parameters:
{'classifier__C': 10, 'classifier__gamma': 0.01, 'classifier__kernel': 'rbf'}
0.8229981286319313


In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the kernel-SVM step — confirm in Modeling.
basic_df_train_modeling.SVM_kernel_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the
# kernel-SVM step — confirm in Modeling.
basic_df_train_modeling.SVM_kernel_sbs()

In [19]:
# Run the KNN step; in the recorded run it saves the model to a file named
# "KNN" and prints the best parameters and their score.
basic_df_train_modeling.KNN()

모델 KNN이 KNN 파일로 저장되었습니다.
Best parameters for KNN:
{'classifier__n_neighbors': 7, 'classifier__p': 1, 'classifier__weights': 'uniform'}
0.8174923667881415


In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the PCA variant of the KNN step — confirm in Modeling.
basic_df_train_modeling.KNN_pca()

In [None]:
# Not executed in the saved run (no execution count, no output).
# NOTE(review): presumably the sequential-feature-selection variant of the KNN
# step — confirm in Modeling.
basic_df_train_modeling.KNN_sbs()

In [20]:
# Inspect the (name, pipeline, score) tuples accumulated on the modeling object.
# NOTE(review): the recorded output contains a 'perceptron' entry although no
# visible cell trains one, and execution count 14 is missing — likely a deleted
# cell, so a fresh Restart & Run All would not reproduce this list.
basic_df_train_modeling.models

[('DecisionTree',
  Pipeline(steps=[('classifier',
                   DecisionTreeClassifier(max_depth=15, min_samples_split=10))]),
  0.8244755244755245),
 ('perceptron',
  Pipeline(steps=[('classifier', Perceptron())]),
  0.7738796414852753),
 ('LogisticRegression',
  Pipeline(steps=[('classifier',
                   LogisticRegression(C=1, max_iter=1000, penalty='l1',
                                      solver='saga'))]),
  0.8033881611346401),
 ('RandomForest',
  Pipeline(steps=[('classifier',
                   RandomForestClassifier(max_depth=10, min_samples_split=10,
                                          n_estimators=50))]),
  0.8189205161036146),
 ('SVC_grid',
  Pipeline(steps=[('classifier', SVC(C=10, kernel='linear', max_iter=2000))]),
  0.8202107751403526),
 ('SVM_kernel',
  Pipeline(steps=[('classifier', SVC(C=10, gamma=0.01))]),
  0.8229981286319313),
 ('KNN',
  Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=7, p=1))]),
  0.8174923667881415)]

In [21]:
# Sort the stored model tuples in place by their third element (the score),
# highest first. This mutates the list held by the modeling object.
basic_df_train_modeling.models.sort(key=lambda x: x[2], reverse=True)

In [22]:
# Print the name and accuracy of each stored model.
basic_df_train_modeling.print_model_accuracies()

Model Accuracies:
Model: DecisionTree, Accuracy: 0.8245
Model: SVM_kernel, Accuracy: 0.8230
Model: SVC_grid, Accuracy: 0.8202
Model: RandomForest, Accuracy: 0.8189
Model: KNN, Accuracy: 0.8175
Model: LogisticRegression, Accuracy: 0.8034
Model: perceptron, Accuracy: 0.7739


In [23]:
# Select the N models that feed the voting ensemble later in the notebook.
# In the recorded run the call prints the chosen names/accuracies and returns
# a list of (name, pipeline, score) tuples.
N = 5
top_N_models = basic_df_train_modeling.get_top_n_models(N)
top_N_models

Model: DecisionTree, Accuracy: 0.8245
Model: SVM_kernel, Accuracy: 0.8230
Model: SVC_grid, Accuracy: 0.8202
Model: RandomForest, Accuracy: 0.8189
Model: KNN, Accuracy: 0.8175


[('DecisionTree',
  Pipeline(steps=[('classifier',
                   DecisionTreeClassifier(max_depth=15, min_samples_split=10))]),
  0.8244755244755245),
 ('SVM_kernel',
  Pipeline(steps=[('classifier', SVC(C=10, gamma=0.01))]),
  0.8229981286319313),
 ('SVC_grid',
  Pipeline(steps=[('classifier', SVC(C=10, kernel='linear', max_iter=2000))]),
  0.8202107751403526),
 ('RandomForest',
  Pipeline(steps=[('classifier',
                   RandomForestClassifier(max_depth=10, min_samples_split=10,
                                          n_estimators=50))]),
  0.8189205161036146),
 ('KNN',
  Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=7, p=1))]),
  0.8174923667881415)]

In [24]:
# Reload four persisted models by name and collect them in load order.
# In the recorded run each load_model() call prints a confirmation line and
# returns a (name, pipeline, score) tuple.
model_list = []
Dtree = basic_df_train_modeling.load_model("DecisionTree")
model_list.append(Dtree)

knn = basic_df_train_modeling.load_model("KNN")
model_list.append(knn)

SVC_grid = basic_df_train_modeling.load_model("SVC_grid")
model_list.append(SVC_grid)

# Bound to a distinct name on purpose: assigning the result to
# `LogisticRegression` would shadow the scikit-learn class imported at the top
# of the notebook and break any later use of that class.
logreg_loaded = basic_df_train_modeling.load_model("LogisticRegression")
model_list.append(logreg_loaded)

모델 DecisionTree이 DecisionTree 파일에서 불러와졌습니다.
모델 KNN이 KNN 파일에서 불러와졌습니다.
모델 SVC_grid이 SVC_grid 파일에서 불러와졌습니다.
모델 LogisticRegression이 LogisticRegression 파일에서 불러와졌습니다.


In [28]:
# Inspect the first reloaded entry: a (name, pipeline, score) tuple.
model_list[0]

('DecisionTree',
 Pipeline(steps=[('classifier',
                  DecisionTreeClassifier(max_depth=15, min_samples_split=10))]),
 0.8244755244755245)

In [29]:
# Re-display the training feature matrix before it is passed to the ensemble.
basic_X

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,0,1,0,0.693558,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1,0,1,0,0.350080,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
886,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,0,0,0,-0.086274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,1,1,0,0,-0.086274,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
# Display the training labels ('Survived'), taken from the same preprocessed
# frame as basic_X.
basic_y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 712, dtype: int64

In [31]:
# Ensemble prediction helper (hard voting over several tuned models).
def ensemble_predict_(models, X_train, y_train, X_test):
    """Fit a hard-voting ensemble on the training data and predict ``X_test``.

    Parameters
    ----------
    models : iterable of tuple
        Entries whose first two items are ``(name, estimator)``. Any further
        items (for example the cross-validation score) are ignored.
    X_train, y_train
        Training features and labels, passed to ``VotingClassifier.fit``.
    X_test
        Features to predict on.

    Returns
    -------
    The value returned by ``VotingClassifier.predict(X_test)``.

    Raises
    ------
    ValueError
        If ``models`` contains no entries.
    """
    # Build the (name, estimator) pairs VotingClassifier expects; extra tuple
    # fields such as the stored score are dropped.
    estimators = [(name, model) for name, model, *_ in models]
    if not estimators:
        raise ValueError("ensemble_predict_ needs at least one (name, model) entry")

    # Imported here so the helper does not depend on `from Modeling import *`
    # happening to expose VotingClassifier.
    from sklearn.ensemble import VotingClassifier

    ensemble_model = VotingClassifier(estimators=estimators, voting='hard')

    # VotingClassifier.fit fits clones of the given estimators on
    # (X_train, y_train): only their hyperparameters are reused, any earlier
    # fitted state is not.
    ensemble_model.fit(X_train, y_train)

    return ensemble_model.predict(X_test)

In [32]:
# NOTE(review): ensemble_predict is imported here but never called — the call
# below uses the notebook-local ensemble_predict_ (trailing underscore).
from Modeling import  ensemble_predict
# Predict the test split with the voting ensemble built from top_N_models.
# Plain arrays (.values) are passed instead of DataFrames.
# NOTE(review): presumably because the column labels mix str and int, which
# scikit-learn rejects as feature names — confirm.
y_pred = ensemble_predict_(models=top_N_models, 
                           X_train=basic_X.values, 
                           y_train=basic_y.values, 
                           X_test=preprocessed_basic_df_test.values)



In [33]:
# Check there is one prediction per test passenger (recorded shape: (418,))
# and display the predicted 0/1 labels.
print(y_pred.shape)
y_pred

(418,)


array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [35]:
# Build the two-column submission frame without modifying df_test_raw.
# Writing a 'Survived' column into the raw test frame would make earlier cells
# that display df_test_raw (e.g. the null-count cell) show a column the CSV
# never had when they are re-run.
# assign() attaches y_pred positionally and raises if its length does not
# match the number of test rows.
df_submit = df_test_raw[['PassengerId']].assign(Survived=y_pred)

In [40]:
# Confirm the submission frame holds exactly 'PassengerId' and 'Survived'.
df_submit.columns

Index(['PassengerId', 'Survived'], dtype='object')

In [41]:
# Write the submission to submit.csv in the working directory, omitting the
# DataFrame index so only the two data columns are written.
df_submit.to_csv('submit.csv', index=False)