In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

# Read the two pre-processed CSV files into DataFrames in one pass.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_process.csv', 'test_process.csv')
)

In [2]:
# Split the training frame into the target column and the remaining predictors.
y_train = train["Survived"]
X_train = train.drop(columns="Survived")

# The test frame is used for prediction without its PassengerId column.
X_test = test.drop(columns="PassengerId").copy()

# Display the three shapes as the cell output.
X_train.shape, y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

## Model들 학습 (cross_validation 사용)

In [3]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit on the whole training set (fit returns the estimator itself), then
# predict labels for the test set.
model_logi = LogisticRegression().fit(X_train, y_train)
prediction = model_logi.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer; only the mean is printed.
acc_logi = cross_val_score(estimator=model_logi, X=X_train, y=y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_logi.mean()))

cross-val-score.mean 
0.797


In [4]:
# Support Vector Machines
from sklearn.svm import SVC

# Fit on the whole training set (fit returns the estimator itself), then
# predict labels for the test set.
svc = SVC().fit(X_train, y_train)
prediction = svc.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer; only the mean is printed.
acc_svc = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_svc.mean()))

cross-val-score.mean 
0.823


In [5]:
# k-nearest neighbours (k = 3)
from sklearn.neighbors import KNeighborsClassifier

# Fit on the whole training set (fit returns the estimator itself), then
# predict labels for the test set.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
prediction = knn.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer; only the mean is printed.
acc_knn = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_knn.mean()))

cross-val-score.mean 
0.781


In [6]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit on the whole training set (fit returns the estimator itself), then
# predict labels for the test set.
gaussian = GaussianNB().fit(X_train, y_train)
prediction = gaussian.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer; only the mean is printed.
acc_gaussian = cross_val_score(estimator=gaussian, X=X_train, y=y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_gaussian.mean()))

cross-val-score.mean 
0.772


In [7]:
# Perceptron
from sklearn.linear_model import Perceptron

# Fit on the whole training set (fit returns the estimator itself), then
# predict labels for the test set.
perceptron = Perceptron().fit(X_train, y_train)
prediction = perceptron.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer; only the mean is printed.
acc_perceptron = cross_val_score(estimator=perceptron, X=X_train, y=y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_perceptron.mean()))

cross-val-score.mean 
0.703


In [8]:
# Linear SVC
from sklearn.svm import LinearSVC

# random_state is pinned so the result is reproducible on Restart & Run All:
# when LinearSVC uses its dual solver, the data is shuffled with this seed.
# (When the primal solver is used the seed has no effect.)
linear_svc = LinearSVC(random_state=42)

# Fit on the whole training set and predict labels for the test set.
linear_svc.fit(X_train, y_train)
prediction = linear_svc.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer. cross_val_score clones the estimator for each fold, so the
# pinned seed carries over to every fold.
acc_linear_svc = cross_val_score(linear_svc, X_train, y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_linear_svc.mean()))

cross-val-score.mean 
0.795




In [9]:
# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data every epoch; without a fixed seed
# the fitted model (and therefore the printed score and the model ranking
# built from it) changes from run to run. Pin the seed for reproducibility.
sgd = SGDClassifier(random_state=42)

# Fit on the whole training set and predict labels for the test set.
sgd.fit(X_train, y_train)
prediction = sgd.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer. cross_val_score clones the estimator for each fold, so the
# pinned seed carries over to every fold.
acc_sgd = cross_val_score(sgd, X_train, y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_sgd.mean()))

cross-val-score.mean 
0.753


In [10]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# The tree builder permutes candidate features at each split, so ties between
# equally good splits can be broken differently on each run. Pin the seed so
# the printed score is reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit on the whole training set and predict labels for the test set.
decision_tree.fit(X_train, y_train)
prediction = decision_tree.predict(X_test)

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer. cross_val_score clones the estimator for each fold, so the
# pinned seed carries over to every fold.
acc_decision_tree = cross_val_score(decision_tree, X_train, y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_decision_tree.mean()))

cross-val-score.mean 
0.794


In [11]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and per-split feature sampling are random; without a
# fixed seed the printed score (and the model ranking built from it) changes
# from run to run. Pin the seed for reproducibility.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the whole training set and predict labels for the test set.
random_forest.fit(X_train, y_train)
prediction = random_forest.predict(X_test)

# The former bare `random_forest.score(X_train, y_train)` statement was
# removed: its result was neither stored nor displayed.

# 5-fold cross-validation scores on the training data, using the estimator's
# default scorer. cross_val_score clones the estimator for each fold, so the
# pinned seed carries over to every fold.
acc_random_forest = cross_val_score(random_forest, X_train, y_train, cv=5)
print('cross-val-score.mean \n{:.3f}'.format(acc_random_forest.mean()))

cross-val-score.mean 
0.800


## Models 결과 비교

In [12]:
# One (label, mean cross-validation score) row per model, in the same order
# as the original table so the row index is unchanged.
models = pd.DataFrame(
    [
        ('Support Vector Machines', acc_svc.mean()),
        ('KNN', acc_knn.mean()),
        ('Logistic Regression', acc_logi.mean()),
        ('Random Forest', acc_random_forest.mean()),
        ('Naive Bayes', acc_gaussian.mean()),
        ('Perceptron', acc_perceptron.mean()),
        ('Stochastic Gradient Decent', acc_sgd.mean()),
        ('Linear SVC', acc_linear_svc.mean()),
        ('Decision Tree', acc_decision_tree.mean()),
    ],
    columns=['Model', 'Score'],
)

# Show the models ranked from best to worst mean score.
models.sort_values('Score', ascending=False)

Unnamed: 0,Model,Score
0,Support Vector Machines,0.82266
3,Random Forest,0.800251
2,Logistic Regression,0.796887
7,Linear SVC,0.794627
8,Decision Tree,0.79351
1,KNN,0.781213
4,Naive Bayes,0.772205
6,Stochastic Gradient Decent,0.753129
5,Perceptron,0.702574


### Score가 가장 높은 SVM을 이용하여 제출 파일(titanic_svc.csv) 생성 후 제출

In [13]:
# Re-predict with the SVC fitted earlier, because `prediction` was overwritten
# by every later model cell.
prediction = svc.predict(X=X_test)

In [14]:
# Pair each test passenger's id with its predicted label, then write the
# two-column table to disk without the row index.
submission = test[["PassengerId"]].assign(Survived=prediction)
submission.to_csv('titanic_svc.csv', index=False)