In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [29]:
# 学習データの読み込み
train_df = pd.read_csv("train.csv").replace("male",0).replace("female",1)

In [30]:
# 欠損値の確認
train_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [31]:
# 欠損値を補完
train_df["Age"].fillna(train_df.Age.median(), inplace=True)
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1
train_df2 = train_df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)

In [32]:
# 学習データとテストデータに分割
data = train_df2.values
train_df = data[:round(len(data)*0.8),:]
test_df = data[round(len(data)*0.8):,:]

In [33]:
x_train = train_df[:,2:]
y_train = train_df[:,1]

In [34]:
# Pipeline作成
estimators = [('pca', PCA()),
              ('svm', svm.SVC())]
pl = Pipeline(estimators)

In [39]:
parameters = {
    "pca__n_components" : range(2, 3),
    "svm__kernel" : ["linear", "rbf"],
    'svm__C': np.logspace(0, 2, 2).tolist(),
    "svm__gamma": np.logspace(-3, 0, 2).tolist()
}

In [40]:
clf = GridSearchCV(pl, parameters, n_jobs=-1)
clf.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'pca__n_components': range(2, 3), 'svm__kernel': ['linear', 'rbf'], 'svm__C': [1.0, 100.0], 'svm__gamma': [0.001, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [41]:
clf.best_estimator_.get_params()

{'pca': PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False),
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': 2,
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'steps': [('pca',
   PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False)),
  ('svm', SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))],
 'svm': SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'svm__C': 100.0,
 'svm__cache_size': 200,
 'svm__class_weight': None,
 'sv

In [12]:
# テストデータで評価
output = forest.predict(test_df[:, 2:])
accuracy_score(output, test_df[:,1])

0.8258426966292135

In [11]:
train_df2.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,FamilySize
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.034212,-0.040143
Survived,-0.005007,1.0,-0.338481,0.543351,-0.06491,0.016639
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.339898,0.065997
Sex,-0.042939,0.543351,-0.1319,1.0,-0.081163,0.200988
Age,0.034212,-0.06491,-0.339898,-0.081163,1.0,-0.245619
FamilySize,-0.040143,0.016639,0.065997,0.200988,-0.245619,1.0
