In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
# from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, RadiusNeighborsClassifier
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LinearRegression, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve
# from sklearn.tree import DecisionTreeClassifier 
# from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, LabelEncoder
# from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
import re
# from scipy.sparse import csr_matrix, hstack, vstack
%matplotlib inline

In [3]:
RANDOM_STATE = 5
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

In [4]:
train.count()

Survived    891
Pclass      891
Name        891
Sex         891
Age         714
SibSp       891
Parch       891
Ticket      891
Fare        891
Cabin       204
Embarked    889
dtype: int64

In [5]:
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
y_train = train['Survived']
X_train = train.drop(['Survived'], axis=1)
X_test = test.copy()

X_full = [X_train, X_test]

def get_title(name):
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    else:
        return ""

for df in X_full:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['Fare'] = df['Fare'].fillna(X_train['Fare'].median())

    age_mean = X_train['Age'].mean()
    age_std = X_train['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_mean - age_std, age_mean + age_std, size=age_null_count)
    df['Age'][np.isnan(df['Age'])] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    
    df['Title'] = df['Name'].apply(get_title)

    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
    
    df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
    df.loc[ df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)
    
    # Mapping Age
    df.loc[ df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age'] = 4
    
    df.drop(['Name', 'Ticket', 'Cabin', 'SibSp'], axis=1, inplace=True)

In [6]:
X, y = X_train, y_train
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.33, random_state=RANDOM_STATE)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 241 to 868
Data columns (total 9 columns):
Pclass        596 non-null int64
Sex           596 non-null int64
Age           596 non-null int64
Parch         596 non-null int64
Fare          596 non-null int64
Embarked      596 non-null int64
FamilySize    596 non-null int64
IsAlone       596 non-null int64
Title         596 non-null int64
dtypes: int64(9)
memory usage: 46.6 KB


In [8]:
svc = SVC(random_state=RANDOM_STATE)
svc.fit(X_train, y_train)
y_preds = svc.predict(X_valid)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.8361788617886179


In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [55]:
svc = SVC(random_state=RANDOM_STATE)
cross_val_score(svc, X, y, cv=skf).mean()

0.8260221782709118

In [54]:
gs = GridSearchCV(estimator=svc, param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                            'class_weight': [None, 'balanced', {0:.6, 1:.4}], 
                                              'C': np.linspace(1., 20., 40)},
                  scoring='accuracy', cv=skf)
gs.fit(X_train, y_train);
gs.best_params_

{'C': 1.0, 'class_weight': None, 'kernel': 'rbf'}

In [56]:
svc = SVC(random_state=RANDOM_STATE)
svc.fit(X, y)
y_preds = svc.predict(X_test)

In [57]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    253
1    165
Name: Survived, dtype: int64

In [58]:
y_fin.to_csv('submissions/svc_001.csv', header=True, index_label='PassengerId') 

In [60]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked', 'FamilySize',
       'IsAlone', 'Title'],
      dtype='object')