# VARIABLE DESCRIPTIONS:
* survival:        Survival (0 = No; 1 = Yes)
* pclass:          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* name:            Name
* sex:             Sex
* age:             Age
* sibsp:           Number of Siblings/Spouses Aboard
* parch:           Number of Parents/Children Aboard
* ticket:          Ticket Number
* fare:            Passenger Fare
* cabin:           Cabin
* embarked:        Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
%matplotlib inline

## Data Cleaner

In [2]:
def data_cleaner(filename):
    """Load a Titanic CSV from ``data/`` and return the engineered feature frame.

    Parameters
    ----------
    filename : str
        Name of a CSV file inside the ``data/`` directory (e.g. ``'train.csv'``).
        The file must provide the columns ``Sex``, ``Embarked``, ``Age``,
        ``Pclass``, ``SibSp``, ``Parch``, ``Fare``, ``Name``, ``Ticket`` and
        ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        The columns of the CSV that are not dropped at the end of this
        function (e.g. ``PassengerId``, ``Pclass`` and ``Survived`` when the
        file has it), followed by:

        * ``Gender``     -- 0 = female, 1 = male
        * ``AgeFill``    -- ``Age`` with gaps filled by the median age of the
          passenger's (Gender, Pclass) group
        * ``FamilySize`` -- ``SibSp + Parch``
        * ``FareFill``   -- ``Fare`` as float, with missing or zero fares
          replaced by the median positive fare of the passenger's class
          (0.0 when that class has no positive fare at all)

    Notes
    -----
    ``EmbarkedInt``, ``AgeIsNull``, ``Title`` and ``TitleInt`` are computed but
    currently removed again by the final ``drop``; take them out of that list
    to expose them as features.
    """
    df = pd.read_csv('data/' + filename)

    # Gender: Females = 0, Males = 1
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1})

    # EmbarkedInt: nan = 0, C = 1, Q = 2, S = 3
    # Fill with the integer 0 (not the string '0') so the column stays numeric.
    df['EmbarkedInt'] = df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3}).fillna(0).astype(int)

    # AgeIsNull: 1 = Age was NaN, 0 = Age was present
    df['AgeIsNull'] = df['Age'].isnull().astype(int)

    # AgeFill: missing ages take the median age of the same (Gender, Pclass) group.
    group_median_age = df.groupby(['Gender', 'Pclass'])['Age'].transform('median')
    df['AgeFill'] = df['Age'].fillna(group_median_age)

    # FamilySize
    df['FamilySize'] = df['SibSp'] + df['Parch']

    # FareFill: a fare that is NaN or 0 is treated as unknown and replaced by
    # the median of the positive fares in the passenger's class.
    fare = df['Fare'].astype(float)
    median_fare = df.loc[fare > 0].groupby('Pclass')['Fare'].median()
    fare_missing = fare.isnull() | (fare == 0)
    class_median = df['Pclass'].map(median_fare)
    # Final fillna covers a class with no positive fare to take a median from.
    df['FareFill'] = fare.where(~fare_missing, class_median).fillna(0.0)

    # Title: the text between the first comma and the following period of Name,
    # e.g. 'Braund, Mr. Owen Harris' -> 'Mr'. Names without a comma yield NaN
    # instead of raising.
    df['Title'] = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()

    # TitleInt: 0 for the titles listed here, 1 for every other title.
    zero_titles = ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Mr']
    df['TitleInt'] = (~df['Title'].isin(zero_titles)).astype(int)

    # axis is passed by keyword: the positional form is rejected by pandas >= 2.0.
    df = df.drop(['Age', 'Name', 'Fare', 'Sex', 'Ticket', 'Cabin', 'AgeIsNull',
                  'SibSp', 'Parch', 'EmbarkedInt', 'Embarked', 'Title', 'TitleInt'],
                 axis=1)

    return df

In [9]:
# Run both splits through the same cleaning function.
train_df = data_cleaner('train.csv')
test_df = data_cleaner('test.csv')

# Leave only Pclass and Gender as model inputs by removing the other
# engineered columns. axis is passed by keyword because pandas >= 2.0 no
# longer accepts it positionally.
unused_features = ['AgeFill', 'FamilySize', 'FareFill']
train_df = train_df.drop(unused_features, axis=1)
test_df = test_df.drop(unused_features, axis=1)

print(train_df.head(3))
print(test_df.head(1))

   PassengerId  Survived  Pclass  Gender
0            1         0       3       1
1            2         1       1       0
2            3         1       3       0
   PassengerId  Pclass  Gender
0          892       3       1


In [8]:
## Prepare DFs for sklearn

# Materialise the frame as an ndarray once. In the frame printed above,
# column 0 is PassengerId, column 1 is Survived (the label) and the remaining
# columns are the model features.
train_matrix = train_df.values
y_train = train_matrix[:, 1]
X_train = train_matrix[:, 2:]

X_train
# X_test = test_df.values[:,1:].astype(float)

array([[3, 1],
       [1, 0],
       [3, 0],
       ..., 
       [3, 0],
       [1, 1],
       [3, 1]])

In [10]:
from sklearn.svm import SVC

# Support vector classifier with default parameters, trained on the arrays
# prepared above. fit() hands back the estimator, so it can be bound directly;
# the bare name then displays its parameters as the cell output.
clf = SVC().fit(X_train, y_train)
clf

# output = clf.predict(X_test)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
# `output` is not assigned anywhere earlier in the notebook (the only predict
# call that would create it is commented out), so compute the predictions here
# from the same two features the classifier was trained on.
output = clf.predict(test_df[['Pclass', 'Gender']].values)

submission = pd.DataFrame({'PassengerId': test_df.PassengerId,
                           'Survived': output.astype(int)})

In [None]:
# Write the submission frame to disk; index=False keeps the row index out of the file.
submission.to_csv('SVC_default.csv', index=False)

In [81]:
def prettyPicture(clf, X, y, h=.01, xlabel='PClass', ylabel='Gender'):
    """Plot a fitted classifier's decision regions with the data points on top.

    Parameters
    ----------
    clf : object
        Fitted estimator; its ``predict`` is called on a two-column array of
        mesh coordinates.
    X : numpy.ndarray
        Two-column feature array. Column 0 is drawn on the x axis and
        column 1 on the y axis.
    y : array-like or None
        Label of each row of ``X``, used to colour the scatter points. Pass
        ``None`` to draw the points in the default colour.
    h : float, optional
        Step size of the mesh on which ``clf`` is evaluated.
    xlabel, ylabel : str, optional
        Axis labels.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot also the data points, coloured by their label. A colormap is only
    # meaningful together with `c`, so it is passed only when labels are given;
    # the black edge keeps points visible on a same-coloured region.
    scatter_kwargs = {'edgecolors': 'k'}
    if y is not None:
        scatter_kwargs.update(c=y, cmap=plt.cm.Paired)
    plt.scatter(X[:, 0], X[:, 1], **scatter_kwargs)

    plt.axis('tight')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.show()

    

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the previous module path
    from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# tmp = train_df[['Pclass',  'Gender',  'AgeFill',  'FamilySize', 'FareFill']]

# Features are selected by column name so the arrays do not depend on the
# column order of the frames.
X = train_df[['Pclass',  'Gender']].values
Y = train_df['Survived'].values

x = test_df[['Pclass',  'Gender']].values

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=72)



# we create an instance of SVM and fit our data.
clf = SVC(C=1)
clf.fit(X, Y)

pred = clf.predict(x)
# print(accuracy_score(y_test, pred))

# prettyPicture(clf, X_test, y_test)



In [84]:
# Two-column frame pairing each test passenger id with its predicted label,
# cast to int.
submission = pd.DataFrame({
    'PassengerId': test_df.PassengerId,
    'Survived': pred.astype(int),
})