In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
training_data = pd.read_csv('train.csv')
testing_data = pd.read_csv('test.csv')
p_id = testing_data['PassengerId']
data = pd.concat([training_data, testing_data])

data.drop('PassengerId', axis=1, inplace=True)
survived = data['Survived'].dropna()
data['Survived'].fillna(-1, inplace=True)

In [3]:
def one_hot(dataframe, name):
    dataframe = pd.concat([dataframe, pd.get_dummies(dataframe[name])
                           .rename(columns=lambda x: name + str(x))], axis=1)
    return dataframe.drop(name, axis=1)

In [4]:
def preprocessing(data) :
    # Cabin
    data['Cabin'].fillna('U0', inplace=True)
    data['CabinSection'] = LabelEncoder().fit_transform(data['Cabin'].map(lambda x: x[0]))
    
    data['CabinDistance'] = data['Cabin'].map(lambda x: x[1:])
    data['CabinDistance'] = data['CabinDistance'].map(lambda x: x.split(' ')[0])
    data['CabinDistance'].where(data['CabinDistance'] != '', '0', inplace=True)
    data['CabinDistance'] = data['CabinDistance'].map(lambda x: int(x))
    data['CabinDistance'] = StandardScaler().fit_transform(data['CabinDistance'].values.reshape(-1, 1))
    
    # Sex
    data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
    
    # Embarked
    data['Embarked'].fillna('S', inplace=True)
    data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])
    
    # Name
    data['Name'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0])
    data['Name'] = LabelEncoder().fit_transform(data['Name'])
    
    # Fare
    data['Fare'].fillna(-1, inplace=True)
    medians = dict()
    for pclass in data['Pclass'].unique():
        median = data.Fare[(data["Fare"] != -1) & (data['Pclass'] == pclass)].median()
        medians[pclass] = median
    for index, row in data.iterrows():
        if row['Fare'] == -1:
            data.loc[index, 'Fare'] = medians[row['Pclass']]
    data['Fare'] = StandardScaler().fit_transform(data['Fare'].values.reshape(-1, 1))
    
    # Age
    data['Age'].fillna(-1, inplace=True)
    medians = dict()
    for title in data['Name'].unique():
        median = data.Age[(data["Age"] != -1) & (data['Name'] == title)].median()
        medians[title] = median
    for index, row in data.iterrows():
        if row['Age'] == -1:
            data.loc[index, 'Age'] = medians[row['Name']]
            
    data['Age'] = StandardScaler().fit_transform(data['Age'].values.reshape(-1, 1))
    
    for index, row in data.iterrows():
        ticket = row['Ticket']
        sibsp = row['SibSp']
        parch = row['Parch']

        if sibsp > 0 or parch > 0:
            ages = list()
            for index2, row2 in data[data['Ticket'] == ticket].iterrows():
                ages.append(row2['Age'])
            data.loc[index, 'Age2'] = min(ages)

        else:
            data.loc[index, 'Age2'] = row['Age']
            
    data['Age2'] = StandardScaler().fit_transform(data['Age2'].values.reshape(-1, 1))
    
    # Prefix
    died_titles = ('Don', 'Rev', 'Capt', 'Jonkheer')
    survived_titles = ('Mme', 'Ms', 'Lady', 'Sir', 'Mlle', 'the Countess')
    data['Title_Died'] = data['Name'].apply(lambda x: int(x in died_titles))
    data['Title_Survived'] = data['Name'].apply(lambda x: int(x in survived_titles))

    for title in ('Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Major', 'Col'):
        data['Title_{}'.format(title)] = data['Name'].apply(lambda x: int(x == title))
        
    data.drop('Name', axis=1, inplace=True)
    
    data = one_hot(data, 'Pclass')
    
    # Ticket
    data.drop('Ticket', axis=1, inplace=True)
    
    data.drop('Cabin', axis=1, inplace=True)
    
    
    
    return data

In [5]:
processed_data = preprocessing(data)
training_data2 = processed_data[data['Survived'] != -1]
testing_data2 = processed_data[data['Survived'] == -1]


training_data2.drop('Survived', axis=1, inplace=True)
testing_data2.drop('Survived', axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(training_data2, survived, test_size=0.20, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

models = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

for model in models:
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)

0.754189944134
0.782122905028
0.754189944134
0.810055865922
0.810055865922
0.798882681564
0.776536312849
0.743016759777
0.430167597765




In [7]:
models = [
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
]

index = 1
for model in models:
    model.fit(training_data2, survived)
    prediction = model.predict(testing_data2)
    
    submission = pd.DataFrame({
        "PassengerId": testing_data["PassengerId"],
        "Survived": prediction.astype(int)
    })
    submission.to_csv('submission{}.csv'.format(index), index=False)
#     np.savetxt('submission{}.csv'.format(index), prediction, delimiter=",")
    index += 1

