In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


%matplotlib inline
from collections import Counter

In [2]:
def get_person(passenger):
    age,sex = passenger
    return 'child' if age < 16 else sex
    

def clean_data(csv):
    df = pd.read_csv(csv,index_col='PassengerId')
    df["Embarked"] = df["Embarked"].fillna("S")
    df['FamilyCount'] = df.SibSp + df.Parch
    
    embark_dummies_titanic  = pd.get_dummies(df['Embarked'])
    embark_dummies_titanic.drop(['S'], axis=1, inplace=True)
    df = df.join(embark_dummies_titanic)
    
    df['Person'] = df[['Age','Sex']].apply(get_person,axis=1)
    person_dummies_titanic  = pd.get_dummies(df['Person'])
    person_dummies_titanic.columns = ['Child','Female','Male']
    person_dummies_titanic.drop(['Male'], axis=1, inplace=True)
    df = df.join(person_dummies_titanic)

    pclass_dummies_titanic  = pd.get_dummies(df['Pclass'])
    pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']
    pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)
    df = df.join(pclass_dummies_titanic)
    
    count_nan_age_titanic = df["Age"].isnull().sum()
    rand_1 = np.random.randint(df["Age"].mean() - df["Age"].std(), df["Age"].mean() + df["Age"].std(), size = count_nan_age_titanic)
    df["Age"][np.isnan(df["Age"])] = rand_1
    

    df['Name_length'] = df['Name'].apply(lambda i: len(i))

    df_reg = df.drop(['Name','Ticket','Sex','Cabin','SibSp','Parch','Embarked','Cabin','Person','Name_length','C','Q'], axis=1).copy()
    
    return df_reg

In [3]:
train_df = clean_data('train.csv')
train_df.shape
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,Survived,Age,Fare,FamilyCount,Child,Female,Class_1,Class_2
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,22.0,7.25,1,0,0,0,0
2,1,38.0,71.2833,1,0,1,1,0
3,1,26.0,7.925,0,0,1,0,0
4,1,35.0,53.1,1,0,1,1,0
5,0,35.0,8.05,0,0,0,0,0


In [4]:
test_df = clean_data('test.csv')
print test_df.shape

(418, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
train_df = train_df.drop([c for c in train_df.columns if c not in test_df.columns][1:], axis = 1)
train_df.shape

(891, 8)

In [6]:
test_df = test_df.drop([c for c in test_df.columns if c not in train_df.columns],axis = 1)
test_df.shape

(418, 7)

In [7]:
1-train_df.Survived.mean()

0.6161616161616161

In [32]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2008)

In [9]:
target = 'Survived'
features = [c for c in train_df.columns if c != target]

y_train = train_df[target]
X_train = train_df[features]

In [10]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures()
Xp=poly.fit_transform(X_train)

In [27]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
Xs = ss.fit_transform(X_train)

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [13]:
knn = KNeighborsClassifier()
params = {
        'n_neighbors': np.arange(5,25,2),
        'weights': ['uniform','distance'],
        'algorithm': ['ball_tree','kd_tree','brute'],
}

In [33]:
from sklearn import svm, linear_model, datasets
clf = svm.SVC(probability=True)

gamma_range = np.logspace(-5, 2, 10)
C_range = np.logspace(-3, 2, 10)
kernel_range = ['linear']

param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='accuracy', verbose=1)
grid.fit(Xs, y_train)
print grid.best_params_
print grid.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'kernel': 'linear', 'C': 0.1668100537200059, 'gamma': 1.0000000000000001e-05}
0.82379349046


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  2.5min finished


In [37]:
from sklearn.metrics import confusion_matrix


pred = grid.predict(Xs)
print confusion_matrix(y_train, pred)
pred_probs = grid.predict_proba(Xs)

[[486  63]
 [ 91 251]]


In [256]:
gs = GridSearchCV(knn,param_grid=params,cv=kf)

In [268]:
gs.fit(Xs,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=2008, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 5,  7,  9, 11, 13, 15, 17, 19, 21, 23]), 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [269]:
gs.best_score_

0.81481481481481477

In [259]:
gs.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 13, 'weights': 'uniform'}

In [17]:
test_df.fillna(value =test_df["Fare"].median(), inplace = True)

In [38]:
X_test = ss.transform(test_df[features])

In [None]:
pred = lr.predict(X_test)
prob = lr.predict_proba(X_test)[:,1]

In [None]:
tp = np.sum((x == 1)&(pred == 1))
fp = np.sum((x == 0)&(pred == 1))
tn = np.sum((x == 0)&(pred == 0))
fn = np.sum((x == 1)&(pred == 0))
tp,fp,tn,fn

In [39]:
x = grid.predict(X_test)
test_df['Survived'] = x

In [40]:
test_df[['Survived']].to_csv('sub7-titanic.csv')

In [1]:
21*6

126

In [2]:
5**3+1

126