In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import csv as csv
%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv', header=0)
test_df = pd.read_csv('test.csv', header=0)

In [3]:
def get_titles(name):
    if ('.') in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

In [4]:
train_df['Titles'] = train_df.Name.map(lambda x: get_titles(x))

In [5]:
train_df['Titles'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Lady              1
Jonkheer          1
Don               1
Ms                1
Mme               1
Capt              1
the Countess      1
Sir               1
Name: Titles, dtype: int64

In [6]:
def abbr_titles(x):
    title = x['Titles']
    if title in ['Dr','Rev','Col','Major','Lady','Jonkheer','Don','Capt','the Countess','Sir','Dona']:
        return 'Rare'
    elif title in ['Ms','Mlle']:
        return 'Miss'
    elif title == 'Mme':
        return 'Mrs'
    else:
        return title

In [7]:
train_df['Titles'] = train_df.apply(abbr_titles, axis=1)

In [8]:
train_df['Titles'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Titles, dtype: int64

In [9]:
test_df['Titles'] = test_df['Name'].map(lambda x: get_titles(x))

In [10]:
test_df.Titles.value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Ms          1
Dr          1
Name: Titles, dtype: int64

In [11]:
test_df['Titles'] = test_df.apply(abbr_titles, axis=1)

In [12]:
test_df.Titles.value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
Rare        6
Name: Titles, dtype: int64

In [41]:
Ports = list(enumerate(np.unique(train_df.Embarked.dropna())))

In [42]:
Ports_dic = {name:i for i, name in Ports}

In [43]:
median_fare = np.zeros((3,3))

In [44]:
median_fare

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [21]:
median_age_train = train_df.Age.dropna().median()
median_age_test = test_df.Age.dropna().median()

In [25]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Titles         418 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [27]:
train_df.Age[train_df.Age.isnull()] = median_age_train
test_df.Age[test_df.Age.isnull()] = median_age_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [92]:
for i in range(0,3):
    for j in range(0,3):
        median_fare[i,j] = train_df[(train_df.Pclass == j+1) & (train_df.Embarked == Ports[i][1])]['Fare'].dropna().median()

In [93]:
median_fare

array([[ 78.2667,  24.    ,   7.8958],
       [ 90.    ,  12.35  ,   7.75  ],
       [ 52.    ,  13.5   ,   8.05  ]])

In [94]:
for i in range(0,3):
    for j in range(0,3):
        train_df.loc[(train_df.Pclass = j+1), 'Embarked'] = 

[ 78.2667  24.       7.8958]
[ 90.    12.35   7.75]
[ 52.    13.5    8.05]


In [96]:
train_df[train_df.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titles
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs


In [97]:
Ports

[(0, 'C'), (1, 'Q'), (2, 'S')]

78.2667

In [101]:
median_fare

array([[ 78.2667,  24.    ,   7.8958],
       [ 90.    ,  12.35  ,   7.75  ],
       [ 52.    ,  13.5   ,   8.05  ]])

In [169]:
fare_dic = {}
fare = list(np.unique(train_df.Embarked.dropna()))

for i in range(0,3):
    for j in range(0,3):
        fare_dic[median_fare[i, j]] = fare[i]
            
fare_dic

{7.75: 'Q',
 7.8958000000000004: 'C',
 8.0500000000000007: 'S',
 12.35: 'Q',
 13.5: 'S',
 24.0: 'C',
 52.0: 'S',
 78.2667: 'C',
 90.0: 'Q'}

In [168]:
median_fare

array([[ 78.2667,  24.    ,   7.8958],
       [ 90.    ,  12.35  ,   7.75  ],
       [ 52.    ,  13.5   ,   8.05  ]])

In [187]:
def find(myNumber):
    for i in range(0,3):
        return min(median_fare[i,:], key=lambda x: abs(x-myNumber))
        

for i in range(0,3):
    train_df.loc[(train_df.Pclass == i+1) & (train_df.Embarked.isnull()), 'Embarked'].apply(find).map(fare_dic)



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titles
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs


In [183]:
train_df[train_df.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titles
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs


In [159]:
train_df.Fare

array([ 78.2667,  24.    ,   7.8958])

In [178]:
series = pd.Series([x*1.2 for x in range(10)])   

min([1,2,3], key=lambda x: abs(x-series))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().