In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

Loading the data, combining the train and test sets, and removing unnecessary columns

In [2]:
# Load both CSV splits; the 'Survived' label is taken from train.csv only.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the labels aside and remove them from the feature frame.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
targets = train['Survived']
train = train.drop(columns=['Survived'])

# Stack train on top of test so that every feature-engineering step sees both
# splits; the first len(targets) rows of `combined` are the training rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and
# ignore_index=True gives the same fresh 0..n-1 index that reset_index followed
# by dropping the 'index' column produced.
# PassengerId is only a row identifier, not an informative feature, so it is dropped.
combined = pd.concat([train, test], ignore_index=True).drop(columns=['PassengerId'])

In [3]:
# Sanity check: number of labels extracted from the training file.
targets.shape

(891,)

# Processing Name to Generate Titles

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.split.html
<br>
Documentation for `pandas.Series.str.split()` (linked above).
<br>
This method can be used to extract the titles from the Name column and to process that column.
<br>
*combined['Title'] = combined['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0]* - this one-liner did not work on the DataFrame built by combining the train and test sets with the `append` method, so the titles are extracted with plain Python string splitting instead.

In [4]:
# Distinct raw titles in the training names: the text between the comma that
# ends the surname and the first period that follows it.
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in train['Name']
}

In [5]:
# Show the distinct raw titles collected from the training names.
print(titles)

{'Miss', 'Don', 'Major', 'Mr', 'Dr', 'Ms', 'Sir', 'Mlle', 'the Countess', 'Col', 'Rev', 'Capt', 'Jonkheer', 'Mme', 'Mrs', 'Lady', 'Master'}


As we can see, there are a lot of titles; we will reduce them to six categories:
<br>
- Mr 
<br> 
- Miss
<br> 
- Mrs
<br> 
- Master
<br> 
- Officer
<br> 
- Royalty

In [6]:
# Raw title (as written in the Name column) -> aggregated category.
# Capt/Col/Major/Dr/Rev belong to "Officer", not "Royalty", so that this cell
# agrees with the category list above and with the definition in the next cell;
# otherwise the resulting features depend on which of the two cells ran last.
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles():
    """Add an aggregated 'Title' column to the global ``combined`` frame.

    The raw title is the text between the first comma of ``Name`` and the
    period that follows it (e.g. "Braund, Mr. Owen Harris" -> "Mr"). It is
    then mapped through ``Title_Dictionary``; raw titles without an entry in
    the dictionary become NaN.

    Returns:
        The global ``combined`` DataFrame, modified in place.
    """
    # we extract the title from each name
    combined['Title'] = combined['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())

    # we map each raw title to its aggregated category
    combined['Title'] = combined.Title.map(Title_Dictionary)
    return combined

In [7]:
# Lookup table: raw title found in a passenger name -> aggregated title group.
Title_Dictionary = {
    # officers
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # royalty
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "the Countess": "Royalty",
    "Lady": "Royalty",
    # everyday titles and their variants
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    "Mlle": "Miss",
    "Miss": "Miss",
    "Mr": "Mr",
    "Master": "Master",
}


def get_titles():
    """Derive the aggregated 'Title' column on the global ``combined`` frame.

    Each name is cut after its first comma and before the next period to get
    the raw title, which is then translated with ``Title_Dictionary``. Raw
    titles missing from the dictionary end up as NaN.

    Returns:
        The global ``combined`` DataFrame (modified in place).
    """
    def raw_title(full_name):
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0].strip()

    raw_titles = combined['Name'].map(raw_title)
    combined['Title'] = raw_titles.map(Title_Dictionary)
    return combined

In [8]:
# Add the aggregated 'Title' column to `combined`.
combined = get_titles()

In [9]:
# Rows whose raw title has no entry in Title_Dictionary (Series.map leaves them as NaN).
combined[combined['Title'].isnull()]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,


In [10]:
# Titles that were not found in Title_Dictionary are treated as 'Royalty'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on a temporary object and
# leaves `combined` unchanged when pandas Copy-on-Write is active.
combined['Title'] = combined['Title'].fillna('Royalty')

In [11]:
# Cross-tabulate the aggregated titles against Sex as a consistency check.
pd.crosstab(combined['Title'],combined['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,61
Miss,262,0
Mr,0,757
Mrs,200,0
Officer,1,22
Royalty,3,3


# Processing Age

In [12]:
# Count missing Age values among the first 891 rows (the training part of `combined`).
print(combined['Age'].iloc[:891].isnull().sum())

177


**So we have 177 null Age values in the train set, which have to be imputed.**

In [13]:
# Median age of the training rows for every (Sex, Pclass, Title) group; this
# table is the lookup used to impute missing ages.
# 'Age' is selected before aggregating: calling .median() on the whole grouped
# frame also tries to aggregate text columns (Name, Ticket, ...), which raises
# a TypeError in pandas >= 2.0.
mediantrain = (
    combined.iloc[:891]
    .groupby(['Sex', 'Pclass', 'Title'])['Age']
    .median()
    .reset_index()
)

In [14]:
# Preview the per-group median ages.
mediantrain.head(5)

Unnamed: 0,Sex,Pclass,Title,Age
0,female,1,Miss,30.0
1,female,1,Mrs,40.0
2,female,1,Officer,49.0
3,female,1,Royalty,40.5
4,female,2,Miss,24.0


This DataFrame will help us impute the missing age values.
<br>
How to iterate through rows in a dataframe?
<br>
https://www.geeksforgeeks.org/different-ways-to-iterate-over-rows-in-pandas-dataframe/

In [15]:
# Example of the (Sex, Title, Pclass) filter that fill_age in the next cell
# applies to `mediantrain`, shown here on `combined` for male / Royalty / 1st class.
combined[(combined['Sex']=='male')&(combined['Title']=='Royalty')&(combined['Pclass']==1)]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
30,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C,Royalty
599,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49.0,1,0,PC 17485,56.9292,A20,C,Royalty
822,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S,Royalty


In [16]:
def fill_age(row):
    """Return an imputed age for one passenger from the ``mediantrain`` table.

    The most specific group with a known median is used, trying in order:
    (Sex, Title, Pclass), then (Sex, Title), then Title alone, then every
    group. For the coarser levels the value is the median of the matching
    group medians, since ``mediantrain`` stores only one median per group.

    Previously only the exact group was looked up with ``.values[0]``, which
    raised IndexError for a combination absent from the training rows and
    returned NaN for a group whose training ages were all missing.

    Args:
        row: mapping-like passenger record with 'Sex', 'Title' and 'Pclass'.

    Returns:
        The imputed age, or NaN if ``mediantrain`` holds no known age at all.
    """
    same_sex = mediantrain['Sex'] == row['Sex']
    same_title = mediantrain['Title'] == row['Title']
    same_class = mediantrain['Pclass'] == row['Pclass']

    # Ordered from most to least specific.
    fallbacks = (
        same_sex & same_title & same_class,
        same_sex & same_title,
        same_title,
        mediantrain['Age'].notnull(),
    )
    for condition in fallbacks:
        ages = mediantrain.loc[condition, 'Age'].dropna()
        if len(ages) > 0:
            # For the exact group this is a single value, i.e. the group median itself.
            return ages.median()
    return float('nan')


def process_age():
    """Fill missing 'Age' values of the global ``combined`` frame.

    Rows with a known age keep it; rows whose age is NaN receive the value
    returned by ``fill_age`` for that row.

    Returns:
        The global ``combined`` DataFrame with the 'Age' column replaced.
    """
    global combined

    def age_or_imputed(passenger):
        current_age = passenger['Age']
        if np.isnan(current_age):
            return fill_age(passenger)
        return current_age

    combined['Age'] = combined.apply(age_or_imputed, axis=1)
    return combined

In [17]:
# Impute the missing ages in `combined`.
combined = process_age()

# Processing Fares

Here we simply fill the null values of Fare with the mean fare of the train set and, for this first iteration, leave the passengers with a fare of zero untouched. After testing our model for the first time, we will come back and check whether treating those values makes a difference.

In [18]:
# Replace missing fares with the mean fare of the training rows (first 891 rows).
# Assigned back rather than using fillna(inplace=True) on the selected column,
# which leaves `combined` unchanged when pandas Copy-on-Write is active.
combined['Fare'] = combined['Fare'].fillna(combined.iloc[:891]['Fare'].mean())

In [19]:
# Check the remaining null counts and dtypes per column.
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1309 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 112.6+ KB


# Processing Titles

We will use the pandas `get_dummies` method to one-hot encode the Title column, and drop the Name column.
<br> If you would rather use scikit-learn's one-hot encoder, here is a very useful link:
<br>http://www.insightsbot.com/blog/McTKK/python-one-hot-encoding-with-scikit-learn

In [20]:
def process_names():
    """One-hot encode 'Title' on the global ``combined`` frame and drop 'Name'.

    The indicator columns are named 'Title_<value>' and appended after the
    existing columns; 'Name' and the original 'Title' column are removed.

    Returns:
        The new global ``combined`` DataFrame.
    """
    global combined

    # indicator columns for every title group
    title_indicators = pd.get_dummies(combined['Title'], prefix='Title')

    # the raw name is no longer needed
    del combined['Name']

    combined = pd.concat([combined, title_indicators], axis=1)

    # the categorical column is now redundant with its indicators
    del combined['Title']

    return combined

In [21]:
# Replace 'Name'/'Title' with the Title_* indicator columns.
combined = process_names()

# Processing Embarked

In [22]:
def process_embarked():
    """One-hot encode 'Embarked' on the global ``combined`` frame.

    Missing ports are filled with 'S' (the most frequent value in the train
    set) before encoding. The indicator columns are named 'Embarked_<value>'
    and appended after the existing columns; 'Embarked' itself is removed.

    Returns:
        The new global ``combined`` DataFrame.
    """
    global combined
    # The filled column is kept in its own variable: calling
    # fillna(inplace=True) on `combined.Embarked` acts on a temporary object
    # and leaves the frame unchanged when pandas Copy-on-Write is active.
    embarked_filled = combined['Embarked'].fillna('S')
    # dummy encoding
    embarked_dummies = pd.get_dummies(embarked_filled, prefix='Embarked')
    combined = pd.concat([combined.drop(columns='Embarked'), embarked_dummies], axis=1)
    return combined

In [23]:
# Replace 'Embarked' with the Embarked_* indicator columns.
combined = process_embarked()

In [24]:
# Preview the frame after the title and embarkation encoding.
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,3,male,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1
1,1,female,38.0,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1
3,1,female,35.0,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1
4,3,male,35.0,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1


# Feature Engineering

- **Creating Age Band**

In [25]:
# Replace the numeric age by one of 9 quantile-based bands (intervals),
# computed over all rows of `combined`, then preview the result.
combined['Age'] = pd.qcut(x=combined.Age, q=9)
combined.Age.head()

0    (20.0, 24.0]
1    (32.0, 39.0]
2    (24.0, 26.0]
3    (32.0, 39.0]
4    (32.0, 39.0]
Name: Age, dtype: category
Categories (9, interval[float64]): [(0.169, 17.0] < (17.0, 20.0] < (20.0, 24.0] < (24.0, 26.0] ... (28.0, 32.0] < (32.0, 39.0] < (39.0, 47.0] < (47.0, 80.0]]

In [26]:
# Alternative that is not used here: one-hot encode the age bands with
# pd.get_dummies, concatenate the result to `combined` and drop 'Age'.

# Encode every age band as an integer label.
age_band_encoder = LabelEncoder()
combined['Age'] = age_band_encoder.fit_transform(combined['Age'])

- **Creating Fare Band**

In [27]:
# Preview the frame now that 'Age' holds integer band labels.
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,3,male,2,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1
1,1,female,6,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0
2,3,female,3,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1
3,1,female,6,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1
4,3,male,6,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1


In [28]:
# Number of passengers (train and test together) sharing each ticket.
# value_counts + map counts every ticket once, instead of re-scanning the whole
# frame for each row as the previous per-row lambda did (quadratic in rows).
combined['Ticket_count'] = combined['Ticket'].map(combined['Ticket'].value_counts())

# Fare divided by the number of passengers on the same ticket.
# The column name 'Fare_tickect' (sic) is kept because a later cell drops it by that name.
combined['Fare_tickect'] = combined['Fare'] / combined['Ticket_count']

# Replace 'Fare' by one of 12 quantile-based bands of the per-passenger fare.
combined['Fare'] = pd.qcut(combined['Fare_tickect'], 12)

In [29]:
# Encode every fare band (interval) as an integer label.
combined['Fare'] = LabelEncoder().fit_transform(combined['Fare'])

In [30]:
# The per-passenger fare was only needed to build the fare bands; remove it in place.
del combined['Fare_tickect']

In [31]:
# Preview the frame with the encoded 'Fare' bands and the new 'Ticket_count' column.
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S,Ticket_count
0,3,male,2,1,0,A/5 21171,2,,0,0,1,0,0,0,0,0,1,1
1,1,female,6,1,0,PC 17599,11,C85,0,0,0,1,0,0,1,0,0,2
2,3,female,3,0,0,STON/O2. 3101282,5,,0,1,0,0,0,0,0,0,1,1
3,1,female,6,1,0,113803,9,C123,0,0,0,1,0,0,0,0,1,2
4,3,male,6,0,0,373450,5,,0,0,1,0,0,0,0,0,1,1


# Processing Pclass And Sex

In [32]:
# Indicator columns Class_<n> for the passenger class ...
Pclass_bin = pd.get_dummies(combined.Pclass, prefix='Class')
# ... and for Sex; drop_first=True drops the first category's indicator column.
Sex_bin = pd.get_dummies(combined.Sex, prefix='Sex', drop_first=True)
# Append both sets of indicators after the existing columns.
combined = pd.concat((combined, Pclass_bin, Sex_bin), axis='columns')

In [33]:
# The original categorical columns are redundant with their indicator columns.
combined.drop(columns=['Sex', 'Pclass'], inplace=True)

# Processing Family

In [34]:
def process_family():
    """Add family-size features to the global ``combined`` frame.

    Adds 'FamilySize' (Parch + SibSp + 1, i.e. including the passenger) and
    three 0/1 indicators derived from it: 'Singleton' (size 1),
    'SmallFamily' (size 2 to 4) and 'LargeFamily' (size 5 or more).

    Returns:
        The global ``combined`` DataFrame, modified in place.
    """
    global combined

    family_size = combined['Parch'] + combined['SibSp'] + 1
    combined['FamilySize'] = family_size

    # 0/1 indicators for the three size classes
    combined['Singleton'] = (family_size == 1).astype('int64')
    combined['SmallFamily'] = family_size.between(2, 4).astype('int64')
    combined['LargeFamily'] = (family_size >= 5).astype('int64')

    return combined

In [35]:
# Add FamilySize and the Singleton / SmallFamily / LargeFamily indicators.
combined = process_family()

In [36]:
# Keep only the three family-size indicators; remove their inputs and the raw size.
combined.drop(columns=['SibSp', 'Parch', 'FamilySize'], inplace=True)

# Processing Cabin

In [37]:
# Preview the frame before the Cabin column is processed.
combined.head()

Unnamed: 0,Age,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,...,Embarked_Q,Embarked_S,Ticket_count,Class_1,Class_2,Class_3,Sex_male,Singleton,SmallFamily,LargeFamily
0,2,A/5 21171,2,,0,0,1,0,0,0,...,0,1,1,0,0,1,1,0,1,0
1,6,PC 17599,11,C85,0,0,0,1,0,0,...,0,0,2,1,0,0,0,0,1,0
2,3,STON/O2. 3101282,5,,0,1,0,0,0,0,...,0,1,1,0,0,1,0,1,0,0
3,6,113803,9,C123,0,0,0,1,0,0,...,0,1,2,1,0,0,0,0,1,0
4,6,373450,5,,0,0,1,0,0,0,...,0,1,1,0,0,1,1,1,0,0


In [38]:
# First letter of every cabin value seen in the training rows (first 891) and
# in the test rows, collected for inspection. 'U' stands for a missing cabin.
# (The 'Deck' feature built in a later cell uses 'M' for the same case.)
# An explicit string check replaces the former bare `except:`, which would
# also have hidden unrelated errors.
train_cabin = {
    cabin[0] if isinstance(cabin, str) and cabin else 'U'
    for cabin in combined.iloc[:891]['Cabin']
}
test_cabin = {
    cabin[0] if isinstance(cabin, str) and cabin else 'U'
    for cabin in combined.iloc[891:]['Cabin']
}

In [39]:
# Deck = first character of the cabin value; 'M' marks a missing cabin.
combined['Deck'] = combined['Cabin'].map(
    lambda cabin: 'M' if pd.isnull(cabin) else cabin[0]
)

In [40]:
# Merge the deck letters into three groups (ABC, DE, FG); any other value
# (such as 'M' or 'T') is left unchanged. Then show the size of each group.
combined['Deck'] = combined['Deck'].replace({
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
})

combined['Deck'].value_counts()

M      1014
ABC     181
DE       87
FG       26
T         1
Name: Deck, dtype: int64

In [41]:
# 'Cabin' has been summarised into 'Deck'; remove the raw column in place.
del combined['Cabin']

In [42]:
# Indicator columns Deck_<group>, appended after the existing columns.
cabin_dummies = pd.get_dummies(combined.Deck, prefix='Deck')
combined = pd.concat((combined, cabin_dummies), axis='columns')

In [43]:
# The categorical 'Deck' column is redundant with its indicator columns.
del combined['Deck']

In [44]:
# Show the cabin letters found in the training rows ('U' = missing cabin).
print(train_cabin)

{'B', 'E', 'G', 'D', 'C', 'T', 'U', 'F', 'A'}


In [45]:
# The raw ticket string is not used as a feature; remove it in place.
del combined['Ticket']

In [46]:
# Preview the final feature frame.
combined.head()

Unnamed: 0,Age,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,...,Class_3,Sex_male,Singleton,SmallFamily,LargeFamily,Deck_ABC,Deck_DE,Deck_FG,Deck_M,Deck_T
0,2,2,0,0,1,0,0,0,0,0,...,1,1,0,1,0,0,0,0,1,0
1,6,11,0,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,3,5,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0
3,6,9,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,6,5,0,0,1,0,0,0,0,0,...,1,1,1,0,0,0,0,0,1,0


# It's Modelling Time

In [47]:
# Split the engineered features back into the training and test parts.
# `combined` was built with the training rows first, so the split point is the
# number of labels; deriving it from `targets` avoids the hard-coded 891.
n_train = len(targets)
trainc = combined.iloc[:n_train]
testc = combined.iloc[n_train:]

https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/

In [48]:
def compute_score(clf, X, y, scoring='accuracy', cv=5):
    """Return the mean cross-validated score of an estimator.

    Args:
        clf: estimator passed to ``cross_val_score``.
        X: feature matrix.
        y: target values.
        scoring: scoring name forwarded to ``cross_val_score``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        The mean of the per-fold scores.
    """
    xval = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    return np.mean(xval)

In [49]:
# Candidate classifiers, all with their default settings.
logreg, logreg_cv, rf, gboost = (
    LogisticRegression(),
    LogisticRegressionCV(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
)

models = [logreg, logreg_cv, rf, gboost]

In [50]:
# Candidate classifiers to compare by cross-validation.
logreg = LogisticRegression()
logreg_cv = LogisticRegressionCV()
# The tree ensembles are randomised; a fixed random_state makes their
# cross-validation scores reproducible from one run to the next.
rf = RandomForestClassifier(random_state=42)
gboost = GradientBoostingClassifier(random_state=42)

models = [logreg, logreg_cv, rf, gboost]

In [51]:
# Report the mean 5-fold accuracy of every candidate model.
for model in models:
    print('Cross-validation of : {0}'.format(type(model)))
    score = compute_score(model, trainc, targets, scoring='accuracy')
    print('CV score = {0}'.format(score), '****', sep='\n')

Cross-validation of : <class 'sklearn.linear_model.logistic.LogisticRegression'>
CV score = 0.827202976678735
****
Cross-validation of : <class 'sklearn.linear_model.logistic.LogisticRegressionCV'>
CV score = 0.8249557147401558
****
Cross-validation of : <class 'sklearn.ensemble.forest.RandomForestClassifier'>
CV score = 0.8070092882902149
****
Cross-validation of : <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
CV score = 0.8249304646060146
****


In [55]:
# Set to True to run the exhaustive grid search; False fits a random forest
# with the hard-coded hyper-parameters below.
run_gs = False

if run_gs:
    # 'auto' was dropped from max_features: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated work) and scikit-learn 1.3 removed it.
    parameter_grid = {
                 'max_depth' : [4, 6, 8],
                 'n_estimators': [50, 10],
                 'max_features': ['sqrt', 'log2'],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],
                 }
    # Fixed seed so that the candidates are compared on equal terms and the
    # search result is reproducible.
    forest = RandomForestClassifier(random_state=42)
    cross_validation = StratifiedKFold(n_splits=5)

    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               verbose=1
                              )

    grid_search.fit(trainc, targets)
    # The fitted search object is used as the model for prediction.
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    # Fixed seed so that the fitted forest, and therefore the submission, is reproducible.
    model = RandomForestClassifier(random_state=42, **parameters)
    model.fit(trainc, targets)

In [56]:
# Predict the test rows and write PassengerId / Survived to submission.csv.
output = model.predict(testc).astype(int)

# PassengerId was removed from `combined`, so it is read again from test.csv.
aux = pd.read_csv('test.csv')
df_output = pd.DataFrame({'PassengerId': aux['PassengerId'], 'Survived': output})
df_output.to_csv('submission.csv', columns=['PassengerId', 'Survived'], index=False)