In [None]:
import os
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [None]:
# Read the training and test CSVs from the relative ../input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
#train.shape
#test.shape
# Preview the first rows, transposed so that every column is shown as a row.
train.head().T

In [None]:
#train.columns
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Name and Ticket are removed from both frames before any feature engineering.
drop_features = ['Name', 'Ticket']
train, test = (frame.drop(drop_features, axis=1) for frame in (train, test))


In [None]:
# Display the column index of the training frame at this point in the notebook.
train.columns

In [None]:
# Count of missing values per column.
# NOTE(review): this is not the last expression of the cell, so its result is
# computed but not rendered by the notebook.
train.isnull().sum()
# Frequency of each Embarked value (this is the cell's displayed output).
train.Embarked.value_counts()

In [None]:
# Only 2 Embarked values are missing, so fill them with 'S', the most frequently occurring value.
train["Embarked"] = train["Embarked"].fillna('S')
# Plot Survived against Embarked.
# NOTE(review): seaborn 0.9 renamed factorplot to catplot, and `figsize` is not
# one of its documented sizing arguments - confirm against the installed version.
sns.factorplot("Embarked", 'Survived', data = train, figsize = (10,10));

In [None]:
# Three side-by-side panels describing the port of embarkation.
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: how many passengers boarded at each port.
sns.countplot(x='Embarked', data=train, ax=axis1)
# Panel 2: survived / not-survived counts split by port.
sns.countplot(x='Survived', hue='Embarked', data=train, ax=axis2);

# Raw counts favour 'S' simply because most passengers boarded there, so
# panel 3 shows the survival *rate* per port for a fairer comparison.
emb_perc = (
    train[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
)
sns.barplot(x='Embarked', y='Survived', data=emb_perc, order=['S', 'C', 'Q'], ax=axis3);

# One-hot encode Embarked for both frames; the 'S' indicator is dropped.
embark_dummies_train = pd.get_dummies(train['Embarked']).drop(['S'], axis=1)
embark_dummies_test = pd.get_dummies(test['Embarked']).drop(['S'], axis=1)

# Attach the indicator columns and remove the original categorical column.
train = train.join(embark_dummies_train).drop(['Embarked'], axis=1)
test = test.join(embark_dummies_test).drop(['Embarked'], axis=1)

More than 50% of the passengers who embarked at 'C' survived!
We can draw a hypothesis from this: it is likely that more upper-fare passengers boarded the Titanic at 'C', and likely that 'S' had more lower-fare passengers.  

Moving on, let's look at Fare and then decide whether we should keep "Embarked" or not.

In [None]:
# Fill missing test fares with the median test fare. The filled Series is
# assigned back to the column instead of calling fillna(inplace=True) on the
# `test.Fare` accessor: the in-place form acts on an intermediate Series and is
# not guaranteed to write through to `test`.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Distribution of training fares, with the x axis limited to the 0-100 range.
train['Fare'].plot(kind='hist', figsize=(15, 3), bins=100, xlim=(0, 100))

# Split the training fares by outcome.
fare_survived = train['Fare'][train['Survived'] == 1]
fare_not_survived = train['Fare'][train['Survived'] == 0]

# Row 0 = did not survive, row 1 = survived, so the positional index matches
# the Survived encoding.
avg_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])

# Mean fare per outcome with the standard deviation drawn as error bars.
avg_fare.index.names = std_fare.index.names = ["Survived"]
avg_fare.plot(yerr=std_fare, kind='bar', legend=False)

Age is an important factor in predicting survival, but a lot of Age values are missing from the training data.

In [None]:
# Impute missing ages (177 in train and 86 in test according to the original
# notes). Every gap is filled with a random integer drawn uniformly from
# [mean - std, mean + std) of the observed ages of the same frame.

# A dedicated, seeded generator makes the imputation reproducible on
# Restart & Run All without touching NumPy's global random state.
AGE_IMPUTE_SEED = 0
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

# Compute the masks once, before anything is filled in.
train_age_missing = train["Age"].isnull()
test_age_missing = test["Age"].isnull()
train_age_nancount = train_age_missing.sum()
test_age_nancount = test_age_missing.sum()

avg_age_titanic = train["Age"].mean()
std_age_titanic = train["Age"].std()

avg_age_test = test["Age"].mean()
std_age_test = test["Age"].std()

# randint expects integer bounds, so the float bounds are truncated explicitly.
random_age = age_rng.randint(
    int(avg_age_titanic - std_age_titanic),
    int(avg_age_titanic + std_age_titanic),
    size=train_age_nancount,
)
random_age_test = age_rng.randint(
    int(avg_age_test - std_age_test),
    int(avg_age_test + std_age_test),
    size=test_age_nancount,
)

# Compare the age distribution before and after imputation.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: observed ages only (must be drawn before the gaps are filled).
train['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Use .loc with the boolean mask; chained indexing (df['Age'][mask] = ...) may
# assign into a temporary copy instead of the frame itself.
train.loc[train_age_missing, "Age"] = random_age
train["Age"] = train["Age"].astype(int)

test.loc[test_age_missing, "Age"] = random_age_test
test["Age"] = test["Age"].astype(int)

# Right panel: ages after imputation.
train["Age"].hist(bins=70, ax=axis2)

# set_title is a method: it has to be called. Assigning to it only replaces the
# attribute and leaves both panels untitled.
axis1.set_title("original age values")
axis2.set_title("age values after imputing");

In [None]:
# Kernel density estimate of Age, one shaded curve per Survived value, on a wide (aspect=5) grid.
facet = sns.FacetGrid(train, hue ='Survived', aspect = 5)
facet.map(sns.kdeplot, 'Age', shade = True)
# Limit the x axis to the range from 0 to the largest age in the training data.
facet.set(xlim = (0, train['Age'].max()))
facet.add_legend()

In [None]:
# Mean of Survived for every distinct Age value, drawn as one bar per age.
avg_age = (
    train[["Age", "Survived"]]
    .groupby(['Age'], as_index=False)
    .mean()
)
fig, axis = plt.subplots(1, 1, figsize=(20, 8))
sns.barplot(x='Age', y='Survived', data=avg_age, ax=axis);

In [None]:
# Print the current columns, then display the distinct Cabin values.
print(train.columns)
train.Cabin.unique()

Cabin also has a lot of missing values, and it is correlated with Fare, so we can skip this feature.

In [None]:
# Remove the Cabin column from both frames, in place.
for frame in (train, test):
    frame.drop("Cabin", axis=1, inplace=True)

In [None]:
# Passenger class: plot the survival rate per Pclass (3rd class has the worst rate).
sns.factorplot("Pclass", "Survived", data=train);

# One-hot encode Pclass in both frames. The indicator columns are relabelled
# Class_1..Class_3 and Class_3, the class with the lowest average survival,
# is dropped.
class_labels = ['Class_1', 'Class_2', 'Class_3']

pclass_dummies_titanic = pd.get_dummies(train['Pclass'])
pclass_dummies_titanic.columns = class_labels
pclass_dummies_titanic = pclass_dummies_titanic.drop(['Class_3'], axis=1)

pclass_dummies_test = pd.get_dummies(test['Pclass'])
pclass_dummies_test.columns = class_labels
pclass_dummies_test = pclass_dummies_test.drop(['Class_3'], axis=1)

# Swap the original Pclass column for its indicator columns.
train = train.drop(['Pclass'], axis=1).join(pclass_dummies_titanic)
test = test.drop(['Pclass'], axis=1).join(pclass_dummies_test)

In [None]:
# Combine SibSp and Parch into one feature: the number of relatives aboard
# (Familysize), then bucket it into a label (Fsize):
#   'single' -> 0 relatives
#   'small'  -> 1 to 4 relatives
#   'large'  -> 5 or more relatives
#
# The buckets cover every value. Previously Familysize == 1 matched no rule
# and was left as NaN, and the 'small' mask for `test` mixed in a condition on
# `train`, whose index does not line up with the test rows.
for frame in (train, test):
    frame['Familysize'] = frame['Parch'] + frame['SibSp']
    relatives = frame['Familysize']
    frame.loc[relatives == 0, 'Fsize'] = 'single'
    frame.loc[(relatives >= 1) & (relatives <= 4), 'Fsize'] = 'small'
    frame.loc[relatives > 4, 'Fsize'] = 'large'

# Survival rate for each family-size bucket.
sns.factorplot(x='Fsize', y="Survived", data=train)


In [None]:
# One-hot encode Fsize. Large families have the lowest survival percentage, so
# only the 'single' and 'small' indicators are kept.
#
# get_dummies already names each column after its category and orders them
# alphabetically ('large', 'single', 'small'). Relabelling them positionally as
# ['single', 'small', 'large'] would attach the wrong name to every column, so
# the wanted columns are selected by name instead. reindex also guarantees that
# both frames end up with the same two columns even if a category is absent
# from one of them.
kept_family_levels = ['single', 'small']

family_dummies_train = pd.get_dummies(train['Fsize']).reindex(
    columns=kept_family_levels, fill_value=0
)
family_dummies_test = pd.get_dummies(test['Fsize']).reindex(
    columns=kept_family_levels, fill_value=0
)

# Replace the label column with its indicator columns.
train = train.drop('Fsize', axis=1).join(family_dummies_train)
test = test.drop('Fsize', axis=1).join(family_dummies_test)

In [None]:
# Display the column index of the training frame at this point in the notebook.
train.columns

In [None]:
# Remove Parch, SibSp and Familysize from both frames, in place.
for frame in (train, test):
    for column in ('Parch', 'SibSp', 'Familysize'):
        frame.drop(column, axis=1, inplace=True)

In [None]:
# Display the column index of the training frame at this point in the notebook.
train.columns

In [None]:
#Sex plays a very important role here, 
#we saw from the graph that children 15 and below had higher chances of survival, so lets get them into another category
def sort_person(person):
    """Classify a passenger as 'child' or by their sex.

    `person` is unpacked as an (age, sex) pair. Returns 'child' when the age
    is below 16, otherwise the sex value unchanged.
    """
    age, sex = person
    if age < 16:
        return 'child'
    return sex
# Label every passenger as 'child', 'female' or 'male'. Each frame is labelled
# from its OWN Age and Sex columns; deriving the test labels from `train` would
# give every test row the label of an unrelated training passenger.
train['Person'] = train[['Age', 'Sex']].apply(sort_person, axis=1)
test['Person'] = test[['Age', 'Sex']].apply(sort_person, axis=1)

# Sex is now represented by Person.
train.drop(['Sex'], axis=1, inplace=True)
test.drop(['Sex'], axis=1, inplace=True)

# One-hot encode Person and drop the 'Male' indicator. Columns are renamed by
# category name rather than by position, so the labels stay correct whatever
# order, or subset, of categories get_dummies returns.
person_column_names = {'child': 'Child', 'female': 'Female', 'male': 'Male'}

person_dummies_train = (
    pd.get_dummies(train['Person'])
    .rename(columns=person_column_names)
    .drop(['Male'], axis=1)
)
person_dummies_test = (
    pd.get_dummies(test['Person'])
    .rename(columns=person_column_names)
    .drop(['Male'], axis=1)
)

train = train.join(person_dummies_train)
test = test.join(person_dummies_test)

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: number of passengers per Person category.
sns.countplot(x='Person', data=train, ax=axis1)

# Right panel: average of Survived for each Person (male, female or child).
person_perc = train[["Person", "Survived"]].groupby(['Person'], as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male', 'female', 'child'])

# Person was only needed for the indicators and the plots.
train.drop(['Person'], axis=1, inplace=True)
test.drop(['Person'], axis=1, inplace=True)

In [None]:
# Display the column index of the test frame at this point in the notebook.
test.columns

Another hypothesis holds that rich people in the early 20th century had long names, and hence a higher class and a better chance of survival. But since we are already considering Fare and Pclass, it may be safe to drop Name too.

Now that we have created the data we want, let's build some models and see how they perform.


In [None]:
# Display the column index of the training frame at this point in the notebook.
train.columns

In [None]:
# Columns fed to the models.
feature_cols = ['Age', 'Fare', 'C', 'Q', 'Class_1', 'Class_2', 'single', 'small', 'Child', 'Female']

# Full labelled data, used for cross-validation.
X_for_training = train[feature_cols]
y_for_training = train['Survived']

# Features of the unlabelled test passengers. They get their own name because
# train_test_split below binds X_test to the hold-out part of the training
# data; reusing X_test here would silently discard these rows.
X_submission = test[feature_cols]

# 70/30 split of the labelled data with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_for_training, y_for_training, test_size=0.3, random_state=0
)


In [None]:
# Candidate classifiers as (short name, estimator with default settings) pairs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('LSVC', LinearSVC()),
    ('RFC', RandomForestClassifier()),
    ('ADB', AdaBoostClassifier()),
]

In [None]:
# Disabled model-comparison loop. The code below sits inside a triple-quoted
# string, so none of it is executed: the cell only evaluates to that string.
# Names assigned inside it (e.g. `mod`, `final`) are therefore never defined.
# As written, the loop would fit each entry of `models` on X_train, score it on
# X_test with accuracy_score, and run 10-fold cross-validation on the full
# labelled data.
'''
results = []
names = []
final =[]
#print (models)
for name, model in models:
    mod = model
    mod.fit(X_train,y_train)
    predictions = mod.predict(X_test)
    zing = accuracy_score(y_test, predictions)
    cv_results = cross_val_score(mod, X_for_training, y_for_training, cv= 10)
    final.append((name, zing))

    msg = "%s: = 'mean: '%f, 'std:' %f, 'acc:' %f " % (name, cv_results.mean(), cv_results.std(), zing)
    print(msg)
    
    '''

In [None]:
# Fit a logistic regression on the training split and score it on the hold-out split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# 10-fold cross-validation on the full labelled data. The estimator passed in
# is `lr`: the name `mod` is never defined at runtime, so using it here raises
# NameError on a fresh kernel.
cv_results = cross_val_score(lr, X_for_training, y_for_training, cv=10)

# Hold-out accuracy.
zing = accuracy_score(y_test, predictions)


In [None]:
# Predict survival for the unlabelled test passengers with the fitted logistic
# regression. The features are selected from `test` directly because X_test
# holds the hold-out split of the training data, not these rows.
test_predictions = lr.predict(test[feature_cols])

# One row per test passenger. "Survived" must be the per-passenger prediction,
# not `zing`, which is a single accuracy score.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": test_predictions
    })
submission.to_csv('titanic.csv', index=False)