In [None]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

# Load the Titanic training set from the competition input directory
# (path is relative to the notebook's working directory) and preview the
# first rows.
data_train = pd.read_csv("../input/titanic/train.csv",engine = 'python')
data_train.head()

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values.
data_train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
data_train.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Overview figure: five panels on a 2x3 grid summarising the training data.
# Every panel gets its own explicit Axes so each plot call states where it draws.
fig = plt.figure()
fig.set(alpha=0.2)

# (0, 0) passenger counts by survival outcome
ax_survived = plt.subplot2grid((2, 3), (0, 0))
data_train.Survived.value_counts().plot(kind='bar', ax=ax_survived)
ax_survived.set_title("survival status")
ax_survived.set_ylabel("number of people")

# (0, 1) passenger counts by ticket class
ax_pclass = plt.subplot2grid((2, 3), (0, 1))
data_train.Pclass.value_counts().plot(kind='bar', ax=ax_pclass)
ax_pclass.set_title("class distribution")
ax_pclass.set_ylabel("number of people")

# (0, 2) age of each passenger against the survival flag
ax_age = plt.subplot2grid((2, 3), (0, 2))
ax_age.scatter(data_train.Survived, data_train.Age, s=1)
ax_age.set_ylabel("age")
# The on/off flag is passed positionally: its keyword was `b` in older
# Matplotlib releases and `visible` in newer ones, so neither spelling works
# on every version.
ax_age.grid(True, which='major', axis='y')
ax_age.set_title("age by survived")

# (1, 0)-(1, 1) age density per ticket class, one curve per class
ax_kde = plt.subplot2grid((2, 3), (1, 0), colspan=2)
for pclass in (1, 2, 3):
    data_train.Age[data_train.Pclass == pclass].plot(kind='kde', ax=ax_kde)
ax_kde.set_xlabel("age")
ax_kde.set_title("age by class")
# Legend entries follow the plotting order above (class 1, 2, 3).
ax_kde.legend(("first class", "second class", "third class"))

# (1, 2) passenger counts by port of embarkation
ax_embarked = plt.subplot2grid((2, 3), (1, 2))
data_train.Embarked.value_counts().plot(kind='bar', ax=ax_embarked)
ax_embarked.set_ylabel("number of people")
ax_embarked.set_title("number of ppl in each embarkation")
plt.show()


Around 300 people survived. Most passengers travelled in third class. Ages span a wide range for both survivors and non-survivors. Most second- and third-class passengers are around 20 years old, while first-class passengers are mostly around 40. Around 70% of passengers embarked at S; far fewer embarked at C and Q.

In [None]:
# Stacked bar chart: survivors vs. non-survivors within each passenger class.
# The Axes is created explicitly and handed to pandas so the bars are drawn on
# `fig`; DataFrame.plot() without `ax` opens a figure of its own and would
# leave this one empty.
fig, ax = plt.subplots()
fig.set(alpha=0.2)
survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
# The two count series are aligned on their shared index (the class values).
df = pd.DataFrame({'Unsurvived': survived_0, 'Survived': survived_1})
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("survival status by class")
ax.set_xlabel("passengers class")
ax.set_ylabel("number of people")
plt.show()

People in the first class have a greater chance to survive

In [None]:
# Stacked bar chart: survivors vs. non-survivors for each gender.
# The Axes is created explicitly and handed to pandas so the bars are drawn on
# `fig`; DataFrame.plot() without `ax` opens a figure of its own and would
# leave this one empty.
fig, ax = plt.subplots()
fig.set(alpha=0.2)
survived_1 = data_train.Sex[data_train.Survived == 1].value_counts()
survived_0 = data_train.Sex[data_train.Survived == 0].value_counts()
# The two count series are aligned on their shared index (the Sex values).
df = pd.DataFrame({'Unsurvived': survived_0, 'Survived': survived_1})
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("survival status by gender")
ax.set_xlabel("passengers gender")
ax.set_ylabel("number of people")
plt.show()

Female passengers have a greater chance of surviving.


In [None]:
# Four side-by-side bar charts of survival counts, split by gender and by
# whether the passenger travelled in class 1/2 ("high") or class 3 ("low").
fig = plt.figure()
fig.set(alpha=0.65)
# A figure-level title; plt.title() would only label the last subplot.
fig.suptitle("survival status by class and gender")

is_female = data_train.Sex == 'female'
is_male = data_train.Sex == 'male'
is_third_class = data_train.Pclass == 3

# (legend label, row mask, bar colour), in left-to-right panel order.
# Masks are combined with & / ~ on the full frame so that no boolean mask is
# ever applied to an already-filtered Series with a different index.
panels = [
    ("female/high cl", is_female & ~is_third_class, '#FA2479'),
    ("female/low cl", is_female & is_third_class, 'pink'),
    ("male/high cl", is_male & ~is_third_class, 'lightblue'),
    ("male/low cl", is_male & is_third_class, 'steelblue'),
]

axes = []
for position, (label, mask, color) in enumerate(panels, start=1):
    # Every panel after the first shares its y axis with the first so the
    # bar heights are directly comparable.
    ax = fig.add_subplot(1, 4, position, sharey=axes[0] if axes else None)
    # reindex guarantees exactly two bars in the order 0, 1, so the tick
    # labels below stay correct even if one outcome is absent from a group.
    counts = data_train.Survived[mask].value_counts().reindex([0, 1], fill_value=0)
    counts.plot(kind='bar', color=color, ax=ax)
    ax.set_xticklabels(["unsurvived", "survived"])
    ax.legend([label], loc="best")
    axes.append(ax)

# Keep the per-panel names available for any later cell.
ax1, ax2, ax3, ax4 = axes

plt.show()



In [None]:
# Stacked bar chart: survivors vs. non-survivors for each port of embarkation.
# The Axes is created explicitly and handed to pandas so the bars are drawn on
# `fig`; DataFrame.plot() without `ax` opens a figure of its own and would
# leave this one empty.
fig, ax = plt.subplots()
fig.set(alpha=0.65)

survival_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
survival_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
# The two count series are aligned on their shared index (the Embarked values).
df = pd.DataFrame({'Unsurvived': survival_0,'Survived': survival_1})
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_xlabel("type of embarkation")
ax.set_ylabel("number of people")
ax.set_title("survival status by embarkations")

plt.show()

In [None]:
# Passenger counts for every (SibSp, Survived) combination.
sib = data_train.groupby(['SibSp', 'Survived'])
df = sib['PassengerId'].count().to_frame()
print(df)

# Passenger counts for every (Parch, Survived) combination.
par = data_train.groupby(['Parch', 'Survived'])
df = par['PassengerId'].count().to_frame()
print(df)

There is no strong correlation between the number of siblings or parents/children aboard and survival.

Because the "Cabin" column has only 204 non-missing values, let's look at its distribution first.

In [None]:
# Frequency of each distinct Cabin value (missing values are not counted).
data_train.Cabin.value_counts()

In [None]:
# Stacked bar chart comparing survival counts of passengers with and without
# a recorded cabin.
# The Axes is created explicitly and handed to pandas so the bars are drawn on
# `fig`; DataFrame.plot() without `ax` opens a figure of its own and would
# leave this one empty.
fig, ax = plt.subplots()
fig.set(alpha=0.65)

null = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
not_null = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
# Transposed so the x axis is Cabin / NoCabin and the stacked segments are
# the survival outcomes.
df = pd.DataFrame({'Cabin': not_null, 'NoCabin': null}).transpose()
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_ylabel("number of people")
ax.set_title("survival status by having and not having cabin")
plt.show()

People having cabins seem to have a greater chance to survive

Because there are some missing values in the "Age" and "Cabin" columns, below I use a Random Forest regressor from scikit-learn to predict the missing ages, and recode "Cabin" as Yes/No according to whether a value is present.

In [None]:
from sklearn.ensemble import RandomForestRegressor

def set_missing_age(df, n_estimators=2000, random_state=0):
    """Fill missing ``Age`` values with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows whose age is known,
    using ``Pclass``, ``SibSp``, ``Parch`` and ``Fare`` (in that order) as
    features, and its predictions are written into the rows whose age is
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age``, ``Pclass``, ``SibSp``, ``Parch`` and
        ``Fare``. It is modified in place.
    n_estimators : int, default 2000
        Number of trees in the forest.
    random_state : int, default 0
        Seed passed to the regressor so the imputation is reproducible.

    Returns
    -------
    (pandas.DataFrame, RandomForestRegressor)
        The same ``df`` object and the fitted regressor, so the regressor can
        be reused on another frame with the same feature columns.
    """
    feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare']
    age_known = df['Age'].notnull()

    # Fit on plain arrays (no column names) so later predict() calls can also
    # pass plain arrays.
    rfr = RandomForestRegressor(n_jobs=-1, random_state=random_state, n_estimators=n_estimators)
    rfr.fit(df.loc[age_known, feature_cols].values, df.loc[age_known, 'Age'].values)

    # Only predict when something is actually missing: predict() raises on an
    # empty feature matrix, which is what happens when this function is
    # applied again to an already-imputed frame.
    age_missing = ~age_known
    if age_missing.any():
        df.loc[age_missing, 'Age'] = rfr.predict(df.loc[age_missing, feature_cols].values)
    return df, rfr

def set_missing_cabin(df):
    """Recode ``Cabin`` as "Yes" (a value is present) or "No" (missing).

    The frame is modified in place and also returned. The recoding is
    idempotent: a value that is already "No" stays "No", so applying the
    function twice (for example by re-running a notebook cell) gives the same
    result as applying it once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Compute the mask once, before any assignment changes the column.
    has_cabin = df.Cabin.notnull() & (df.Cabin != "No")
    df.loc[has_cabin, 'Cabin'] = "Yes"
    df.loc[~has_cabin, 'Cabin'] = "No"
    return df

# Impute missing ages and recode Cabin on the training set.
# Both helpers modify the frame passed to them in place, so `data_train`
# itself is changed from here on and earlier displayed outputs no longer
# reflect it. `rfr` is kept so the same fitted regressor can be reused later.
data_train, rfr = set_missing_age(data_train)
data_train = set_missing_cabin(data_train)

# Preview the imputed frame.
data_train.head(10)
    

Logistic regression requires numerical input features. Therefore, we convert the categorical features into indicator (dummy) columns with the pd.get_dummies() function.

In [None]:
# One indicator column per category for each categorical feature; the column
# prefix is the feature name (e.g. Cabin_Yes, Sex_male).
dummies_Cabin, dummies_Embarked, dummies_Pclass, dummies_Sex = (
    pd.get_dummies(data_train[column], prefix=column)
    for column in ('Cabin', 'Embarked', 'Pclass', 'Sex')
)

# Append the indicator columns, then drop the original categorical columns
# together with the free-text Name and Ticket columns.
encoded_parts = [data_train, dummies_Cabin, dummies_Embarked, dummies_Pclass, dummies_Sex]
df = pd.concat(encoded_parts, axis=1).drop(
    columns=['Name', 'Cabin', 'Embarked', 'Pclass', 'Sex', 'Ticket']
)
df.head(10)

We can see that the values of 'Age' and 'Fare' have a great variation. Therefore, we can do some scaling first by using preprocessing in scikit-learn.

In [None]:
import sklearn.preprocessing as preprocessing
# Standardise Age and Fare (zero mean, unit variance) on the training set.
# Each feature gets its OWN fitted scaler so its mean/variance stay available
# afterwards; re-fitting one shared scaler would overwrite the Age statistics
# with the Fare statistics.
scaler = preprocessing.StandardScaler()  # unfitted instance, kept for any cell that still uses this name

age_values = df['Age'].values.reshape(-1, 1)    # scalers expect a 2-D (n_samples, 1) array
fare_values = df['Fare'].values.reshape(-1, 1)

age_scale_parm = preprocessing.StandardScaler().fit(age_values)
df['scaled_Age'] = age_scale_parm.transform(age_values).ravel()

fare_scale_parm = preprocessing.StandardScaler().fit(fare_values)
df['scaled_Fare'] = fare_scale_parm.transform(fare_values).ravel()

df.head()

In [None]:
from sklearn import linear_model
# Keep the target plus the model features; the raw Age/Fare columns are left
# out because only their scaled_* versions match the pattern.
train_df = df.filter(regex='Survived|scaled_.*|SibSp|Parch|Cabin_.*|Embarked_.*|Pclass_.*|Sex_.*')
# Cast to float before taking the array: when the indicator columns are
# boolean, a mixed bool/int/float frame converts to an object array, which
# scikit-learn does not accept as classification labels.
train_np = train_df.astype(float).values
# Assumes Survived is the first column that matches the pattern above.
y = train_np[:, 0]
x = train_np[:, 1:]
# L1-penalised logistic regression; liblinear is a solver that supports L1.
clf = linear_model.LogisticRegression(solver='liblinear',C=1.0, penalty='l1', tol=1e-6)
clf.fit(x,y)

In [None]:
# Load the Titanic test set from the competition input directory and preview
# the first rows.
data_test = pd.read_csv("../input/titanic/test.csv",engine = 'python')
data_test.head()

In [None]:
# Column dtypes and non-null counts: shows which test columns contain missing values.
data_test.info()

In [None]:
# Apply to the test set the same preprocessing the training set received.

# Missing fares are filled with 0 so every row has a value for the age model
# and for scaling.
data_test.loc[data_test['Fare'].isnull(), 'Fare'] = 0

# Impute missing ages with the random forest fitted on the training set; the
# feature order must match the order used when `rfr` was fitted.
age_missing = data_test['Age'].isnull()
if age_missing.any():  # predict() raises on an empty feature matrix
    X = data_test.loc[age_missing, ['Pclass', 'SibSp', 'Parch', 'Fare']].values
    predicted_age = rfr.predict(X)
    data_test.loc[age_missing, 'Age'] = predicted_age

# Recode Cabin as Yes/No.
data_test = set_missing_cabin(data_test)

# Convert the categorical columns into indicator columns.
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)
df_test.drop(['Name','Cabin', 'Embarked', 'Pclass', 'Sex', 'Ticket'], axis=1, inplace=True)

# Standardise Age and Fare with statistics learned from the TRAINING columns
# and only transform the test columns. Re-fitting on the test set would put
# the test features on a different scale from the one the model was trained on.
# `data_train` already holds the imputed ages at this point.
train_age_scaler = preprocessing.StandardScaler().fit(data_train['Age'].values.reshape(-1, 1))
train_fare_scaler = preprocessing.StandardScaler().fit(data_train['Fare'].values.reshape(-1, 1))
df_test['scaled_Age'] = train_age_scaler.transform(df_test['Age'].values.reshape(-1, 1)).ravel()
df_test['scaled_Fare'] = train_fare_scaler.transform(df_test['Fare'].values.reshape(-1, 1)).ravel()
df_test.head()

In [None]:
# Select the model's feature columns (same pattern as for training, minus the
# target).
test_features = df_test.filter(regex='scaled_.*|SibSp|Parch|Cabin_.*|Embarked_.*|Pclass_.*|Sex_.*')
# Predict on a plain float array: the model was fitted on an array without
# column names, and the indicator columns may be boolean.
pred = clf.predict(test_features.astype(float).values)

# Submission file: one row per test passenger with the predicted label.
test = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': pred.astype(np.int32)})
test.to_csv("logistic_regression_predictions.csv", index=False)
# Read the file back as a quick check of what was written.
pd.read_csv("logistic_regression_predictions.csv").head()

In [None]:
# from sklearn import cross_validation
from sklearn.model_selection import cross_val_score, train_test_split
# 5-fold cross-validation of the logistic-regression model on the training
# features. NOTE: this rebinds `clf` to a new, unfitted estimator.
clf = linear_model.LogisticRegression(solver='liblinear',C=1.0, penalty='l1', tol=1e-6)
all_data = df.filter(regex='Survived|scaled_.*|SibSp|Parch|Cabin_.*|Embarked_.*|Pclass_.*|Sex_.*')
# Cast to float before taking the array: when the indicator columns are
# boolean, a mixed bool/int/float frame converts to an object array, which
# scikit-learn does not accept as classification labels.
all_values = all_data.astype(float).values
# Assumes Survived is the first column that matches the pattern above.
x = all_values[:, 1:]
y = all_values[:, 0]
print(cross_val_score(clf, x, y, cv=5))
