In [206]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [207]:
# Read the Titanic training and test CSV files, then preview the training frame.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
train_df.head()

In [208]:
# Read the gender_submission.csv file shipped with the competition data and preview it.
val_df = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
val_df.head()

In [209]:
# Remember how many rows come from the training file so the combined frame
# can be split back into train and test parts later.
train_len = train_df.shape[0]
# Stack train and test rows into one frame with a fresh 0..n-1 index.
dataset = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [210]:
dataset.isnull().sum()

In [211]:
train_df.info()

In [212]:
test_df.info()

# **Data Analysis**

## Sex

In [213]:
train_df['Sex'].isnull().sum()

In [214]:
# Survival outcomes of the training passengers, split by sex.
women = train_df.loc[train_df.Sex == 'female']["Survived"]
men = train_df.loc[train_df.Sex == 'male']["Survived"]
# Share of survivors in each group (a fraction of the group size).
surv_women = sum(women) / len(women)
surv_men = sum(men) / len(men)
print('Percentage of women who survived: ', surv_women, '\nPercentage of men who survived: ', surv_men)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
classes = ['men', 'women']
surv = [sum(men), sum(women)]
onboard = [len(men), len(women)]
# Totals are drawn first and survivor counts on top of them, so each bar
# shows the survivors as its lower part.
ax.bar(classes, onboard, color='r')
ax.bar(classes, surv, color='b')
ax.legend(labels=['total', 'survived'])
# Call show(); a bare `plt.show` only references the function.
plt.show()

In [215]:
dataset.head()

In [216]:
# Encode Sex as an integer code (male -> 0, female -> 1) and preview the frame.
sex_codes = {'male': 0, "female": 1}
dataset['Sex'] = dataset['Sex'].map(sex_codes)
dataset.head()

## Class


In [217]:
dataset.head()

In [218]:
dataset['Pclass'].isnull().sum()

In [219]:
# Survival outcomes of the training passengers, split by ticket class.
first = train_df.loc[train_df.Pclass == 1]["Survived"]
second = train_df.loc[train_df.Pclass == 2]["Survived"]
third = train_df.loc[train_df.Pclass == 3]["Survived"]
# Share of survivors within each class.
surv_first = sum(first) / len(first)
surv_sec = sum(second) / len(second)
surv_thi = sum(third) / len(third)

print('First: ', surv_first, '\nSecond: ', surv_sec, '\nThird: ', surv_thi)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
classes = ['first', 'second', 'third']
surv = [sum(first), sum(second), sum(third)]
onboard = [len(first), len(second), len(third)]
# Totals first, survivor counts drawn on top of them.
ax.bar(classes, onboard, color='r')
ax.bar(classes, surv, color='b')
ax.legend(labels=['total', 'survived'])
# Call show(); a bare `plt.show` only references the function.
plt.show()

## Fare

In [220]:
dataset['Fare'].isnull().sum()

In [221]:
# Fill missing Fare values with the median fare of the combined frame.
fare_median = dataset['Fare'].median()
dataset['Fare'] = dataset['Fare'].fillna(fare_median)

In [222]:
dataset['Fare'].isnull().sum()

In [223]:
fare = train_df[['Fare', 'Survived']]
# Fares of the passengers who survived (Survived == 1), so the second
# histogram matches its 'survived' legend entry.
fare_surv = fare[fare.Survived == 1]['Fare']

# Overlay the survivors' fare distribution on the distribution of all fares,
# using the same ten bins over 0..80 for both.
fig, ax = plt.subplots(1, 1)
ax.hist(fare['Fare'], bins=10, range=(0, 80))
ax.hist(fare_surv, bins=10, range=(0, 80))
ax.legend(labels=['total', 'survived'])
plt.show()

In [224]:
# Scatter plot of Fare against Pclass for the training set.
train_df.plot(kind='scatter', x='Pclass', y='Fare')
# Call show(); a bare `plt.show` only references the function.
plt.show()

## Family

In [225]:
dataset['SibSp'].isnull().sum(), dataset['Parch'].isnull().sum()

In [226]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
dataset['Fam_size'] = dataset['SibSp'].add(dataset['Parch']).add(1)

In [227]:
dataset.describe()

In [228]:
# Point plot of the mean Survived value per family size.
g = sns.catplot(
    data=dataset, x="Fam_size", y="Survived", kind='point'
).set_ylabels("Survival Probability")

## Embarked

In [229]:
dataset['Embarked'].isnull().sum()

In [230]:
# Count the rows per embarkation port (S, C, Q) and display the counts.
shape = dataset.pivot_table(
    index=['Embarked'],
    aggfunc='size',
)
shape

In [231]:
# Fill the missing Embarked values with 'S', treated here as the most frequent port.
most_frequent_port = 'S'
dataset['Embarked'] = dataset['Embarked'].fillna(value=most_frequent_port)

In [232]:
# Point plot of the mean Survived value per embarkation port.
g = sns.catplot(
    data=dataset, x="Embarked", y="Survived", kind='point'
).set_ylabels("Survival Probability")

In [233]:
dataset.head()

In [234]:
# Encode the embarkation port as an integer code across the whole combined frame.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
dataset['Embarked'] = dataset['Embarked'].map(embarked_codes)

In [235]:
dataset.head()


## Cabin

In [236]:
dataset['Cabin'].isnull().sum()

In [237]:
dataset['Cabin'].dropna().head(10)

In [238]:
# Mark passengers without a recorded cabin with the placeholder 'X'.
missing_cabin_marker = 'X'
dataset['Cabin'] = dataset['Cabin'].fillna(value=missing_cabin_marker)

In [239]:
# Keep only the first character of the cabin string as a new feature.
cabin_as_text = dataset['Cabin'].astype(str)
dataset['Cabin_letter'] = cabin_as_text.str.get(0)

In [240]:
dataset.head()

In [241]:
# Encode the cabin letter as an integer code.
# NOTE(review): the codes jump from 2 ('C') to 4 ('D'); 3 is unused - confirm this gap is intended.
cabin_letter_codes = {
    'A': 0, 'B': 1, 'C': 2, 'D': 4, 'E': 5,
    'F': 6, 'G': 7, 'T': 8, 'X': 9,
}
dataset['Cabin_letter'] = dataset['Cabin_letter'].map(cabin_letter_codes)

In [242]:
# Point plot of the mean Survived value per cabin-letter code.
g = sns.catplot(
    data=dataset, x="Cabin_letter", y="Survived", kind='point'
).set_ylabels("Survival Probability")

## Title

In [243]:
dataset['Name'].isnull().sum()

In [244]:
# Extract the title from each name: the text between the first comma and the
# following period, with surrounding whitespace removed.
dataset_title = [
    full_name.split(",", 2)[1].split(".", 1)[0].strip()
    for full_name in dataset["Name"]
]
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()

In [245]:
# Bar chart of how often each extracted title occurs.
g = sns.countplot(data=dataset, x='Title')

In [246]:
# Collapse the listed titles into a single 'Others' category.
rare_titles = [
    'Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don',
    'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
dataset['Title'] = dataset['Title'].replace(rare_titles, 'Others')

In [247]:
# Bar chart of the title counts after merging the rare titles.
g = sns.countplot(data=dataset, x='Title')

In [248]:
# Encode each title as an integer code and store the column as int.
title_codes = {
    "Master": 0, "Miss": 1, "Ms": 2, "Mme": 3,
    "Mlle": 4, "Mrs": 5, "Mr": 6, "Others": 7,
}
dataset["Title"] = dataset["Title"].map(title_codes).astype(int)

In [249]:
# Bar chart of the title counts after integer encoding.
g = sns.countplot(data=dataset, x='Title')

In [250]:
# Point plot of the mean Survived value per title code.
g = sns.catplot(
    data=dataset, x="Title", y="Survived", kind='point'
).set_ylabels("Survival Probability")

In [251]:
dataset.head()

## Age

In [252]:
dataset['Age'].isnull().sum()

In [253]:
age = train_df[['Age', 'Survived']]
# Ages of the passengers who survived (Survived == 1), so the second
# histogram matches its 'survived' legend entry.
age_surv = age[age.Survived == 1]['Age']

# Overlay the survivors' age distribution on the distribution of all ages,
# using the same ten bins over 0..80 for both.
fig, ax = plt.subplots(1, 1)
ax.hist(age['Age'], bins=10, range=(0, 80))
ax.hist(age_surv, bins=10, range=(0, 80))
ax.legend(labels=['total', 'survived'])
plt.show()

In [254]:
dataset.head()

In [255]:
# Filling missing values of Age
#
# Each missing Age is replaced by the median age of the rows that share the
# same Pclass, Title and Fam_size. When no row of that group has a known age,
# the median age of the whole frame is used instead.
#
# Both medians are computed once, before anything is filled, so the result
# does not depend on the order in which rows are visited. The column is
# assigned as a whole rather than through `dataset['Age'].iloc[i] = ...`,
# which is chained assignment and is not guaranteed to write to `dataset`.
overall_age_median = dataset["Age"].median()
group_age_median = (
    dataset.groupby(["Pclass", "Title", "Fam_size"])["Age"].transform("median")
)
dataset["Age"] = dataset["Age"].fillna(group_age_median).fillna(overall_age_median)

In [256]:
# Annotated correlation heatmap of Survived and the engineered features.
# The returned axes is kept under a descriptive name; binding it to `map`
# would shadow the built-in for the rest of the notebook.
heatmap_columns = ['Survived', 'Age', 'Pclass', 'Embarked', 'Sex', 'Title', 'Cabin_letter', 'Fare', 'Fam_size']
feature_corr_ax = sns.heatmap(dataset[heatmap_columns].corr(), annot=True)

In [257]:
# Annotated correlation heatmap over every numeric column of the combined frame.
# Non-numeric columns are excluded explicitly before calling corr(), since a
# correlation cannot be computed on string columns. The returned axes is kept
# under a descriptive name; binding it to `map` would shadow the built-in.
numeric_columns = dataset.select_dtypes(include='number')
full_corr_ax = sns.heatmap(numeric_columns.corr(), annot=True)

## Drop stuff

In [258]:
dataset.head()

In [259]:
# Remove the raw columns that have been replaced by engineered features or are not used.
columns_to_drop = ["Name", "SibSp", 'Parch', 'Ticket', 'Cabin']
dataset = dataset.drop(columns=columns_to_drop)

In [260]:
#dataset['Survived']=dataset['Survived'].astype(int)

In [261]:
dataset.head()

In [262]:
#dataset['Survived']=dataset['Survived'].astype(pd.Int64Dtype())

In [263]:
dataset.head()



# **Prediction Models**

## Train & Test Data handling

In [264]:
# Split the combined frame back into its training and test rows.
train = dataset[:train_len]
# drop() returns a new frame here; dropping in place on a slice of `dataset`
# triggers pandas' SettingWithCopyWarning.
test = dataset[train_len:].drop(columns=["Survived"])
Y_train = train['Survived']
# NOTE(review): 'PassengerId' looks like a row identifier rather than a
# passenger attribute - confirm it is meant to be a model input.
features = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Fam_size', 'Cabin_letter', 'Title']
X_train = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

In [265]:
Y_train.head()

In [266]:
Y_train.head()

In [267]:
X_test.head()

## Models
### Random Forest Classifier

In [268]:

# Hold out part of the training data for validation.
train_split_X, val_X, train_split_Y, val_Y = train_test_split(X_train, Y_train, random_state=0)

# Train (seeded so the reported accuracy is reproducible between runs)
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(train_split_X, train_split_Y)

# Predict on the held-out rows and report the accuracy.
pred_Y = model.predict(val_X)
score = metrics.accuracy_score(val_Y, pred_Y)
# `score` is a single number, so it is printed directly; calling .mean() on it
# fails when accuracy_score returns a plain Python float.
print("Avg accuracy: {}".format(score))

In [269]:
# Train on the full training set (seeded for reproducibility)
model = RandomForestClassifier(n_estimators=10, random_state=0)
model.fit(X_train, Y_train)

# Score the fitted forest on the same data it was trained on.
Y_pred_rcf = model.predict(X_train)
# Use the `metrics` module imported in the first cell; a bare `accuracy_score`
# is only imported by a later cell and is undefined here on a fresh kernel.
train_score_rfc = metrics.accuracy_score(Y_train, Y_pred_rcf)
print("score on train data: ", train_score_rfc)

#output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': Y_pred})
#output.to_csv('first_submission.csv', index=False)
#print("Your submission was successfully saved!")

### K-Fold Random Forest

In [270]:

# Train with K-fold cross-validation
k = 21
kf = KFold(n_splits=k)
model = RandomForestClassifier(n_estimators=100)
score = cross_val_score(estimator=model, X=X_train, y=Y_train, cv=kf)

# `score` holds one accuracy per fold; report their mean.
print("Avg accuracy: {}".format(score.mean()))

### Neural Net

In [299]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Split first, then fit the scaler inside this cell on the training part only.
# This keeps the cell runnable on a fresh kernel (no scaler from a later cell
# is needed) and keeps the validation rows out of the scaling statistics.
X_train_split, X_val, Y_train_split, Y_val = train_test_split(X_train, Y_train, random_state=0)
scaler = StandardScaler().fit(X_train_split)
X_train_split = scaler.transform(X_train_split)
X_val = scaler.transform(X_val)


seed = 42
clf = MLPClassifier(solver='adam',
                    alpha=0.0001,
                    activation='tanh',
                    learning_rate='adaptive',
                    hidden_layer_sizes=(50, 50),
                    random_state=seed,
                    max_iter=10000,
                    early_stopping=True,
                    n_iter_no_change=2000)
clf.fit(X_train_split, Y_train_split)

# Accuracy on the rows used for fitting and on the held-out validation rows.
pred_train_mlp = clf.predict(X_train_split)
pred_test_mlp = clf.predict(X_val)
train_score = accuracy_score(Y_train_split, pred_train_mlp)
print("score on train data: ", train_score)
test_score = accuracy_score(Y_val, pred_test_mlp)
print("score on val data: ", test_score)
print('number of iterations: ', clf.n_iter_)



In [272]:
# Per-class report for the MLP's predictions on the training split.
train_report = classification_report(y_true=Y_train_split, y_pred=pred_train_mlp)
print(train_report)

In [300]:
# Plot the loss values recorded in clf.loss_curve_.
plt.plot(clf.loss_curve_)
plt.xlabel('Iterations')
plt.ylabel('Cost')
# Call show(); a bare `plt.show` only references the function.
plt.show()

### Hyperparameter search with a grid

In [274]:

# Grid of MLP hyperparameters to try; every combination is evaluated with
# 5-fold cross-validation on the training split.
param_grid = {
    # Each entry is a tuple of hidden-layer widths. `(47,)` is written with a
    # trailing comma because `(47)` is just the integer 47, not a tuple.
    'hidden_layer_sizes': [(20, 20, 20), (25, 25), (47,), (10, 10, 10, 10)],
    'max_iter': [100, 150, 200, 300],
    'activation': ['tanh', 'relu'],
    'solver': ['lbfgs', 'adam'],
    'alpha': [0.0001, 0.0003],
    'learning_rate': ['constant', 'adaptive'],
}
grid = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5)
grid.fit(X_train_split, Y_train_split)

print(grid.best_params_)


In [275]:

# Predict the validation split with the fitted grid search and report the accuracy.
grid_predictions = grid.predict(X_val)
grid_accuracy = accuracy_score(Y_val, grid_predictions)

print('Accuracy: {:.2f}'.format(grid_accuracy))


### Try a voting classifier with MLPs

In [276]:
# Imported here so the cell does not depend on an import made in a later cell.
from sklearn.ensemble import VotingClassifier

seed = 7
# KFold and cross_val_score come from the imports in the first cell.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Three MLPs that are identical except for `alpha`, collected as
# (name, estimator) pairs for the voting classifier.
est_nn = [
    (
        'clf{}'.format(position),
        MLPClassifier(solver='adam',
                      alpha=alpha,
                      activation='relu',
                      learning_rate='adaptive',
                      hidden_layer_sizes=(47,),
                      random_state=None,
                      max_iter=2000,
                      early_stopping=True,
                      n_iter_no_change=200),
    )
    for position, alpha in enumerate((0.0005, 0.0003, 0.0001), start=1)
]
# Keep the individual estimators reachable under their original names.
clf1, clf2, clf3 = (estimator for _name, estimator in est_nn)

eclf = VotingClassifier(est_nn)

# Cross-validated accuracy of the ensemble on the training split.
results = cross_val_score(eclf, X_train_split, Y_train_split, cv=kfold)
print(results.mean())

eclf = eclf.fit(X_train_split, Y_train_split)

# Accuracy of the fitted ensemble on the training and validation splits.
pred_train_eclf = eclf.predict(X_train_split)
pred_val_eclf = eclf.predict(X_val)
train_score_eclf = accuracy_score(Y_train_split, pred_train_eclf)
print("score on train data: ", train_score_eclf)
test_score_eclf = accuracy_score(Y_val, pred_val_eclf)
print("score on val data: ", test_score_eclf)

### Output MLPs

In [287]:
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Standardise the features with statistics taken from the full training set.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
seed = 42
clf_end = MLPClassifier(solver='adam',
                        alpha=0.0001,
                        activation='tanh',
                        learning_rate='adaptive',
                        hidden_layer_sizes=(50,),
                        random_state=seed,
                        early_stopping=True,
                        max_iter=1000)
clf_end.fit(X_scaled, Y_train)

# The model was fitted on scaled features, so the test features must go
# through the same scaler before predicting.
pred_train_mlp_end = clf_end.predict(scaler.transform(X_test))
pred_train_mlp_end = pred_train_mlp_end.astype(int)
# Write the predictions in the PassengerId/Survived submission layout.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': pred_train_mlp_end})
output.to_csv('submission.csv', index=False)
print('submission completed')

**Keras**

### Voting ensemble (logistic regression, decision tree, SVM)

In [290]:
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import preprocessing

In [286]:


# Split first, then standardise with statistics from the training split only,
# so the models are trained and validated on scaled features and the
# validation rows do not influence the scaler.
X_train_split, X_val, Y_train_split, Y_val = train_test_split(X_train, Y_train, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_train_split)
X_train_split = scaler.transform(X_train_split)
X_val = scaler.transform(X_val)


seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Different models, collected as (name, estimator) pairs
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))
# Seeded so the tree, and therefore the ensemble, is reproducible.
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model2))
model3 = SVC(max_iter=1000)
estimators.append(('svm', model3))

# create the ensemble and report its cross-validated accuracy
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X_train_split, Y_train_split, cv=kfold)
print(results.mean())


ensemble = ensemble.fit(X_train_split, Y_train_split)

# Accuracy of the fitted ensemble on the training and validation splits.
pred_train_boost = ensemble.predict(X_train_split)
pred_val_boost = ensemble.predict(X_val)
train_score_eclf = accuracy_score(Y_train_split, pred_train_boost)
print("score on train data: ", train_score_eclf)
test_score_eclf = accuracy_score(Y_val, pred_val_boost)
print("score on val data: ", test_score_eclf)



### Output voting ensemble

In [289]:


# Standardise the features with statistics taken from the full training set.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)


seed = 7

# Different models, collected as (name, estimator) pairs
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))
# Seeded so the tree, and therefore the submission, is reproducible.
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model2))
model3 = SVC(max_iter=2000)
estimators.append(('svm', model3))

# create the ensemble and fit it on the scaled training data
boost_out = VotingClassifier(estimators)
boost_out = boost_out.fit(X_scaled, Y_train)

# The ensemble was fitted on scaled features, so the test features must go
# through the same scaler before predicting.
pred_out_boost = boost_out.predict(scaler.transform(X_test))
pred_out_boost = pred_out_boost.astype(int)
# Write the predictions in the PassengerId/Survived submission layout.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': pred_out_boost})
output.to_csv('submission.csv', index=False)
print('submission completed')
