In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

def corr_heatmap( df, cols=None, figsize=(14,7)):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose correlations are plotted.
        cols: Optional column selection forwarded to ``df.loc[:, cols]``.
            When ``None`` (the default) every column of ``df`` is used.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Identity check instead of `cols != None`: the equality form is evaluated
    # element-wise for array-like selections and cannot be used as a boolean.
    selected = df if cols is None else df.loc[:, cols]
    sns.heatmap(selected.corr(), annot=True, ax=ax)
    plt.show()
    
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# Load the labelled training split and the unlabelled submission split.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Report the first and last PassengerId of each file.
train_first_id, train_last_id = train.PassengerId[0], train.PassengerId.values[-1]
print('train.csv have PassengerIds between {} and {}'.format(train_first_id, train_last_id))

test_first_id, test_last_id = test.PassengerId[0], test.PassengerId.values[-1]
print('test.csv (for submission) have PassengerIds between {} and {}'.format(test_first_id, test_last_id))

# Keep an independent copy of the label; the column itself stays in `train`.
y_data = train['Survived'].copy()

# Stack train on top of test and renumber the rows 0..n-1 so both splits go
# through exactly the same preprocessing.
x_data = pd.concat([train, test], axis=0).reset_index(drop=True)

print('\nThe data that we\'ll use for train is a merge between train/test. So the shape of our x_data is {}.'.format(x_data.shape))

### ***Data Analysis and Treatment:***

In [None]:
# Summarise the merged frame: column dtypes and non-null counts.
x_data.info()

In [None]:
# PassengerId is dropped from the feature frame.
x_data = x_data.drop(['PassengerId'], axis=1)

# Pclass encodes the ticket class, so it is cast to object in order to be
# treated as a categorical feature (one-hot encoded later by pd.get_dummies).
x_data.Pclass = x_data.Pclass.astype(object)

In [None]:
# Number of missing values per column, keyed by column name.
nulls = x_data.isna().sum().to_dict()

# Print "<count> <dtype> <column>" for every column that has at least one gap.
for col, n_missing in nulls.items():
    if n_missing <= 0:
        continue
    print(f"{n_missing}\t{x_data[col].dtype}\t\t{col}")

In [None]:
# Missing 'Age' and 'Fare' values are replaced by the column mean, computed
# over the merged train+test frame.
x_data['Age'] = x_data['Age'].fillna(x_data['Age'].mean())
x_data['Fare'] = x_data['Fare'].fillna(x_data['Fare'].mean())

# Missing 'Embarked' values get their own explicit 'UNK' category (this is a
# placeholder label, not the mode of the column).
# The result is assigned back rather than using fillna(inplace=True) on the
# column accessor: that is chained assignment, which raises a warning on recent
# pandas and leaves the frame untouched when Copy-on-Write is enabled.
x_data['Embarked'] = x_data['Embarked'].fillna('UNK')

# Missing 'Cabin' values are labelled with the placeholder '_'.
x_data['Cabin'] = x_data['Cabin'].fillna('_')

# Show the remaining null count of every column.
x_data.isna().sum().to_dict()

### ***Feature Engineering:***

In [None]:
# Fare divided by the party size (the passenger plus SibSp and Parch), then
# passed through log1p, i.e. log(1 + value).
x_data['Individual_Fare'] = np.log1p(x_data.Fare/(1+ x_data.SibSp + x_data.Parch ))

In [None]:
from collections import Counter
import statistics as stats

def clean_textname(field:pd.Series):
    """Remove punctuation marks and honorifics from every entry of ``field``.

    Each entry is converted with ``str`` and the fragments below are deleted
    one after another, in the listed order, wherever they occur as substrings.

    Args:
        field: Series holding the raw name values.

    Returns:
        A Series of cleaned strings with the same index as ``field``.
    """
    fragments = [',','.',',','(',')','-','"','Miss','Mrs','Mr','Master']

    def _strip_fragments(value):
        # The order matters: 'Mrs' has to be removed before its prefix 'Mr'.
        text = str(value)
        for fragment in fragments:
            text = text.replace(fragment, '')
        return text

    return field.apply(_strip_fragments)
    
def calc_probabilities(dataframe: pd.DataFrame, survived:int):
    """Score every name token found among passengers with the given outcome.

    The names of all rows whose ``Survived`` equals ``survived`` are split on
    single spaces. Each distinct token receives its occurrence count divided
    by the number of distinct tokens. The score is positive for
    ``survived == 1`` and negated for any other value.

    Args:
        dataframe: Frame with a ``Name`` column of strings and a ``Survived``
            column.
        survived: Outcome value used to select the rows.

    Returns:
        A one-column DataFrame indexed by token; the column is named
        ``str(survived)``.
    """
    selected_names = dataframe.loc[dataframe.Survived == survived, 'Name']

    # Drop *every* empty token. Cleaning leaves runs of spaces behind, and the
    # previous `list.remove('')` only removed the first empty string, so ''
    # was counted as a name token and inflated the denominator below.
    counts = Counter(
        token
        for item in selected_names.values
        for token in item.split(' ')
        if token
    )

    column = str(survived)
    probs = pd.DataFrame({column: pd.Series(counts, dtype='float64')})
    n_tokens = len(counts)  # number of distinct tokens

    sign = 1 if survived == 1 else -1
    return sign * probs / n_tokens
 

# Strip punctuation and honorifics so each name becomes a list of plain tokens.
x_data.Name = clean_textname(x_data.Name)

# Per-token score: the (negative) non-survivor share plus the (positive)
# survivor share, scaled by 100. Tokens missing from one side count as 0.
name_probability = pd.concat([calc_probabilities(x_data,0),calc_probabilities(x_data,1)], axis=1).fillna(0)
name_probability['score'] = 100 * name_probability.sum(axis=1)
name_probability = name_probability.score.to_dict()


def _name_scores(name, table):
    """Return ``[max, mean, sum]`` of the token scores of one cleaned name.

    Tokens absent from ``table`` (they never occur in a labelled row) score 0.
    Previously a single unseen token raised KeyError inside a bare ``except``
    and silently zeroed all three scores of that passenger.
    Empty tokens are ignored for max and sum and count as 0 in the mean.
    A name without any non-empty token scores ``[0, 0, 0]``.
    """
    tokens = name.split(' ')
    scored = [table.get(token, 0) for token in tokens if token]
    if not scored:
        return [0, 0, 0]
    padded = [table.get(token, 0) if token else 0 for token in tokens]
    return [max(scored), stats.mean(padded), sum(scored)]


name_score = [_name_scores(name, name_probability) for name in x_data.Name]

new_columns = pd.DataFrame( name_score , columns=['Name_Score_Max','Name_Score_Mean','Name_Score_Sum'])
x_data = pd.concat([x_data, new_columns], axis=1)

# The raw text columns are removed once the name scores have been derived.
x_data = x_data.drop(['Name', 'Ticket','Cabin'],axis=1)

In [None]:
# Correlation heatmap over all features after one-hot encoding the
# categorical columns.
corr_heatmap( pd.get_dummies(x_data) ,figsize=(20,10))

In [None]:
# Histograms of the columns of x_data, used to decide which scaler suits each feature.
x_data.hist(figsize=(20,10))
plt.show()

In [None]:
# MinMaxScaler:   SibSp, Parch, Fare
# StandardScaler: Age, Individual_Fare, Name_Score_Max, Name_Score_Mean, Name_Score_Sum

### ***Split, Encoding and Feature Scaling:***

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# One-hot encode the categorical columns; numeric columns pass through unchanged.
x_data_scaled = pd.get_dummies(x_data)

# Columns rescaled to the [0, 1] range.
_minmax = ['SibSp', 'Parch', 'Fare']
scaler_1 = MinMaxScaler()
minmax_values = scaler_1.fit_transform(x_data_scaled[_minmax])
x_data_scaled[_minmax] = pd.DataFrame(minmax_values, columns=_minmax)

# Columns standardised to zero mean and unit variance.
_stdsc =['Age', 'Individual_Fare', 'Name_Score_Max', 'Name_Score_Mean', 'Name_Score_Sum']
scaler_2 = StandardScaler()
standardised_values = scaler_2.fit_transform(x_data_scaled[_stdsc])
x_data_scaled[_stdsc] = pd.DataFrame(standardised_values, columns=_stdsc)

x_data_scaled

The last row (Survived) of the correlation heatmap above shows how each feature correlates with the label. Embarked_Q is the feature least correlated with Survived.

In [None]:
# Remove the label itself ('Survived') from the feature matrix, together with
# 'Embarked_Q', 'Name_Score_Sum' and 'Name_Score_Max'.
x_data_scaled = x_data_scaled.drop(['Embarked_Q','Survived','Name_Score_Sum','Name_Score_Max'],axis=1)

In [None]:
# The merged frame holds the training rows first, followed by the submission
# rows. The boundary is derived from the label vector instead of the
# hard-coded row number 891.
x_submission = x_data_scaled.iloc[len(y_data):]
x_datatrain = x_data_scaled.loc[y_data.index,:]

### ***Model Selection:***

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import setup, compare_models

# pycaret is given one frame holding both the features and the target column.
pycaret_data = pd.concat([ x_datatrain, y_data], axis=1)
# Configure the experiment: 'Survived' is the target, 70% of the rows are used
# for training, 20 shuffled folds, fixed session_id and preprocess=False
# (the data was already encoded and scaled above).
_ = setup( data=pycaret_data , target='Survived', session_id=0, train_size=0.7, fold=20, fold_shuffle=True, preprocess=False, verbose=1)
# Keep the 5 models selected by compare_models; the cells below iterate over them.
models_set = compare_models(n_select=5)

#### ***Models based on 5 top scored of pycaret selection***

In [None]:
# Refit each selected model on the whole training set and write one
# submission file per model.
for i, model in enumerate(models_set):
    m = type(model).__name__

    # Some estimators accept a `verbose` fit argument and others reject it.
    # Only the "argument not accepted" errors trigger the retry; a bare
    # `except` would also swallow KeyboardInterrupt and restart the fit.
    try:
        model.fit(x_datatrain, y_data, verbose=0)
    except (TypeError, ValueError):
        model.fit(x_datatrain, y_data)

    y_pred = np.asarray(model.predict(x_submission))

    # Use the ids from test.csv instead of the hard-coded range(892, 1310).
    submission = pd.DataFrame(
        y_pred,
        columns=['Survived'],
        index=pd.Index(test.PassengerId.values, name='PassengerId'),
    )
    submission.to_csv(f'model_{m}.csv')
    print(f'{i+1} - Model {m} saved!')

---

#### ***Model based on Bagging***

In [None]:
# One column of predictions per selected model. The columns are stacked
# positionally: a dict keyed by class name would silently drop a model when
# two of them share a class, which then misaligns the weights below.
model_names = [type(model).__name__ for model in models_set]
model_predictions = [np.asarray(model.predict(x_submission)) for model in models_set]
bagging = pd.DataFrame(np.column_stack(model_predictions), columns=model_names)

# One weight per model (equal weights = plain majority vote). Replace this
# list with custom values to favour some models; its length must match.
weight = [1] * bagging.shape[1]
assert len(weight) == bagging.shape[1], 'one weight per model is required'
total = sum(weight)

# Weighted mean of the class predictions, rounded to the nearest class.
Bag = pd.Series(
    np.rint(bagging.to_numpy() @ np.asarray(weight) / total).astype(int),
    index=bagging.index,
)

# Use the ids from test.csv instead of the hard-coded range(892, 1310).
submission = pd.DataFrame(
    Bag.values,
    columns=['Survived'],
    index=pd.Index(test.PassengerId.values, name='PassengerId'),
)
submission.to_csv('bag_of_5models.csv')


-------

#### ***Model based on Neural Networks***

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as ly
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

feats = x_datatrain.shape[1]

# Single hidden tanh layer as wide as the input, followed by a sigmoid output
# that yields the survival probability.
layers = [
    ly.Input(shape=(feats,)),
    ly.Dense(feats, activation='tanh'),
    ly.Dense(1, activation='sigmoid'),
]

model = tf.keras.Sequential(layers)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

# Convert explicitly to float32 arrays. The one-hot columns can be boolean, so
# the frame has mixed dtypes and handing the DataFrame straight to tf.constant
# goes through an object array that TensorFlow cannot convert.
x = tf.constant(x_datatrain.to_numpy(dtype='float32'))
y = tf.constant(y_data.to_numpy(dtype='float32'))

# The callbacks watch the held-out 20% (val_accuracy). Monitoring the training
# accuracy would make early stopping and best-weight restoration favour the
# most over-fitted epoch.
history = model.fit(x, y, epochs=600, verbose=0, validation_split=0.2, callbacks=[
    ReduceLROnPlateau(monitor='val_accuracy', patience=3, factor=0.25, verbose=1),
    EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
])

# Learning curves, then loss/accuracy over the full training set.
print('\n')
pd.DataFrame(history.history).plot()
print('\n')
model.evaluate(x, y)


In [None]:
# Explicit float32 conversion: the frame mixes boolean and float columns, which
# tf.constant cannot convert directly from a DataFrame.
x_submission_tensor = tf.constant(x_submission.to_numpy(dtype='float32'))
survival_probability = model.predict(x_submission_tensor, verbose=0)

# Round the sigmoid output to a 0/1 class. The ids come from test.csv instead
# of the hard-coded range(892, 1310).
submission = pd.DataFrame(
    np.round(survival_probability).astype(int),
    columns=['Survived'],
    index=pd.Index(test.PassengerId.values, name='PassengerId'),
)
submission.to_csv('model_ann_2.csv')

___