In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_root, _subdirs, input_files in os.walk('/kaggle/input'):
    for input_path in (os.path.join(input_root, name) for name in input_files):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training split (it carries the "Survived" column used as the target
# later on) and preview its first rows.
train_csv_path = '/kaggle/input/titanic/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Read the split that predictions are generated for and preview its first rows.
test_csv_path = '/kaggle/input/titanic/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# "Survived" values of the female passengers, selected with a single .loc call
# (row mask + column label) rather than chained indexing.
women = train.loc[train.Sex == 'female', "Survived"]
# Share of survivors. Series.mean() gives the same ratio as sum()/len() when
# there are no missing values, is vectorised, and yields NaN instead of raising
# ZeroDivisionError when no row matches.
# Assumes "Survived" is coded 0/1 — the original sum/len ratio relied on the same.
rate_women = women.mean()

# NOTE: the value printed is a fraction in [0, 1], not a percentage.
print("% of women who survived:", rate_women)

In [None]:
# "Survived" values of the male passengers, selected with a single .loc call
# (row mask + column label) rather than chained indexing.
men = train.loc[train.Sex == 'male', "Survived"]
# Share of survivors. Series.mean() gives the same ratio as sum()/len() when
# there are no missing values, is vectorised, and yields NaN instead of raising
# ZeroDivisionError when no row matches.
# Assumes "Survived" is coded 0/1 — the original sum/len ratio relied on the same.
rate_men = men.mean()

# NOTE: the value printed is a fraction in [0, 1], not a percentage.
print("% of men who survived:", rate_men)

In [None]:
# Descriptive statistics of the training frame, shown as the cell output.
train.describe()

## Check the data type of each field

In [None]:
# dtype of every column in the training frame.
train.dtypes

## Find null values in the dataset

In [None]:
# Number of missing values per column in the training frame.
train.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isna().sum()

## Overview of each column

In [None]:
# For every column, print its name followed by the frequency of each value.
for column_name, column_values in train.items():
    print(column_name)
    print(column_values.value_counts())

### Relationship between class and age

In [None]:
import seaborn as sns
import matplotlib.pyplot  as plt

# Box plot of passenger age for each passenger class (Pclass).
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=train, x='Pclass', y='Age', palette='rainbow', ax=ax)

# Reading the plot: the typical age is approximately 38 for class 1, about 29 for class 2 and around 25 for class 3.

In [None]:
# Fill missing ages with a fixed value per passenger class
# (class 3 -> 25, class 2 -> 29, class 1 -> 39).
# Vectorised replacement for the former row-wise apply(): each passenger's
# Pclass is mapped to its fill value, and fillna() uses it only where Age is
# null. Rows whose Pclass is not 1, 2 or 3 keep their missing Age, as before.
# NOTE(review): the boxplot commentary estimates ~38 for class 1, but 39 is
# used here — confirm which value is intended.
age_fill_by_class = {3: 25, 2: 29, 1: 39}
train['Age'] = train['Age'].fillna(train['Pclass'].map(age_fill_by_class))

In [None]:
# Fill missing ages in the test frame with a fixed value per passenger class
# (class 3 -> 25, class 2 -> 29, class 1 -> 39).
# Vectorised replacement for the former row-wise apply(): each passenger's
# Pclass is mapped to its fill value, and fillna() uses it only where Age is
# null. Rows whose Pclass is not 1, 2 or 3 keep their missing Age, as before.
# NOTE(review): the boxplot commentary estimates ~38 for class 1, but 39 is
# used here — confirm which value is intended.
age_fill_by_class = {3: 25, 2: 29, 1: 39}
test['Age'] = test['Age'].fillna(test['Pclass'].map(age_fill_by_class))

For `Embarked`, we replace the null values with the mode (the most frequent value).

In [None]:
# Replace missing "Embarked" values with the most frequent value of the same
# frame (train first, then test).
# NOTE(review): the test frame is imputed from its own mode rather than the
# training mode — confirm this is intended.
for frame in (train, test):
    most_frequent = frame['Embarked'].mode()[0]
    frame['Embarked'] = frame['Embarked'].fillna(most_frequent)

For `Fare`, we are going to use the mean.

In [None]:
# Replace missing "Fare" values in the test frame with the mean fare of that frame.
mean_fare = test['Fare'].mean()
test['Fare'] = test['Fare'].fillna(mean_fare)

To avoid having categorical values, we replace the letters with integers.

In [None]:
# Encode the "Embarked" letters as integers: S -> 1, C -> 2, Q -> 3.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `frame.Embarked`: that chained in-place call
# acts on an intermediate Series, emits a warning on recent pandas and leaves
# the frame unchanged under Copy-on-Write.
# Values other than S/C/Q are left untouched, so re-running the cell is harmless.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
train['Embarked'] = train['Embarked'].replace(embarked_codes)

test['Embarked'] = test['Embarked'].replace(embarked_codes)

In [None]:
# Encode "Sex" as an integer column (male -> 1, female -> 0) in both frames.
sex_codes = {'male': 1, 'female': 0}
for frame in (train, test):
    encoded_sex = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded_sex.astype(int)

In [None]:
# Target vector: the "Survived" column of the training frame.
y = train.loc[:, "Survived"]

In [None]:
# Columns fed to the model as predictors.
features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Age",
]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base random forest handed to the grid search; the fixed seed makes its
# training repeatable.
base_forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**base_forest_params)
model

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to be tried for the random forest:
# number of trees, split criterion and whether bootstrap sampling is used.
grid_param = dict(
    n_estimators=[300, 500, 800, 1000],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [None]:
# Grid search over `grid_param` for the base model, scored by accuracy with
# cv=5 and n_jobs=-1.
search_settings = {
    'estimator': model,
    'param_grid': grid_param,
    'scoring': 'accuracy',
    'cv': 5,
    'n_jobs': -1,
}
model_grid = GridSearchCV(**search_settings)

In [None]:
# Predictor matrix: the selected feature columns of the training frame.
X = train.loc[:, features]

# Run the grid search on the training data.
model_grid.fit(X, y)

In [None]:
# Parameter combination selected by the grid search.
print(model_grid.best_params_)

In [None]:
# Best score found by the grid search (the search was configured with scoring='accuracy').
print(model_grid.best_score_)

## Training model

In [None]:
# Build the final model from the parameters selected by the grid search.
# They are read from `model_grid.best_params_` rather than copied by hand, so
# this cell cannot drift from the search result, and the seed is fixed so the
# fitted forest (and hence the submission) is reproducible across runs.
final_model = RandomForestClassifier(random_state=42, **model_grid.best_params_)
final_model.fit(X, y)

In [None]:
# Predictor matrix for the test frame, using the same feature columns as training.
X_test = test.loc[:, features]

## Predictions

In [None]:
# Predicted "Survived" label for every row of the test predictors.
predictions = final_model.predict(X_test)

In [None]:
# Pair each test PassengerId with its predicted label and write the result to
# a CSV file in the working directory, without the index column.
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")