#Libraries

---



In [56]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots


#Import and Load Data

In [None]:
# Read the train and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

#Exploratory Data Analysis (EDA)

Data Overview

- - -

In [None]:
# Summary statistics of the training set, used to spot odd ranges and missing counts.
train_data.describe()

Examining the statistics of the numerical variables reveals the following observations:


*   The "Age" variable exhibits normal values, with acceptable minimum and maximum ranges, seemingly free from outliers (though this will be further investigated in the outliers analysis).

*   "SibSp" and "Parch" values raise some concerns. It appears unusual to have 8 siblings/spouses or 6 parents/children on the trip when most passengers have 0.

*    The "Fare" values will be examined during the outliers step, as the maximum value appears excessively large.

*    Upon examining the counts, it becomes evident that there are missing values for the "Age" variable, which will be addressed in the missing values step.

Analyze the correlation between the variables
- - -

In [None]:
# Correlation heatmap of the numeric columns, lower triangle only.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently skipping them.
# The matrix is computed once and reused for both the mask and the plot.
corr_matrix = train_data.select_dtypes(include="number").corr()

cmap = sns.cubehelix_palette(start=2)
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, mask=mask, cmap=cmap);

Most pairs of variables have a correlation no greater than 30%; only two pairs have a correlation above 30%.

Missing Values - Train Data


---



Searching for missing values

In [None]:
# Non-null entries per column; a column with fewer entries than the others has missing values.
train_data.count()

Confirming missing values

In [None]:
# Number of missing (NaN) values in each column.
train_data.isna().sum()

To address the issue of missing values, the following steps will be taken :


*   The missing values for the "Age" will be replaced with the median since it's a numeric variable and less than 25% of the data is missing.

*   The "Cabin" column will be dropped since it has such a high proportion of missing values.

*   For the "Embarked" variable, if there are no missing values in the "test_data" set, rows with missing "Embarked" values in the "train_data" set will be removed. This is because "Embarked" is a categorical variable, and while it is possible to infer the missing values by analyzing other variables, the small number of missing cases (just 2 out of 891) makes this process not worth the time and effort.






In [65]:
# Non-null "Embarked" entries in the test set; compare with len(test_data) to confirm nothing is missing.
test_data["Embarked"].count()

418

Deal with missing values

In [66]:
# Impute missing "Age" entries in place with the median of the observed ages.
train_data.loc[train_data["Age"].isna(), "Age"] = train_data["Age"].median()

In [None]:
# Inspect the rows with missing "Embarked" values (display only; nothing is removed here)
train_data[train_data["Embarked"].isna()]

In [68]:
# Drop, in place, every row whose "Embarked" value is missing.
train_data.dropna(subset=["Embarked"], inplace=True)

Missing Values - Test Data
- - -

Searching for missing values

In [None]:
# Non-null entries per column of the test set; lower counts reveal columns with missing values.
test_data.count()

Deal with missing values

In [70]:
# Impute missing "Age" entries in place with the median of the test set's observed ages.
test_data.loc[test_data["Age"].isna(), "Age"] = test_data["Age"].median()

In [71]:
# Impute missing "Fare" entries in place with the median fare (same rationale as for "Age").
test_data.loc[test_data["Fare"].isna(), "Fare"] = test_data["Fare"].median()

Outliers
- - -

In [None]:
# One horizontal box plot per numeric column, stacked vertically, to eyeball outliers.
box_titles = {
    "Age": "Age",
    "SibSp": "Amount of Siblings / Spouses Aboard the Titanic (SibSp)",
    "Parch": "Amount of Parents / Children Aboard the Titanic (Parch)",
    "Fare": "Fare",
}

fig = make_subplots(rows=len(box_titles), cols=1, subplot_titles=tuple(box_titles.values()))

for row_number, column_name in enumerate(box_titles, start=1):
    fig.add_trace(go.Box(x=train_data[column_name]), row=row_number, col=1)

# The y axes carry no information for single horizontal boxes, so hide them.
fig.update_layout(
    height=500,
    width=1000,
    yaxis_visible=False,
    yaxis2_visible=False,
    yaxis3_visible=False,
    yaxis4_visible=False,
    showlegend=False,
)

fig.show()

I will conduct an analysis to identify the number of passengers with specific characteristics in both the "train_data" and "test_data" datasets:


*   "Age" over 66 years old
*   "SibSp" values over 5 (indicating more than 5 siblings/spouses aboard)
*   "Fare" values over 300.

Since I need to predict the survival of the 418 passengers in the "test_data" dataset, I cannot simply remove rows with these characteristics. Instead, I will handle them differently based on their prevalence. If there is a small number of passengers with these characteristics in both datasets, I will replace their values with the median. However, if a considerable number of passengers exhibit these characteristics, I will conduct a separate analysis to address them appropriately.

In [None]:
# Number of training passengers older than 66.
train_data.loc[train_data["Age"] > 66, "Age"].count()

In [None]:
# Number of training passengers with more than 5 siblings/spouses aboard.
train_data.loc[train_data["SibSp"] > 5, "SibSp"].count()

In [None]:
# Number of training passengers who paid a fare above 300.
train_data.loc[train_data["Fare"] > 300, "Fare"].count()

In [None]:
# Number of test passengers older than 66.
test_data.loc[test_data["Age"] > 66, "Age"].count()

In [None]:
# Number of test passengers with more than 5 siblings/spouses aboard.
test_data.loc[test_data["SibSp"] > 5, "SibSp"].count()

In [None]:
# Number of test passengers who paid a fare above 300.
test_data.loc[test_data["Fare"] > 300, "Fare"].count()

Deal with Outliers - Train Data

In [79]:
# Ages above 66 are treated as outliers and overwritten with the median age.
median_age = train_data["Age"].median()
train_data["Age"] = train_data["Age"].mask(train_data["Age"] > 66, median_age)

In [80]:
# "SibSp" counts above 5 are treated as outliers and overwritten with the median count.
median_sibsp = train_data["SibSp"].median()
train_data["SibSp"] = np.where(
    train_data["SibSp"].gt(5),
    median_sibsp,
    train_data["SibSp"],
)

In [81]:
# Fares above 300 are treated as outliers and overwritten with the median fare.
median_fare = train_data["Fare"].median()
train_data["Fare"] = train_data["Fare"].mask(train_data["Fare"] > 300, median_fare)

Deal with Outliers - Test Data

In [82]:
# Same rule on the test set: ages above 66 become the test-set median age.
median_age = test_data["Age"].median()
test_data["Age"] = test_data["Age"].mask(test_data["Age"] > 66, median_age)

In [83]:
# Same rule on the test set: "SibSp" counts above 5 become the test-set median count.
median_sibsp = test_data["SibSp"].median()
test_data["SibSp"] = np.where(
    test_data["SibSp"].gt(5),
    median_sibsp,
    test_data["SibSp"],
)

In [84]:
# Same rule on the test set: fares above 300 become the test-set median fare.
median_fare = test_data["Fare"].median()
test_data["Fare"] = test_data["Fare"].mask(test_data["Fare"] > 300, median_fare)

Transforming Variables
- - -

Embarked Variable - Train Data

In [85]:
# Map the first character of "Embarked" to its port code (C, Q or S);
# anything else is labelled with the literal string "NaN".
train_data["Port_Embarkation"] = np.select(
    [train_data["Embarked"].astype(str).str[0] == port for port in ("C", "Q", "S")],
    ["C", "Q", "S"],
    default="NaN",
)

Embarked Variable - Test Data

In [86]:
# Map the first character of "Embarked" to its port code (C, Q or S);
# anything else is labelled with the literal string "NaN".
test_data["Port_Embarkation"] = np.select(
    [test_data["Embarked"].astype(str).str[0] == port for port in ("C", "Q", "S")],
    ["C", "Q", "S"],
    default="NaN",
)

#Modeling

Features to Consider
- - -

In [87]:
# Columns fed to the models; the same selection is applied to both splits.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Port_Embarkation",
]

train, test = train_data[features], test_data[features]

Transform categorical variables into numeric variables
- - -

In [88]:
# One-hot encode the non-numeric feature columns of the training set.
X_train = pd.get_dummies(train)
y_train = train_data["Survived"]

# Encode the test set, then force it onto the training columns. The two frames
# are encoded independently, so a category present in only one split would
# otherwise produce feature matrices with different columns. Dummies missing
# from the test set are filled with 0; columns unseen in training are dropped.
X_test = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

Run some models to choose the best one
- - -

In [None]:
# Shuffled 10-fold split with a fixed seed, shared by every model and metric.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Candidate classifiers keyed by the label printed in the report.
candidates = {
    "LogisticRegression": LogisticRegression(random_state=20),
    "Decision Tree": DecisionTreeClassifier(random_state=20),
    "K Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(random_state=20),
    "XGBoost": xgb.XGBClassifier(random_state=20),
}
names = list(candidates.keys())
models = list(candidates.values())

scoring_metrics = ("accuracy", "precision", "neg_mean_squared_error")

# For each model, print the mean and standard deviation of every metric across folds.
for name, model in candidates.items():
    print(name)
    for score in scoring_metrics:
        result = cross_val_score(model, X_train, y_train, scoring=score, cv=cv)
        fold_mean, fold_std = np.mean(result), np.std(result)
        print(score, ': %.4f (%.3f)' % (fold_mean, fold_std))
    print("\r\n")

Best model is XGBoost

Applying the best model - XGBoost

In [None]:
# Fit the chosen XGBoost configuration on the full training set, then predict the test set.
xgb_params = dict(verbosity=3, max_depth=3, booster="gbtree", random_state=20)
best_model = xgb.XGBClassifier(**xgb_params)
y_predict = best_model.fit(X_train, y_train).predict(X_test)

#Analyze the result

In [None]:
# Work on a copy so the prediction column is not silently added to test_data
# (a plain assignment only creates a second name for the same frame).
results = test_data.copy()
# y_predict was produced from X_test, whose rows follow test_data's order, so
# the array is assigned positionally instead of being wrapped in a DataFrame
# and matched on index labels.
results["Survives_Predict"] = y_predict
# Call describe(); the bare attribute only displayed the bound method.
results.describe()

Output in .csv File
- - -

In [92]:
# Build the two-column submission frame (passenger id + predicted label) and write it without the index.
df = test_data[["PassengerId"]].assign(Survived=y_predict)
df.to_csv("submission.csv", index=False)