# **TP Titanic**

In [379]:
# Imports
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import neighbors

In [380]:
# Opening CSV
data = pd.read_csv(r'../data/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [381]:
# Cleaning
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
# Dummies for Setsos
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)
mean_age = (data['Age']).mean()
data.fillna(mean_age, inplace=True)
# Result
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,0,3,22.0,1,0,7.25,1
1,1,1,38.0,1,0,71.2833,0
2,1,3,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,0,3,35.0,0,0,8.05,1


In [382]:
explicativeVariables = data.drop('Survived', axis=1)
explicativeVariables.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [383]:
targetVariable = data['Survived']
targetVariable.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [384]:
# Function for graphing
def graph(x, y, survived, xLabel, yLabel):
    plt.scatter(x, y, c=np.array(survived))
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.colorbar()
    plt.show()

In [385]:
def graphRelationBetweenTwoVariables(explicativeVariables, targetVariable):
    for i in range(0, explicativeVariables.columns.size - 1):
        for j in range(i + 1, explicativeVariables.columns.size):
            columnI = explicativeVariables.columns[i]
            columnJ = explicativeVariables.columns[j]
            graph(explicativeVariables[columnI], explicativeVariables[columnJ], targetVariable, columnI, columnJ)

In [386]:
#graphRelationBetweenTwoVariables(explicativeVariables, targetVariable)

# _**Exprópiese**_

In [387]:
# def createLogisticModel(x, y):
#     if 'const' in x.columns:
#         x = x.drop('const', axis=1)

#     return LogisticRegression().fit(x, y)

In [388]:
# def simpleTreeModel(X, y):
#     if 'const' in X.columns:
#         Xp = X.drop('const', axis=1)
#     else:
#         Xp = X
    
#     model = tree.DecisionTreeClassifier(min_samples_leaf=5).fit(Xp, y)

#     return model

In [389]:
# def randomForestModel(X, y):
#     if 'const' in X.columns:
#         Xp = X.drop('const', axis=1)
#     else:
#         Xp = X
    
#     model = RandomForestClassifier(min_samples_leaf=5, n_estimators=100).fit(Xp, y)

#     return model

In [390]:
def mse(y_real, y_pred):
    return (np.power(y_real - y_pred, 2)).mean()

In [391]:
def calculateAccuracy(y_real, y_pred):
    return 1 - mse(y_real, y_pred)

In [392]:
def set_up_predictions(y_proba, t=0.5):
    y_pred = []
    for proba in y_proba:
        pred = 0
        if proba[1] > t:
            pred = 1
        
        y_pred.append(pred)
    return y_pred

In [393]:
# VER DE ADAPTAR EL KFOLD PARA ESTO
def kFoldValidation(x, y, model, k=5, t=0.5):
    mseArray = []
    for i in range(0, int(k)):
        xTest = x[i * math.floor(len(x) / k) : (i + 1) * math.floor(len(x) / k)]
        yTest = y[i * math.floor(len(y) / k) : (i + 1) * math.floor(len(y) / k)]
        xTrain = pd.concat([x[0: i * math.floor(len(x) / k)], x[(i + 1) * math.floor(len(x) / k) :]])
        yTrain = pd.concat([y[0: i * math.floor(len(y) / k)], y[(i + 1) * math.floor(len(y) / k) :]])
        model = model.fit(xTrain.reset_index(), yTrain.reset_index())
        yPredProb = model.predict_proba(xTest.reset_index())
        yPred = set_up_predictions(yPredProb.reset_index(), t)
        mse_res = mse(yTest.reset_index(), yPred.reset_index())
        mseArray.append(mse_res)
        print(mse_res)
    final = np.array(mseArray)
    return final.mean()

In [394]:
from sklearn.neighbors import KNeighborsClassifier


def determineModel(x, y):
    models = [
        LogisticRegression(), 
        tree.DecisionTreeClassifier(min_samples_leaf=5),
        RandomForestClassifier(min_samples_leaf=5, n_estimators=100),
        KNeighborsClassifier()
    ]
    modelMse = []
    modelT = []
    for model in models:
        model_best_kfold = np.Infinity
        best_t = 0
        for t in np.arange(0, 1, 0.1):
            res = kFoldValidation(x, y, model, t)
            if res < model_best_kfold:
                model_best_kfold = res
                best_t = t
        modelMse.append(model_best_kfold)
        modelT.append(best_t)
    best_model_index = modelMse.index(min(modelMse))
    model = models[best_model_index]
    model_t = modelT[best_model_index]
    print(modelMse)
    return model, model_t

In [395]:
model, th = determineModel(explicativeVariables, targetVariable)
testData = pd.read_csv(r'../data/test.csv')
passId = testData['PassengerId']
testData = testData.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Dummies for Sex
testData = pd.get_dummies(testData, columns=['Sex'], drop_first=True)
testData.fillna(mean_age, inplace=True)
# Result
model = model.fit(explicativeVariables, targetVariable)
predictionProba = model.predict_proba(testData)
prediction = set_up_predictions(predictionProba, th)
prediction
finalData = testData
finalData.insert(len(finalData.columns), 'Survived', prediction)
finalData.insert(0, 'PassengerId', passId)
finalData.to_csv('resultados.csv', index = True)
print(th)

[98989898, 98989898, 98989898, 98989898]
0


  return final.mean()
  ret = ret.dtype.type(ret / rcount)


# _**Expropiado**_