In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import sklearn.tree
import sklearn.ensemble

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data_shape = train_data.shape
#Split into training and validation data
# Positional split: first half trains, second half validates.
# iloc slices are half-open (the stop row is excluded), so a single shared
# boundary yields two adjacent halves that together cover every row exactly
# once. Subtracting 1 from the stops would silently drop the row just before
# the midpoint and the last row of the file.
split_row = train_data_shape[0] // 2
valid_data = train_data.iloc[split_row:]
train_data = train_data.iloc[:split_row]
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
#Data cleaning and classification
def clean_data(data, num_classif):
    """Return a numeric copy of a passenger frame suitable for the random forest.

    The input frame is not modified. The following transformations are applied:

    * "Sex" is coded male -> 0, female -> 1 (when the column is present).
    * "Embarked" is coded C -> 0, S -> 1, Q -> 2 (when the column is present).
    * "Age" and "Fare" are replaced by equal-width bin indices.
    * "CabinLetter" is added: the character code of the first letter of the
      first cabin listed in "Cabin".
    * "CabinNumber" is added: the number following that letter, then binned
      like Age/Fare. Cabins without a parsable number are treated as missing.
    * Every remaining NaN in the frame becomes num_classif + 1.

    Bin indices run from 0 to num_classif inclusive: the column maximum falls
    exactly on the upper edge and therefore lands in bin num_classif.

    NOTE(review): bin edges come from the min/max of the frame passed in, so
    the same raw value can receive different bin indices in the training,
    validation and test frames.

    Args:
        data: DataFrame with at least the columns "Age", "Fare" and "Cabin".
        num_classif: number of equal-width intervals used for binning.

    Returns:
        A new DataFrame with the columns described above.
    """
    # Category codes are applied per column so that an identical string in an
    # unrelated column (for example a Cabin recorded as just "C") is untouched.
    category_codes = {
        "Sex": {'male': 0, 'female': 1},
        "Embarked": {'Q': 2, 'S': 1, 'C': 0},
    }

    def _bin_equal_width(column):
        """Map non-missing values to equal-width bin indices; NaN stays NaN."""
        # min and width are computed once here, not once per row.
        lowest = column.min()
        bin_width = (column.max() - lowest) / num_classif
        # A constant column has zero width; put all of its values in bin 0
        # instead of dividing by zero.
        single_bin = bool(bin_width == 0 or math.isnan(bin_width))

        def _to_bin(value):
            if math.isnan(value):
                return value
            if single_bin:
                return 0
            return int((value - lowest) // bin_width)

        return column.apply(_to_bin)

    def _first_cabin(cabin):
        """Return the first space-separated cabin token, or None if missing."""
        if not isinstance(cabin, str):
            return None
        return cabin.split(" ")[0] or None

    def _cabin_letter(cabin):
        """Character code of the cabin's deck letter, NaN when missing."""
        token = _first_cabin(cabin)
        return ord(token[0]) if token else float('nan')

    def _cabin_number(cabin):
        """Integer part of the first cabin token, NaN when absent."""
        token = _first_cabin(cabin)
        if not token:
            return float('nan')
        try:
            return int(token[1:])
        except ValueError:
            # e.g. "F G73": the first token is a bare letter with no number.
            return float('nan')

    data_numeric = data.copy()
    for column, codes in category_codes.items():
        if column in data_numeric.columns:
            data_numeric[column] = data_numeric[column].replace(codes)

    data_numeric["Age"] = _bin_equal_width(data_numeric["Age"])
    data_numeric["Fare"] = _bin_equal_width(data_numeric["Fare"])

    #Classify by Cabin row, better indicator than entire cabin number
    data_numeric["CabinLetter"] = data_numeric["Cabin"].apply(_cabin_letter)
    data_numeric["CabinNumber"] = _bin_equal_width(
        data_numeric["Cabin"].apply(_cabin_number)
    )

    # Missing values get their own category one past the highest bin index.
    data_numeric = data_numeric.replace(float('nan'), int(num_classif + 1))
    return data_numeric


In [None]:
#Create model

# Feature columns fed to the model, chosen from a correlation visualization.
decide_columns = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "CabinLetter",
    "CabinNumber",
]

#Set hyperparameters and find error based on them and the validation data
def find_error(my_num_classif, my_max_depth, my_n_estimators):
    """Fit a random forest on train_data and count its mistakes on valid_data.

    Both frames are passed through clean_data with the same bin count, the
    forest is fitted on the decide_columns features of the training frame,
    and the validation rows are predicted from the same columns.

    Args:
        my_num_classif: bin count forwarded to clean_data.
        my_max_depth: max_depth of the RandomForestClassifier.
        my_n_estimators: n_estimators of the RandomForestClassifier.

    Returns:
        Sum of absolute differences between the predictions and the
        validation "Survived" column.
    """
    train_data_numeric = clean_data(train_data, my_num_classif)
    valid_data_numeric = clean_data(valid_data, my_num_classif)

    forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=my_n_estimators, max_depth=my_max_depth, random_state=10
    )
    forest.fit(train_data_numeric[decide_columns], train_data_numeric["Survived"])

    # Validation features are selected exactly like the training features.
    # One-hot encoding only this side would either change nothing (all-numeric
    # columns) or produce columns the fitted forest has never seen.
    predictions = forest.predict(valid_data_numeric[decide_columns])
    return (predictions - valid_data["Survived"]).abs().sum()
    
#Grid search to find the best hyperparameter combination
# Candidate values for each hyperparameter. The bin-count range needs an
# explicit stop: range(2, ) is range(2) and only tries the degenerate counts
# 0 and 1. The upper bound of 10 bins is a judgment call; widen it if needed.
num_classif_candidates = range(2, 11)
max_depth_candidates = range(2, 6)
n_estimators_candidates = range(150, 250, 100)  # currently the single value 150

min_error = None
best_max_depth = None
best_n_estimators = None
best_num_classif = None
for num_classif_candidate in num_classif_candidates:
    for max_depth_candidate in max_depth_candidates:
        for n_estimators_candidate in n_estimators_candidates:
            crnt_error = find_error(num_classif_candidate, max_depth_candidate, n_estimators_candidate)
            # Strict comparison: on ties the earliest combination is kept.
            if (min_error is None) or crnt_error < min_error:
                best_num_classif = num_classif_candidate
                best_max_depth = max_depth_candidate
                best_n_estimators = n_estimators_candidate
                min_error = crnt_error

        


In [None]:
#Train random forest and fit results
# Clean the training frame with the bin count picked by the grid search, then
# fit the final forest using the winning depth and number of trees.
train_data_numeric = clean_data(train_data, best_num_classif)
X = train_data_numeric[decide_columns]
Y = train_data_numeric["Survived"]
dtree = sklearn.ensemble.RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    random_state=10,
)
dtree.fit(X, Y)

In [None]:
# Clean the test frame with the same bin count used to train the final model.
test_data_numeric = clean_data(test_data, best_num_classif)
# Select the features exactly as was done when fitting dtree; no one-hot
# encoding is applied here because none was applied to the training features.
X_test = test_data_numeric[decide_columns]
predictions = dtree.predict(X_test)

# Write the submission file: one row per test passenger with the prediction.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission11.csv', index=False)

In [None]:
# Show the bin count (num_classif) that the grid search selected.
print(best_num_classif)