# SCIKIT DECISION TREE CLASSIFIER VERSION #

https://scikit-learn.org/stable/modules/tree.html

## Loading libraries ##

In [1]:
from sklearn import tree
import pandas as pd
import seaborn as sns
import graphviz

In [2]:
import matplotlib.pyplot as plt

In [3]:
import numpy as np

In [4]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

In [5]:
from sklearn.model_selection import train_test_split
    

## Loading the data ##

In [6]:
# Read the training CSV into a DataFrame. The path is relative to the
# directory the notebook is run from.
titanic_data = pd.read_csv(filepath_or_buffer='../data/train.csv')

## Cleaning up the data ##

### Cabin ###

If the passenger has a cabin value, the feature is 1; otherwise it is 0.

In [7]:
# Collapse Cabin into a presence flag: 1 where a value exists, 0 where it is NaN.
# NOTE: this overwrites the raw column, so re-running this cell without
# reloading the data marks every row as 1.
titanic_data["Cabin"] = (~titanic_data["Cabin"].isna()).astype(int)

### Sex ###

Female = 1

Male, NaN, others = 0

In [8]:
# Boolean Series that is True only where Sex equals the string "female";
# every other value (including NaN) compares False.
mask = titanic_data["Sex"].eq("female")

In [9]:
# Replace the Sex column with the integer form of the mask (True -> 1, False -> 0).
titanic_data["Sex"] = mask.astype(dtype=int)

### Age ###

Replacing NaN values with the mean age.

In [10]:
# Fill missing ages with the mean of the known ages.
# NOTE(review): the mean is taken over the whole dataset before the train/test
# split below, so held-out rows contribute to the imputed value.
titanic_data["Age"] = titanic_data["Age"].fillna(value=titanic_data["Age"].mean())

## Data manipulation for train/test splits ##


### Create new dataframe with only used columns ###


In [11]:
# Feature matrix: an independent copy of the seven columns fed to the trees.
feed_data = titanic_data.loc[
    :, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin']
].copy()
# Target: kept as a single-column DataFrame (not a Series).
result_data = titanic_data.loc[:, ['Survived']].copy()

### Splitting data to train and final test sets ###

In [12]:
# Hold out 20% of the rows as the final test set.
# random_state pins the shuffle so the split, and every score derived from it,
# is identical on each Restart & Run All.
train_data, test_data, train_result, test_result = train_test_split(
    feed_data, result_data, test_size=0.2, random_state=42
)


### Split training data to parts for K-fold crossvalidation ###

K-fold crossvalidation source: "Data Mining and Analysis, Fundamental Concepts and Algorithms" by Zaki & Meira, page 561

scikit-learn functions used for this.

In [13]:
# Number of cross-validation folds; also the number of trees in the forest.
splits = 5
# Shuffled K-fold splitter. random_state makes the fold assignment reproducible
# across runs instead of changing with every kernel restart.
kf = KFold(n_splits=splits, shuffle=True, random_state=42)


In [14]:
# kf.split yields one (fit indices, held-out indices) pair per fold.
# Transpose those pairs into two parallel lists so that data_folds[i] and
# test_folds[i] describe the same fold.
data_folds, test_folds = (
    list(index_group) for index_group in zip(*kf.split(train_data, train_result))
)

## Creating decision tree for each fold ##

The forest consists of 5 decision trees, each with its own weight based on how well it performed on its corresponding test fold.

In [15]:
# One tree per fold: fit on that fold's training rows, then use its accuracy
# on the fold's held-out rows as the tree's (unnormalised) voting weight.
forest = []
weights = []
for fit_rows, held_out_rows in zip(data_folds, test_folds):
    # random_state fixes the tree's internal randomness so the fitted tree is
    # the same on every run for the same fold.
    fold_tree = tree.DecisionTreeClassifier(
        criterion="gini", min_samples_leaf=1, random_state=42
    )
    fold_tree.fit(train_data.iloc[fit_rows], train_result.iloc[fit_rows])
    forest.append(fold_tree)
    weights.append(
        fold_tree.score(train_data.iloc[held_out_rows], train_result.iloc[held_out_rows])
    )

Converting the weights list to a NumPy array so that it can easily be normalized into an actual weight vector (sum = 1).

In [16]:
# Turn the per-tree scores into an array and scale them so they sum to 1.
weights = np.asarray(weights)
weights = np.divide(weights, np.sum(weights))

### Making result dataframe ###

The result dataframe gets the correct form by copying the test_result dataframe and then zeroing the Survived column.

In [17]:
# New frame with the same index as test_result and Survived reset to 0;
# test_result itself is left untouched.
result = test_result.assign(Survived=0)

calculating the survived column for result array, then rounding to integer

followed by comparison to the correct results (abs used to make all differences positive 1 for summation purposes)

score = percentage of correct predictions

In [18]:
# Weighted vote of the forest: each tree's 0/1 predictions scaled by its weight.
# The sum is built in a fresh array and then assigned, so re-running this cell
# recomputes the vote instead of adding onto whatever result.Survived holds.
weighted_votes = np.zeros(len(test_data))
for fold_tree, fold_weight in zip(forest, weights):
    weighted_votes += fold_tree.predict(test_data) * fold_weight
result["Survived"] = weighted_votes

In [19]:
# Round the weighted vote to the nearest whole number and store it as int64,
# giving a 0/1 prediction per row.
result = result.round(decimals=0).astype("int64")

In [20]:
# Predictions and labels are both 0/1, so each absolute difference is 1 for a
# wrong prediction and 0 for a correct one; score = 1 - error rate.
score = 1 - (result.Survived - test_result.Survived).abs().sum() / len(result.Survived)
print(score)

0.7653631284916201


For comparison, the scores of each individual tree:

In [21]:
# Accuracy of every fold tree on the final test set, one line per tree.
for fold_tree in forest:
    print(fold_tree.score(test_data, test_result))

0.7094972067039106
0.7374301675977654
0.7486033519553073
0.7653631284916201
0.7653631284916201


For comparison, the score of a single tree trained on the full training data.

In [22]:
# Baseline: a single tree fitted on the whole training split.
# Note that it uses different hyperparameters from the fold trees above
# (entropy criterion, min_samples_leaf=0.01 rather than gini / 1).
# random_state fixes the tree's internal randomness so the baseline score is
# reproducible.
tree_classifier = tree.DecisionTreeClassifier(
    criterion='entropy', min_samples_leaf=0.01, random_state=42
)
tree_classifier.fit(train_data, train_result)

DecisionTreeClassifier(criterion='entropy', min_samples_leaf=0.01)

In [23]:
# Accuracy of the single baseline tree on the held-out test set.
tree_classifier.score(X=test_data, y=test_result)

0.7932960893854749

# RESULTS #

All scores are close to 0.8, which is not good enough for use on real data.

## suggestions ##

Minor improvements could be gained by tuning parameters, but nothing on a scale that would raise the score to the 0.9 level.

Inclusion of further data from analysis of name and more rigorous analysis of family data (trying to identify family units from names and other relationship data). Also nationality from name might provide some improvement.

If other techniques do not provide better results then further analysis on the Name data is warranted.