In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import sklearn.tree
import sklearn.ensemble
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.linear_model import LogisticRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

In [4]:
#Load training and test data
# Fixed seed so the shuffle below (and the train/validation split that is
# sliced from `data` afterwards) is identical on every Restart & Run All.
SHUFFLE_SEED = 8
data = pd.read_csv('/kaggle/input/titanic/train.csv')
# frac=1 returns every row in random order; reset_index makes the row labels
# run 0..n-1 again in the shuffled order.
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
print(data.shape)

In [5]:
#Split into train and validation
# The first N_TRAIN - 1 rows of the shuffled frame are the training split;
# every remaining row is the validation split.
N_TRAIN = 601
# .copy() gives each split its own data, so later column edits do not operate
# on a slice of `data`.
train_data = data.iloc[:N_TRAIN - 1].copy()
# The validation slice used to be iloc[601:-1], which silently left out
# row 600 and the final row; it now starts right after the training rows and
# runs to the end.
valid_data = data.iloc[N_TRAIN - 1:].copy()
train_data.head(15)

In [6]:
#Data Cleaning
MAX_NULLS_PER_COLUMN = 100  # a column with more nulls than this (in train) is dropped
MAX_NULLS_PER_ROW = 5       # a training row with more nulls than this is dropped

#Drop columns with too many null values
# The decision is taken on the training split only and then applied to all
# three frames so they keep the same feature columns.
null_counts_by_col = train_data.isnull().sum()
sparse_cols = null_counts_by_col[null_counts_by_col > MAX_NULLS_PER_COLUMN].index.tolist()
# Reassign rather than drop(inplace=True): the splits may be slices of `data`,
# and in-place edits on a slice are unreliable.
train_data = train_data.drop(columns=sparse_cols)
# errors='ignore' in case a frame lacks one of the columns train_data has.
valid_data = valid_data.drop(columns=sparse_cols, errors='ignore')
test_data = test_data.drop(columns=sparse_cols, errors='ignore')

#Drop rows with too many null values
# Counted per row over the remaining columns, using the frame's own length
# instead of a hard-coded row count.
null_counts_by_row = train_data.isnull().sum(axis=1)
# Index labels of the offending rows (equal to row positions while the index
# is still 0..n-1).
drop_row_list = train_data.index[null_counts_by_row > MAX_NULLS_PER_ROW].tolist()
print(drop_row_list)
# The list used to be printed only; the rows are now actually removed.
train_data = train_data.drop(index=drop_row_list)

In [15]:
# Split the training columns into numeric and categorical feature lists.
# PassengerId and Survived are kept out of both lists; Name is additionally
# kept out of the categorical list.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = [
    name
    for name in train_data.select_dtypes(include=numerics).columns
    if name not in ('PassengerId', 'Survived')
]

excluded_from_categorical = {'PassengerId', 'Survived', 'Name'}
categorical_cols = [
    name
    for name in train_data.columns
    if name not in excluded_from_categorical and name not in numeric_cols
]

In [46]:
#Fill null values with something
# 'S' is the value chosen after inspecting train_data['Embarked'].value_counts().
EMBARKED_FILL = 'S'
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form modifies a temporary under pandas
# Copy-on-Write and can leave the frame unchanged.
train_data['Embarked'] = train_data['Embarked'].fillna(EMBARKED_FILL)
# The validation split was previously left unfilled, so any missing Embarked
# value there survived into encoding and scoring.
valid_data['Embarked'] = valid_data['Embarked'].fillna(EMBARKED_FILL)
test_data['Embarked'] = test_data['Embarked'].fillna(EMBARKED_FILL)
# Last expression of the cell: show the remaining null counts per column.
train_data.isnull().sum().head(15)

In [16]:
# For every numeric feature print: name, then mean and std over the rows with
# Survived == 0, then mean and std over the rows with Survived == 1.
rows_by_outcome = (
    train_data[train_data['Survived'] == 0],
    train_data[train_data['Survived'] == 1],
)
for col_name in numeric_cols:
    summary = []
    for rows in rows_by_outcome:
        summary.append(rows[col_name].mean())
        summary.append(rows[col_name].std())
    print(col_name, *summary)

In [17]:
#For each categorical column, print the mean of Survived for every distinct value
for col_name in categorical_cols:
    column = train_data[col_name]
    labels = list(column.unique())
    # Maps each distinct value to the mean of Survived over the rows holding it.
    means = {
        label: train_data.loc[column == label, 'Survived'].mean()
        for label in labels
    }
    print(col_name, means)

In [19]:
print(numeric_cols)
# Treat SibSp and Parch as categorical features from here on: each is moved
# from numeric_cols to categorical_cols.
# The membership checks make the cell safe to re-run: list.remove() raises
# ValueError once the name is gone, and an unconditional append would add a
# duplicate entry.
for count_col in ('SibSp', 'Parch'):
    if count_col in numeric_cols:
        numeric_cols.remove(count_col)
        if count_col not in categorical_cols:
            categorical_cols.append(count_col)

In [20]:
#The models below are fed numeric columns only, so categorical values are
#turned into integer codes first
def numericalize(col_name, df):
    """Encode the values of ``df[col_name]`` as integer codes.

    Every distinct non-null value receives a code in ``0..k-1``. Codes are
    assigned in sorted order of the values, so the same set of values always
    yields the same encoding (codes used to follow set iteration order, which
    is not stable for strings from one interpreter session to the next).
    Missing values are encoded as ``-1``.

    Parameters
    ----------
    col_name : hashable
        Label of the column to encode.
    df : pandas.DataFrame
        Frame holding the column. It is not modified.

    Returns
    -------
    pandas.Series
        Integer codes with the same index as ``df``, named ``col_name``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    """
    # sort=True orders the distinct values before numbering them; the second
    # return value (the distinct values themselves) is not needed here.
    codes, _ = pd.factorize(df[col_name], sort=True)
    return pd.Series(codes, index=df.index, name=col_name)

# Add an integer-coded '<col>_num' column for every categorical column.
# The value -> code mapping is learned from the training split and reused for
# the validation and test splits. Encoding each split on its own (as before)
# could give the same category a different code in each frame, so a model
# fitted on train would misread the other two.
categorical_cols_num = []
for col in categorical_cols:
    num_col = col + '_num'
    categorical_cols_num.append(num_col)
    train_data[num_col] = numericalize(col, train_data)
    # Recover the mapping from the (raw value, code) pairs actually present in
    # train; rows whose raw value is missing are left out of the mapping.
    seen = train_data[[col, num_col]].dropna(subset=[col]).drop_duplicates(subset=[col])
    train_codes = dict(zip(seen[col], seen[num_col]))
    for frame in (valid_data, test_data):
        # Values never seen in train (and missing values) map to NaN and are
        # then given the sentinel code -1.
        frame[num_col] = frame[col].map(train_codes).fillna(-1).astype(int)

In [21]:
#Standardise the numeric features after converting them to float
#Helps for some models; unclear whether it matters for the XGBoost model
splits = (train_data, valid_data, test_data)
for col_name in numeric_cols:
    for frame in splits:
        frame[col_name] = frame[col_name].astype(float)
    # Centre and spread are taken from the training split only and then
    # applied unchanged to all three splits.
    centre = train_data[col_name].mean()
    spread = train_data[col_name].std()
    for frame in splits:
        frame[col_name] = (frame[col_name] - centre) / spread

In [73]:
#Fit three classifiers on the training split and print each one's score on
#the validation split
all_features = numeric_cols + categorical_cols_num
y_train = train_data["Survived"]
y_valid = valid_data['Survived']

# Random forest on numeric + encoded categorical features.
dtree = sklearn.ensemble.RandomForestClassifier(n_estimators=250, max_depth=3, random_state=8)
dtree.fit(train_data[all_features], y_train)
print(dtree.score(valid_data[all_features], y_valid))

# XGBoost classifier with default settings on the same feature set.
xgbtree = xgb.XGBClassifier()
xgbtree.fit(train_data[all_features], y_train)
print(xgbtree.score(valid_data[all_features], y_valid))

# Logistic regression sees the numeric features only.
logistic = LogisticRegression()
logistic.fit(train_data[numeric_cols], y_train)
print(logistic.score(valid_data[numeric_cols], y_valid))

In [52]:
# Replace missing Fare values in the test set with the mean of the test Fare column.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on a column selection: the in-place form modifies a temporary under pandas
# Copy-on-Write and can leave test_data unchanged.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
test_data.isna().sum().head(12)

In [75]:
# Predict on the test set with the chosen model and write the
# (PassengerId, Survived) pairs to disk.
best_model = xgbtree
test_data['Survived'] = best_model.predict(test_data[numeric_cols + categorical_cols_num])
ans = test_data[['PassengerId', 'Survived']]
# index=False: without it to_csv also writes the DataFrame's row index as an
# extra unnamed first column, so the file would not contain just the two
# selected columns.
ans.to_csv('tuned_output.csv', index=False)

In [None]:
# NOTE(review): `best_num_classif` is not assigned in any cell visible here, so
# on a fresh kernel this line raises NameError unless the name is defined
# elsewhere — confirm where it comes from, or delete this cell.
print(best_num_classif)