In [1]:
# importing required packages
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the training and test tables from the CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# =========================== Data Cleaning =============================== #
def null_sum(x):
    """Return the percentage (0-100) of missing values in ``x``.

    Parameters
    ----------
    x : pandas.Series
        A single column; this is what ``DataFrame.apply(null_sum, axis=0)``
        and ``null_sum(dataset[col])`` pass in.

    Returns
    -------
    float
        ``100 * (number of null entries) / len(x)``. An empty column has
        nothing missing, so it yields ``0.0`` instead of dividing by zero.
    """
    total = len(x)
    if total == 0:
        return 0.0
    # Vectorised null count; avoids iterating the Series element by element.
    return (x.isnull().sum() / total) * 100

def getImputeFree(dataset, max_missing_pct=40):
    """Impute or drop the incomplete columns of ``dataset`` in place.

    For every column, the share of missing values (as computed by
    ``null_sum``) decides what happens:

    * ``0``                         -> column is left untouched;
    * ``(0, max_missing_pct]``      -> missing entries are filled with the
      column mean rounded to 3 decimals (numeric columns) or with the
      column's own most frequent value (all other columns);
    * ``> max_missing_pct``         -> the column is dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean. It is modified in place.
    max_missing_pct : float, default 40
        Highest percentage of missing values that is still imputed; columns
        above it are dropped.

    Returns
    -------
    None
        The result is the mutated ``dataset``. A before/after missing-value
        report and one line per treated column are printed to stdout.
    """
    print("------ > Before Imputing ")
    print(dataset.apply(null_sum, axis=0))

    # Iterate over a snapshot of the column names: columns may be dropped below.
    for col in list(dataset.columns):
        missing_pct = null_sum(dataset[col])
        if not missing_pct > 0:
            continue

        if missing_pct > max_missing_pct:
            print("Column have lot of Impurity / Missing Values so Dropped them : ",col)
            dataset.drop(col, axis=1, inplace=True)
        elif pd.api.types.is_numeric_dtype(dataset[col]):
            # Explicit dtype check instead of a bare ``except`` around np.mean,
            # so unrelated errors are no longer swallowed.
            fill_value = round(dataset[col].mean(), 3)
            dataset[col] = dataset[col].fillna(fill_value)
            print(col, "(mean) : ", fill_value)
        else:
            # Mode of *this* column of *this* dataset (value_counts ignores NaN
            # and sorts by frequency), not of a hard-coded column elsewhere.
            frequencies = dataset[col].value_counts()
            if frequencies.empty:
                # Only reachable when every value is missing and the threshold
                # allows 100 %: there is nothing to impute with.
                continue
            mode_class = frequencies.index[0]
            dataset[col] = dataset[col].fillna(mode_class)
            print(col, "(mode) : ", mode_class)

    print("------ > After Imputing ")
    print(dataset.apply(null_sum, axis=0))

# Clean both tables in place: training data first, then the test data.
for _frame in (train_data, test_data):
    getImputeFree(_frame)

------ > Before Imputing 
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64
Age (mean) :  29.699
Column have lot of Impurity / Missing Values so Dropped them :  Cabin
Embarked (mode) :  S
------ > After Imputing 
PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64
------ > Before Imputing 
PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.0000

In [4]:
# ============================ One-hot encoded training features ============================ #
# Identifier / free-text columns and the target itself are not used as predictors.
train_predictors = train_data.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])

# Low-cardinality (< 10 distinct values) object columns are treated as categorical.
categorical_cols = [
    name
    for name, dtype in train_predictors.dtypes.items()
    if dtype == "object" and train_predictors[name].nunique() < 10
]
# Plain int64 / float64 columns are used as numeric features as-is.
numeric_cols = [
    name
    for name, dtype in train_predictors.dtypes.items()
    if dtype in ['int64', 'float64']
]

my_cols = categorical_cols + numeric_cols  # categorical first, then numeric
train_predictors = train_predictors[my_cols]
encoded_train_predictors = pd.get_dummies(train_predictors)  # one-hot encode the object columns


In [5]:
# ============================== Gini decision tree: train / validate / predict ============================== #
y_target = train_data['Survived'].values
# Cast to float explicitly: recent pandas versions emit boolean dummy columns, and a
# mixed bool/int/float frame would otherwise be converted to a generic object array.
x_features = encoded_train_predictors.to_numpy(dtype=float)

# splitting train and validation set (80 % / 20 %, seeded so the split is repeatable)
x_train, x_validate, y_train, y_validate = train_test_split(x_features, y_target, test_size=0.20, random_state=1)


# model training
# NOTE: scikit-learn documents DecisionTreeClassifier as an optimised CART implementation
# (default criterion "gini"), so the "C4.5" label in the headers below is only nominal.
# random_state is pinned (same seed as the split) so the fitted tree, and therefore the
# reported confusion matrix and accuracy, do not change between runs.
clf_model = DecisionTreeClassifier(random_state=1)
clf_model.fit(x_train, y_train)

# predicting
validate_predict = clf_model.predict(x_validate)

# model validating
print("================================== C4.5 Confusion Matrix =============================")
print(confusion_matrix(y_validate, validate_predict))
print("================================== C4.5 Accuracy Score ===============================")
print(accuracy_score(y_validate, validate_predict))


# predicting on the test set
test_predictors = test_data.drop(['PassengerId','Name','Ticket'], axis=1)
# Align the encoded test columns with the training feature matrix: same columns, same
# order. Dummy columns absent from the test data are added as 0 and columns the model
# never saw are discarded, instead of relying on both frames encoding identically.
test_x_featuers = (
    pd.get_dummies(test_predictors)
    .reindex(columns=encoded_train_predictors.columns, fill_value=0)
    .to_numpy(dtype=float)
)
test_predict = clf_model.predict(test_x_featuers)


[[89 17]
 [28 45]]
0.7486033519553073


In [6]:
#===== Entropy-criterion decision tree (labelled "ID3") =====#
# NOTE: this is scikit-learn's CART-based DecisionTreeClassifier with the split criterion
# switched to information gain ("entropy"); the "ID3" label in the headers is nominal.
# random_state is pinned so the fitted tree and the reported scores are repeatable.
id3_clf_model = DecisionTreeClassifier(criterion="entropy", random_state=1)
id3_clf_model.fit(x_train, y_train)

# predicting (overwrites the validation predictions of the previous model)
validate_predict = id3_clf_model.predict(x_validate)

# model validating
print("================================== ID3 Confusion Matrix =============================")
print(confusion_matrix(y_validate, validate_predict))
print("================================== ID3 Accuracy Score ===============================")
print(accuracy_score(y_validate, validate_predict))


# predicting on the test set
test_predictors = test_data.drop(['PassengerId','Name','Ticket'], axis=1)
# Align the encoded test columns with the training feature matrix (same columns, same
# order; missing dummies filled with 0, unseen ones discarded) before predicting.
test_x_featuers = (
    pd.get_dummies(test_predictors)
    .reindex(columns=encoded_train_predictors.columns, fill_value=0)
    .to_numpy(dtype=float)
)
id3_test_predict = id3_clf_model.predict(test_x_featuers)

[[89 17]
 [21 52]]
0.7877094972067039
