# Tasnuba Binte Jamal (Titanic dataset)

In [1]:
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Additional machine learning models
from sklearn.metrics import accuracy_score
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
train_df = pd.read_csv(r'C:\Users\ta444503\OneDrive - Knights - University of Central Florida\Desktop\ML\HW 1\data\train.csv')
test_df = pd.read_csv(r'C:\Users\ta444503\OneDrive - Knights - University of Central Florida\Desktop\ML\HW 1\data\test.csv')
combine = [train_df, test_df]

In [4]:
train_df.isnull().sum().sort_values()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
Age            177
Cabin          687
dtype: int64

In [5]:
test_df.isnull().sum().sort_values()

PassengerId      0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Embarked         0
Fare             1
Age             86
Cabin          327
dtype: int64

In [6]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train_df.describe(include=['O'])  # Selecting only variables that contain strings

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [8]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


### Data wrangling (data manipulation and feature engineering)

In [10]:
# Dropping 'Ticket' and 'Cabin' columns
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

"After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape

Before (891, 12) (418, 11) (891, 12) (418, 11)


('After', (891, 10), (418, 9), (891, 10), (418, 9))

In [11]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [12]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [13]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [14]:
train_df.groupby(['Title']).count()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,517,517,517,517,517,398,517,517,517,517
2,185,185,185,185,185,149,185,185,185,184
3,126,126,126,126,126,109,126,126,126,125
4,40,40,40,40,40,36,40,40,40,40
5,23,23,23,23,23,22,23,23,23,23


In [15]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.shape, test_df.shape

((891, 9), (418, 9))

In [16]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


In [18]:
guess_ages = np.zeros((2,3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

In [19]:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),
                        'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,1
1,1,1,1,38,1,0,71.2833,C,3
2,1,3,1,26,0,0,7.925,S,2
3,1,1,1,35,1,0,53.1,S,3
4,0,3,0,35,0,0,8.05,S,1


In [20]:
# Checking the age bands
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [21]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand
0,0,3,0,1,1,0,7.25,S,1,"(16.0, 32.0]"
1,1,1,1,2,1,0,71.2833,C,3,"(32.0, 48.0]"
2,1,3,1,1,0,0,7.925,S,2,"(16.0, 32.0]"
3,1,1,1,2,1,0,53.1,S,3,"(32.0, 48.0]"
4,0,3,0,2,0,0,8.05,S,1,"(32.0, 48.0]"


In [22]:
train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,7.25,S,1
1,1,1,1,2,1,0,71.2833,C,3
2,1,3,1,1,0,0,7.925,S,2
3,1,1,1,2,1,0,53.1,S,3
4,0,3,0,2,0,0,8.05,S,1


In [23]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [24]:
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [25]:
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,1,7.25,S,1,0
1,1,1,1,2,71.2833,C,3,0
2,1,3,1,1,7.925,S,2,1
3,1,1,1,2,53.1,S,3,0
4,0,3,0,2,8.05,S,1,1


In [26]:
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


In [27]:
freq_port = train_df.Embarked.dropna().mode()[0]
freq_port

'S'

In [28]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [29]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,7.25,0,1,0,3
1,1,1,1,2,71.2833,1,3,0,2
2,1,3,1,1,7.925,0,2,1,3
3,1,1,1,2,53.1,0,3,0,2
4,0,3,0,2,8.05,0,1,1,6


In [30]:
# Remove outliers from Fare:
train_df = train_df[train_df['Fare']<=400]
train_df.shape

(888, 9)

In [31]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,892,3,0,2,7.8292,2,1,1,6
1,893,3,1,2,7.0,0,3,0,6
2,894,2,0,3,9.6875,2,1,1,6
3,895,3,0,1,8.6625,0,1,1,3
4,896,3,1,1,12.2875,0,3,0,3


In [33]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
# qcut is used to give quantile cuts. 4 gives us quartiles, 10 gives us deciles, etc.
# Discretize variable into equal-sized buckets based on rank or based on sample quantiles. 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)


Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.896]",0.197309
1,"(7.896, 14.454]",0.303571
2,"(14.454, 30.772]",0.452055
3,"(30.772, 263.0]",0.576577


In [34]:
train_df.loc[ train_df['Fare'] <= 7.896, 'Fare'] = 0
train_df.loc[(train_df['Fare'] > 7.896) & (train_df['Fare'] <= 14.454), 'Fare'] = 1
train_df.loc[(train_df['Fare'] > 14.454) & (train_df['Fare'] <= 30.772), 'Fare']   = 2
train_df.loc[ train_df['Fare'] > 30.772, 'Fare'] = 3
train_df['Fare'] = train_df['Fare'].astype(int)
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Fare'] = train_df['Fare'].astype(int)


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class,FareBand
0,0,3,0,1,0,0,1,0,3,"(-0.001, 7.896]"
1,1,1,1,2,3,1,3,0,2,"(30.772, 263.0]"
2,1,3,1,1,1,0,2,1,3,"(7.896, 14.454]"
3,1,1,1,2,3,0,3,0,2,"(30.772, 263.0]"
4,0,3,0,2,1,0,1,1,6,"(7.896, 14.454]"
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,1,1,0,5,1,2,"(7.896, 14.454]"
887,1,1,1,1,2,0,2,1,1,"(14.454, 30.772]"
888,0,3,1,1,2,0,2,0,3,"(14.454, 30.772]"
889,1,1,0,1,2,1,1,1,1,"(14.454, 30.772]"


In [35]:
test_df.loc[ test_df['Fare'] <= 7.896, 'Fare'] = 0
test_df.loc[(test_df['Fare'] > 7.896) & (test_df['Fare'] <= 14.454), 'Fare'] = 1
test_df.loc[(test_df['Fare'] > 14.454) & (test_df['Fare'] <= 30.772), 'Fare']   = 2
test_df.loc[ test_df['Fare'] > 30.772, 'Fare'] = 3
test_df['Fare'] = test_df['Fare'].astype(int)
test_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,892,3,0,2,0,2,1,1,6
1,893,3,1,2,0,0,3,0,6
2,894,2,0,3,1,2,1,1,6
3,895,3,0,1,1,0,1,1,3
4,896,3,1,1,1,0,3,0,3
...,...,...,...,...,...,...,...,...,...
413,1305,3,0,1,1,0,1,1,3
414,1306,1,1,2,3,1,5,1,2
415,1307,3,0,2,0,0,1,1,6
416,1308,3,0,1,1,0,1,1,3


In [36]:
train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]
    
train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,0,0,1,0,3
1,1,1,1,2,3,1,3,0,2
2,1,3,1,1,1,0,2,1,3
3,1,1,1,2,3,0,3,0,2
4,0,3,0,2,1,0,1,1,6
5,0,3,0,1,1,2,1,1,3
6,0,1,0,3,3,0,1,1,3
7,0,3,0,0,2,0,4,0,0
8,1,3,1,1,1,0,3,0,3
9,1,2,1,0,2,1,3,0,0


In [38]:
test_df.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,892,3,0,2,0,2,1,1,6
1,893,3,1,2,0,0,3,0,6
2,894,2,0,3,1,2,1,1,6
3,895,3,0,1,1,0,1,1,3
4,896,3,1,1,1,0,3,0,3
5,897,3,0,0,1,0,1,1,0
6,898,3,1,1,0,2,2,1,3
7,899,2,0,1,2,0,1,0,2
8,900,3,1,1,0,1,3,1,3
9,901,3,0,1,2,0,1,0,3


In [41]:
train_df.shape

(888, 9)

In [42]:
for cols in train_df.columns:
    print(cols)
    print(train_df[str(cols)].unique())

Survived
[0 1]
Pclass
[3 1 2]
Sex
[0 1]
Age
[1 2 3 0 4]
Fare
[0 3 1 2]
Embarked
[0 1 2]
Title
[1 3 2 4 5]
IsAlone
[0 1]
Age*Class
[ 3  2  6  0  4  1  8  9 12]


In [43]:
train_df = pd.get_dummies(train_df, columns = ["Pclass","Age","Fare","Embarked","Title"],
                                     prefix=["CLASS","AGE","FARE_TYPE","EM_TYPE","TITLE"])

In [44]:
test_df  = pd.get_dummies(test_df, columns = ["Pclass","Age","Fare","Embarked","Title"],
                                     prefix=["CLASS","AGE","FARE_TYPE","EM_TYPE","TITLE"])

In [45]:
test_df.columns

Index(['PassengerId', 'Sex', 'IsAlone', 'Age*Class', 'CLASS_1', 'CLASS_2',
       'CLASS_3', 'AGE_0', 'AGE_1', 'AGE_2', 'AGE_3', 'AGE_4', 'FARE_TYPE_0',
       'FARE_TYPE_1', 'FARE_TYPE_2', 'FARE_TYPE_3', 'EM_TYPE_0', 'EM_TYPE_1',
       'EM_TYPE_2', 'TITLE_1', 'TITLE_2', 'TITLE_3', 'TITLE_4', 'TITLE_5'],
      dtype='object')

In [46]:
test_df.shape

(418, 24)

## Modelling

In [47]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((888, 23), (888,), (418, 23))

In [48]:
df1_cols = train_df.columns#X_train.columns
df2_cols = test_df.columns#X_test.columns

common_cols = df1_cols.intersection(df2_cols)
uncommon_cols = df1_cols.difference(df2_cols)

uncommon_cols   # If it is blank then we are OK. Because we are sure that they have the same columns.

# We see that the train_df has one more column called "Survived"

Index(['Survived'], dtype='object')

In [49]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

81.76

In [51]:
# Support Vector Machines

svc = SVC(gamma='auto')
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

# gamma needs to be set to 'auto', instead of setting to the default 'scale'

81.42

In [52]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


84.35

In [53]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

77.7

In [54]:
# Perceptron

#perceptron = Perceptron(penalty=None, alpha=0.0001, fit_intercept=True, max_iter=None, tol=None, shuffle=True, 
#                        verbose=0, eta0=1.0, n_jobs=1, random_state=0, class_weight=None, warm_start=False)
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron


# The original notebook in the question used different values for max_iter and tol, which I could not replicate
# Also another argument, n_iter has been that was found in v 0.19 has been removed now in v 0.24.
# So, that's why the result is slightly changed

73.2

In [55]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc



82.32

In [56]:
# Stochastic Gradient Descent

sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
# Changes with each run
# Highest I got was 83.0

79.73

In [57]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

86.82

In [58]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

86.82

In [59]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold #for K-fold cross validation
model = GradientBoostingClassifier()
param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300,400],
              'learning_rate': [0.1, 0.05, 0.01,0.001],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.2,0.1] 
              }
kfold = KFold(n_splits=10, random_state=22, shuffle = True)
modelf = GridSearchCV(model,param_grid = param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

modelf.fit(X_train, Y_train)

# Best score
print(modelf.best_score_)

# Best Estimator
modelf.best_estimator_

Fitting 10 folds for each of 192 candidates, totalling 1920 fits
0.8176200204290092


GradientBoostingClassifier(max_depth=8, max_features=0.2, min_samples_leaf=100,
                           n_estimators=200)

In [61]:
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, Y_train)
Y_pred =xgb_cl.predict(X_test)
#Y_train_accuracy_labels = xgb_cl.predict(X_train)
acc_xgboost = round(accuracy_score(Y_train, xgb_cl.predict(X_train)) *100,2)
acc_xgboost



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


86.71

In [64]:
xgb_cl  = GradientBoostingClassifier(learning_rate=0.05, max_depth=8, max_features=0.3,
                           min_samples_leaf=100, n_estimators=400)  
xgb_cl.fit(X_train, Y_train)
Y_pred =xgb_cl.predict(X_test)
acc_xgboost2 = round(accuracy_score(Y_train, xgb_cl.predict(X_train)) *100,2)
acc_xgboost2

83.11

### Model Evaluation

In [65]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree','XGBoost'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree,acc_xgboost]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,86.82
8,Decision Tree,86.82
9,XGBoost,86.71
1,KNN,84.35
7,Linear SVC,82.32
2,Logistic Regression,81.76
0,Support Vector Machines,81.42
6,Stochastic Gradient Decent,79.73
4,Naive Bayes,77.7
5,Perceptron,73.2


In [66]:
output = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
output.to_csv(r'C:\Users\ta444503\OneDrive - Knights - University of Central Florida\Desktop\ML\test_data5.csv', index=False)
