# Importing Libraries

In [449]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [450]:
features = ["Pclass", "Sex", "Age", "Fare", "Cabin"]

In [451]:
train = pd.read_csv('train.csv')
train.sample(5)
test = pd.read_csv('test.csv')
test_pids = test["PassengerId"]

##  Describing Data

In [452]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [453]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [454]:
def clean(data):
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)

    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    data['Embarked'].fillna("U", inplace=True)
    return data

train = clean(train)
test = clean(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna("U", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [455]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [456]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [457]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
cols = ["Sex", "Embarked"]

for col in cols:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.fit_transform(test[col])
    print(label_encoder.classes_)

train.sample(5)

['female' 'male']
['C' 'Q' 'S']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
169,0,3,1,28.0,0,0,56.4958,2
117,0,2,1,29.0,1,0,21.0,2
795,0,2,1,39.0,0,0,13.0,2
40,0,3,0,40.0,1,0,9.475,2
512,1,1,1,36.0,0,0,26.2875,2


In [458]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = train["Survived"]
X = train.drop("Survived", axis=1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [459]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,1,45.5,0,0,28.5000,2
733,2,1,23.0,0,0,13.0000,2
382,3,1,32.0,0,0,7.9250,2
704,3,1,26.0,1,0,7.8542,2
813,3,0,6.0,4,2,31.2750,2
...,...,...,...,...,...,...,...
106,3,0,21.0,0,0,7.6500,2
270,1,1,28.0,0,0,31.0000,2
860,3,1,41.0,2,0,14.1083,2
435,1,0,14.0,1,2,120.0000,2


In [460]:
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
predictions = lr.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predictions)

0.8100558659217877

# Feature Scaling

In [461]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)

In [462]:
X_train

array([[-1.61413602,  0.7243102 ,  1.25364106, ..., -0.47934164,
        -0.07868358,  0.55744438],
       [-0.40055118,  0.7243102 , -0.47728355, ..., -0.47934164,
        -0.37714494,  0.55744438],
       [ 0.81303367,  0.7243102 ,  0.21508629, ..., -0.47934164,
        -0.47486697,  0.55744438],
       ...,
       [ 0.81303367,  0.7243102 ,  0.90745614, ..., -0.47934164,
        -0.35580399,  0.55744438],
       [-1.61413602, -1.38062393, -1.1696534 , ...,  2.04874166,
         1.68320121,  0.55744438],
       [-1.61413602,  0.7243102 , -0.63114352, ...,  0.78470001,
         0.86074761,  0.55744438]])

In [463]:
X_val

array([[ 0.81303367,  0.7243102 , -0.09263364, ...,  0.78470001,
        -0.33390078, -2.01983093],
       [-0.40055118,  0.7243102 ,  0.13815631, ..., -0.47934164,
        -0.42528387,  0.55744438],
       [ 0.81303367,  0.7243102 , -0.7080735 , ..., -0.47934164,
        -0.47486697,  0.55744438],
       ...,
       [ 0.81303367, -1.38062393,  0.67666619, ...,  5.8408666 ,
        -0.02308312,  0.55744438],
       [-0.40055118, -1.38062393, -0.93886345, ..., -0.47934164,
        -0.42528387,  0.55744438],
       [ 0.81303367, -1.38062393, -1.93895323, ...,  0.78470001,
        -0.30589933,  0.55744438]])

# Training Logistic Regression Model

In [464]:
from sklearn.linear_model import LogisticRegression
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)

In [465]:
y_pred = logistic_classifier.predict(X_val)

In [466]:
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1])

In [467]:
y_val

Unnamed: 0,Survived
709,1
439,0
840,0
720,1
39,1
...,...
433,0
773,0
25,1
84,1


In [468]:
y_val = np.array(y_val)

In [469]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_val.reshape(len(y_val),1)),1))

[[0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [0 0]

## Making the confusion matrix

In [470]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_val, y_pred)
print(cm)
accuracy_score(y_val, y_pred)

[[90 15]
 [19 55]]


0.8100558659217877

In [471]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_classifier.fit(X_train, y_train)

In [472]:
y_pred_knn = knn_classifier.predict(X_val)

In [473]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_knn = confusion_matrix(y_val, y_pred_knn)
print(cm_knn)
accuracy_score(y_val, y_pred_knn)

[[90 15]
 [20 54]]


0.8044692737430168

In [474]:
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'linear')
classifier_svc.fit(X_train, y_train)

In [475]:
y_pred_svc = classifier_svc.predict(X_val)

In [476]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_svc = confusion_matrix(y_val, y_pred_svc)
print(cm_svc)
accuracy_score(y_val, y_pred_svc)

[[88 17]
 [22 52]]


0.7821229050279329

In [477]:
from sklearn.tree import DecisionTreeClassifier
classifier_dtc = DecisionTreeClassifier(criterion = 'entropy')
classifier_dtc.fit(X_train, y_train)

In [478]:
y_pred_dtc = classifier_dtc.predict(X_val)

In [479]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_dtc = confusion_matrix(y_val, y_pred_dtc)
print(cm_dtc)
accuracy_score(y_val, y_pred_dtc)

[[84 21]
 [18 56]]


0.7821229050279329

In [480]:
from sklearn.ensemble import RandomForestClassifier
for i in range(100, 1000, 100):
    classifier_rfc = RandomForestClassifier(n_estimators = i, criterion = 'entropy', random_state = 0)
    classifier_rfc.fit(X_train, y_train)
    y_pred_rfc = classifier_rfc.predict(X_val)
    from sklearn.metrics import confusion_matrix, accuracy_score
    cm_rfc = confusion_matrix(y_val, y_pred_rfc)
    print(cm_rfc)
    print(accuracy_score(y_val, y_pred_rfc))

[[91 14]
 [18 56]]
0.8212290502793296
[[90 15]
 [18 56]]
0.8156424581005587
[[91 14]
 [18 56]]
0.8212290502793296
[[90 15]
 [18 56]]
0.8156424581005587
[[90 15]
 [18 56]]
0.8156424581005587
[[90 15]
 [18 56]]
0.8156424581005587
[[90 15]
 [17 57]]
0.8212290502793296
[[90 15]
 [17 57]]
0.8212290502793296
[[90 15]
 [17 57]]
0.8212290502793296


In [481]:
# So the best output is from n_estimators = 100

In [486]:
from sklearn.model_selection import cross_val_score
submission = classifier_rfc.predict(test)
submission_df = pd.DataFrame({"PassengerId": test_pids.values,
                   "Survived": submission,
                  })
cross_val_score = cross_val_score(classifier_rfc, X, y, cv=5)
print(cross_val_score)
print(np.mean(cross_val_score))



[0.78212291 0.80337079 0.84831461 0.78089888 0.82022472]
0.8069863787583955
