In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import prepare
import warnings

In [2]:
train, test = prepare.prep_titanic()
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
329,329,1,1,female,0.195778,0,1,0.113168,0,0
749,749,0,3,male,0.384267,0,0,0.015127,1,1
203,203,0,3,male,0.566474,0,0,0.014102,0,1
421,421,0,3,male,0.258608,0,0,0.015094,1,1
97,97,1,1,male,0.28374,0,1,0.123667,0,0


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
train, validate = train_test_split(train, random_state=123, train_size=.8)
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 10 columns
   train: 569 rows x 10 columns
validate: 143 rows x 10 columns


In [5]:
X = train[['fare', 'pclass']]
y = train.survived
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [6]:
model.classes_

array([0, 1])

In [7]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.76       345
           1       0.65      0.41      0.50       224

    accuracy                           0.68       569
   macro avg       0.67      0.63      0.63       569
weighted avg       0.67      0.68      0.66       569



In [8]:
X = train[['fare', 'pclass','age']]
y = train.survived
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [9]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.86      0.78       345
           1       0.69      0.47      0.56       224

    accuracy                           0.71       569
   macro avg       0.70      0.67      0.67       569
weighted avg       0.71      0.71      0.69       569



In [32]:
import sklearn.preprocessing
def encode_embarked(train, validate, test):
    encoder = sklearn.preprocessing.LabelEncoder()
    train['sex'] = encoder.fit_transform(train[['sex']])
    validate['sex'] = encoder.fit_transform(validate[['sex']])
    test['sex'] = encoder.fit_transform(test[['sex']])
    return train, validate, test

In [33]:
train, validate, test = encode_embarked(train, validate, test)

In [34]:
X = train[['fare', 'pclass','age','sex']]
y = train.survived
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [35]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       435
           1       0.74      0.69      0.72       277

    accuracy                           0.79       712
   macro avg       0.78      0.77      0.77       712
weighted avg       0.79      0.79      0.79       712



In [36]:
X_validate = validate[['fare', 'pclass','age','sex']]
y_validate = validate.survived
model = LogisticRegression().fit(X_validate, y_validate)
y_pred = model.predict(X_validate)

In [37]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85        90
           1       0.76      0.70      0.73        53

    accuracy                           0.80       143
   macro avg       0.79      0.78      0.79       143
weighted avg       0.80      0.80      0.80       143



In [38]:
X = train[['fare', 'pclass','age','sex','sibsp']]
y = train.survived
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [39]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       435
           1       0.76      0.68      0.72       277

    accuracy                           0.79       712
   macro avg       0.78      0.77      0.78       712
weighted avg       0.79      0.79      0.79       712



In [40]:
X = test[['fare', 'pclass','age','sex','sibsp']]
y = test.survived
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [41]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       114
           1       0.86      0.66      0.75        65

    accuracy                           0.84       179
   macro avg       0.84      0.80      0.81       179
weighted avg       0.84      0.84      0.83       179



In [42]:
from seaborn import load_dataset
from sklearn.tree import DecisionTreeClassifier

In [68]:
X_train = train[['fare', 'pclass','age','sex','sibsp']]
y_train = train.survived
X_train.head()

Unnamed: 0,fare,pclass,age,sex,sibsp
329,0.113168,1,0.195778,0,0
749,0.015127,3,0.384267,1,0
203,0.014102,3,0.566474,1,0
421,0.015094,3,0.258608,1,0
97,0.123667,1,0.28374,1,0


In [69]:
# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
# The pattern for sklearn is:
# 1. Make a thing (a new, blank machine learning model of a specific kind)
# 2. Fit that thing (.fitting means to train the machine learning model)
# 3. Use that thing (we'll use our trained model to make predictions on future datapoints)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
clf

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [70]:
# The easiest part of the entire Data Science pipeline is fitting the machine learning model...
# It's almost anticlimatic...
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [71]:
# Produce a set of species predictions
# Calculate the predicted probability that the prediction is correct
y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

In [72]:
y_pred[0:10]

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [73]:
y_pred_proba

array([[0.01162791, 0.98837209],
       [0.88269795, 0.11730205],
       [0.88269795, 0.11730205],
       ...,
       [0.88269795, 0.11730205],
       [0.88269795, 0.11730205],
       [0.88269795, 0.11730205]])

In [81]:
pd.crosstab(y_pred,y_train)

survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,380,74
1,55,203


In [82]:
Accuracy = (380+203)/(380+203+55+74)
print(Accuracy)
true_positive_rate = 203/(380+203+55+74) 
print(true_positive_rate)#false positive rate, true negative rate, false negative rate

0.8188202247191011
0.2851123595505618


In [58]:
# Accuracy = total number of (true positives + number of true negatives) divided by the total numbrer of observations
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [59]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       435
           1       0.79      0.73      0.76       277

    accuracy                           0.82       712
   macro avg       0.81      0.80      0.81       712
weighted avg       0.82      0.82      0.82       712



In [60]:
# Precision, Recall, f1 score, and more!
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.837004,0.786822,0.81882,0.811913,0.817481
recall,0.873563,0.732852,0.81882,0.803208,0.81882
f1-score,0.854893,0.758879,0.81882,0.806886,0.817539
support,435.0,277.0,0.81882,712.0,712.0


In [62]:
X_validate = validate[['fare', 'pclass','age','sex','sibsp']]
y_validate = validate.survived
clf.fit(X_validate, y_validate)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [64]:
y_pred = clf.predict(X_validate)
y_pred_proba = clf.predict_proba(X_validate)

In [65]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90        90
           1       0.83      0.81      0.82        53

    accuracy                           0.87       143
   macro avg       0.86      0.86      0.86       143
weighted avg       0.87      0.87      0.87       143



In [66]:
pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.89011,0.826923,0.867133,0.858516,0.866691
recall,0.9,0.811321,0.867133,0.85566,0.867133
f1-score,0.895028,0.819048,0.867133,0.857038,0.866867
support,90.0,53.0,0.867133,143.0,143.0
