In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from pydataset import data
from datetime import date
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')
from env import get_db_url, user, password, host
import acquire
import prepare
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [2]:
# Using titanic data:
# 1. What is your baseline prediction? What is your baseline accuracy? 

In [3]:
titanic = acquire.get_titanic_data()

Using cached csv


In [4]:
titanic = prepare.prep_titanic(titanic)

In [5]:
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [6]:
titanic.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [7]:
# Our target value is 'survived' and the baseline is 0 (did not survive)

In [8]:
# added baseline too early
# titanic['baseline'] = 0

In [9]:
# 2. Fit the decision tree classifier to training sample and transform
# Create train, validate, test split of data

In [10]:
def train_validate_test_split(df, target, seed=123):
    '''
    - Takes in df, name of target variable, and integer for setting a seed
    - Splits the data into train, validate, and test
    - Test is 20% of original dataset, validate is .3 * .8 = 24%, train is .7 * .8 = 56% of original dataset
    - Function returns train, validate, test df
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state = seed, stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=.3, random_state=seed, stratify=train_validate[target])
    return train, validate, test

In [11]:
train, val, test = train_validate_test_split(titanic, 'survived')

In [12]:
train.shape, val.shape, test.shape

((498, 10), (214, 10), (179, 10))

In [13]:
train['baseline'] = 0

In [14]:
sklearn.metrics.accuracy_score(train.baseline, train.survived)

0.6164658634538153

In [15]:
# Need to make sure you do not carry the baseline forward, will change features and modeling accuracy
train.drop(columns='baseline', inplace=True)

In [16]:
# [dataset.drop(columns=drops, inplace=True) for dataset in [train, validate, test]]

In [17]:
x_train = train.drop(columns='survived')
y_train = train.survived

x_validate = val.drop(columns=['survived'])
y_validate = val.survived

x_test = test.drop(columns='survived')
y_test = test.survived

In [18]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [19]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [20]:
clf = clf.fit(x_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
train.head()

In [None]:
plt.figure(figsize=(13,7))
plot_tree(clf, feature_names=x_train.columns, class_names=['not survived', 'survived'], rounded = True)

In [None]:
# 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report

In [None]:
accuracy = clf.score(x_train, y_train)
y_pred = clf.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
y_pred = clf.predict(x_train)

In [None]:
y_pred_proba = clf.predict_proba(x_train)

In [None]:
clf.score(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
labels = sorted(y_train.unique())

In [None]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred))

In [None]:
# 4. Conpute accuracy, true positive rate, false positive rate, true negative reate, false negative rate, precision
# recall, f1-score, support
clf.score(x_validate, y_validate)

In [None]:
y_pred = clf.predict(x_validate)

In [None]:
print(classification_report(y_validate, y_pred))

In [None]:
# 5. Repeat with a diff. max_depth

In [None]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

In [None]:
clf = clf.fit(x_train, y_train)

In [None]:
plt.figure(figsize=(20,20))
plot_tree(clf, feature_names=x_train.columns, class_names=['not survived', 'survived'], rounded = True)

In [None]:
y_pred = clf.predict(x_train)

In [None]:
y_pred_proba = clf.predict_proba(x_train)

In [None]:
clf.score(x_train, y_train)

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
clf.score(x_validate, y_validate)

In [None]:
y_pred = clf.predict(x_validate)

In [None]:
print(classification_report(y_validate, y_pred))

In [None]:
# 6. The larger max_depth performs better on the in sample (train) data
# 7. The larger max_depth performs better for the validate set as well. 

In [None]:
# from sklearn.tree import export_graphviz
# import graphviz

In [None]:
# dot_data = export_graphviz(clf, feature_names = x_train.columns, rounded=True, filled=True, out_file=None)
# graph = graphviz.Source(dot_data)

In [None]:
############################# Do these exercises using the telco dataset ########################################

In [None]:
telco = acquire.get_telco_data()

In [None]:
telco = prepare.prep_telco(telco)

In [None]:
telco.head()

In [None]:
# Locating the NaN values in df
is_nan = telco.isnull()
row_has_nan = is_nan.any(axis=1)
rows_with_nan = telco[row_has_nan]

In [None]:
telco.churn_Yes.value_counts()

In [None]:
telco.isnull().values.any()

In [None]:
train, validate, test = train_validate_test_split(telco, 'churn_Yes', seed=123)

In [None]:
train.shape, validate.shape, test.shape

In [None]:
train['baseline'] = 0

In [None]:
accuracy_score(train.churn_Yes, train.baseline)

In [None]:
train.drop(columns='baseline', inplace=True)

In [None]:
x_train = train.drop(columns='churn_Yes')
x_validate = validate.drop(columns='churn_Yes')
x_test = test.drop(columns='churn_Yes')

In [None]:
y_train = train.churn_Yes
y_validate = validate.churn_Yes
y_test = test.churn_Yes

In [None]:
clf = DecisionTreeClassifier(max_depth = 3, random_state=123)

In [None]:
clf = clf.fit(x_train, y_train)

In [None]:
plt.figure(figsize=(18, 18))
plot_tree(clf, feature_names=x_train.columns, class_names=['no churn', 'churn'], rounded=True)

In [None]:
y_pred = clf.predict(x_train)

In [None]:
y_prep_proba = clf.predict_proba(x_train)

In [None]:
clf.score(x_train, y_train)

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
clf.score(x_validate, y_validate)

In [None]:
y_pred = clf.predict(x_validate)

In [None]:
print(classification_report(y_validate, y_pred))

In [None]:
clf = DecisionTreeClassifier(max_depth = 2, random_state=123)

In [None]:
clf = clf.fit(x_train, y_train)

In [None]:
plt.figure(figsize=(13,7))
plot_tree(clf, feature_names = x_train.columns, class_names=['no churn', 'churn'], rounded = True)

In [None]:
y_pred = clf.predict(x_train)

In [None]:
clf.score(x_train, y_train)

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
clf.score(x_validate, y_validate)

In [None]:
y_pred = clf.predict(x_validate)
print(classification_report(y_validate, y_pred))

In [None]:
# There is not as much difference as I would've expected between the max_depth of 2 or 3 for this data. The larger
# max_depth still provides better results, but not by as much as I would have guessed. 

In [None]:
################################################################################################################

In [None]:
# Random Forest Exercise

In [None]:
# 1. Fit the Random Forest classifier to your training sample and tansform, setting random_state accordingly and 
# setting min_samples_leaf = 1 and max_depth = 10

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
titanic = acquire.get_titanic_data()

In [None]:
titanic = prepare.prep_titanic(titanic)

In [None]:
train, validate, test = train_validate_test_split(titanic, 'survived', seed=123)

In [None]:
train.shape, validate.shape, test.shape

In [None]:
train['baseline'] = 0
accuracy_score(train.survived, train.baseline)
train.drop(columns='baseline', inplace=True)

In [None]:
x_train = train.drop(columns='survived')
y_train = train.survived

In [None]:
x_validate = validate.drop(columns='survived')
y_validate = validate.survived
x_test = test.drop(columns='survived')
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', min_samples_leaf=1, max_depth=10,
                           n_estimators=100, random_state=123)

In [None]:
rf.fit(x_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(x_train)

In [None]:
y_pred_proba = rf.predict_proba(x_train)

In [None]:
rf.score(x_train, y_train)

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
rf.score(x_validate, y_validate)

In [None]:
accuracy = rf.score(x_train, y_train)
y_pred = rf.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
# 4. Repeat above steps using different min_samples_leaf and max_depth values

In [None]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', min_samples_leaf=2, max_depth=8,
                           n_estimators=100, random_state=123)

In [None]:
rf.fit(x_train, y_train)

In [None]:
accuracy = rf.score(x_train, y_train)
y_pred = rf.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
rf.score(x_validate, y_validate)

In [None]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', min_samples_leaf=3, max_depth=6,
                           n_estimators=100, random_state=123)

In [None]:
rf.fit(x_train, y_train)

In [None]:
accuracy = rf.score(x_train, y_train)
y_pred = rf.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
rf.score(x_validate, y_validate)

In [None]:
# It looks like the one with the highest max_depth and lowest min_sample_leaf performs the best in terms of metrics
# but I wonder to what degree this is just overfitting. 

In [None]:
#################################################         KNN Exercise

In [None]:
# 1. Fit a K-Nearest Neighbors classifier to your training sample and transform

In [None]:
titanic = acquire.get_titanic_data()

In [None]:
titanic = prepare.prep_titanic(titanic)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
titanic.head()

In [None]:
titanic['baseline'] = 0

In [None]:
titanic.survived.value_counts()

In [None]:
# Baseline accuracy for 'survival' target variable
accuracy_score(titanic.survived, titanic.baseline)

In [None]:
titanic.drop(columns='baseline', inplace=True)

In [None]:
train, validate, test = train_validate_test_split(titanic, 'survived', seed=123)

In [None]:
train.shape, validate.shape, test.shape

In [None]:
x_train = train.drop(columns='survived')
y_train = train.survived

In [None]:
x_validate = validate.drop(columns='survived')
y_validate = validate.survived
x_test = test.drop(columns='survived')
y_test = test.survived

In [None]:
# Create KNN object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
# Fit model to training data
knn.fit(x_train, y_train)

In [None]:
y_pred = knn.predict(x_train)

In [None]:
y_pred_proba = knn.predict_proba(x_train)

In [None]:
knn.score(x_train, y_train)

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
knn.score(x_validate, y_validate)

In [None]:
# Reports for the training set

accuracy = knn.score(x_train, y_train)
y_pred = knn.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
# Reports for validate set
y_pred = knn.predict(x_validate)

accuracy = knn.score(x_validate, y_validate)
y_pred = knn.predict(x_validate)
conf = confusion_matrix(y_validate, y_pred)
class_report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True)).T
conf = confusion_matrix(y_validate, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
# Do same with k=10

In [None]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [None]:
knn.fit(x_train, y_train)

In [None]:
y_pred = knn.predict(x_train)

In [None]:
# Reports for the training set

accuracy = knn.score(x_train, y_train)
y_pred = knn.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
y_pred = knn.predict(x_validate)

In [None]:
# Reports for validate set
y_pred = knn.predict(x_validate)

accuracy = knn.score(x_validate, y_validate)
y_pred = knn.predict(x_validate)
conf = confusion_matrix(y_validate, y_pred)
class_report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True)).T
conf = confusion_matrix(y_validate, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
# k = 20

In [None]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [None]:
knn.fit(x_train, y_train)

In [None]:
y_pred = knn.predict(x_train)

In [None]:
# Reports for the training set

accuracy = knn.score(x_train, y_train)
y_pred = knn.predict(x_train)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
y_pred = knn.predict(x_validate)

In [None]:
# Reports for validate set

accuracy = knn.score(x_validate, y_validate)
y_pred = knn.predict(x_validate)
conf = confusion_matrix(y_validate, y_pred)
class_report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True)).T
conf = confusion_matrix(y_validate, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy: .4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
# The metrics go down as the number of neighbors increases. This seems to be the case for the in-sample and 
# out-of-sample datasets, but it appears to be relatively consistent in drop for both.

In [None]:
######################################    Logistic Regression

In [None]:
# 1. Create a model that includes age in addition to fare and pclass, does this perform better than baseline?

In [None]:
titanic = acquire.get_titanic_data()

In [None]:
titanic = prepare.prep_titanic(titanic)

In [27]:
titanic.head()
# Handle missing ages
avg_age = titanic.age.mean()
titanic.age = titanic.age.fillna(avg_age)

In [34]:
# Checking for nulls
titanic.isna().sum()

survived                   0
pclass                     0
age                        0
sibsp                      0
parch                      0
fare                       0
alone                      0
sex_male                   0
embark_town_Queenstown     0
embark_town_Southampton    0
dtype: int64

In [39]:
train, validate, test = train_validate_test_split(titanic, 'survived', seed=123)

In [40]:
x_train = train.drop(columns='survived')
y_train = train.survived
x_validate = validate.drop(columns='survived')
y_validate = validate.survived
x_test = test.drop(columns='survived')
y_test = test.survived

In [41]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [42]:
baseline_acc = (train.survived == 0).mean()
round(baseline_acc, 2)

0.62

In [46]:
from sklearn.linear_model import LogisticRegression

# Create the logistic regression
logit = LogisticRegression(random_state=123)

# Specify the features we're using
features = ['age', 'pclass', 'fare']

# Fit model using these features
logit.fit(x_train[features], y_train)

LogisticRegression(random_state=123)

In [47]:
# Predict on same subset that you fit on
y_pred = logit.predict(x_train[features])

In [48]:
logit.score(x_train[features], y_train)

0.7028112449799196

In [49]:
# Does beat the baseline (62%)

In [50]:
# 2. Include sex in the model as well. 

In [51]:
features2 = ['age', 'pclass', 'fare', 'sex_male']
logit = LogisticRegression(random_state=123)
logit.fit(x_train[features2], y_train)
y_pred = logit.predict(x_train[features2])

In [52]:
logit.score(x_train[features2], y_train)

0.8132530120481928

In [53]:
# This is a better result than the baseline and model1

In [54]:
# 3. Try out some other features and models

In [62]:
# Let's just try it with all the features
logit = LogisticRegression(random_state=123)

In [64]:
logit.fit(x_train, y_train)
y_pred = logit.predict(x_train)
logit.score(x_train, y_train)

0.8152610441767069

In [65]:
# Using just age and gender
f3 = ['age', 'sex_male']
logit.fit(x_train[f3], y_train)

LogisticRegression(random_state=123)

In [66]:
y_pred = logit.predict(x_train[f3])

In [67]:
logit.score(x_train[f3], y_train)

0.7991967871485943

In [68]:
# Age, gender, and fare
f4 = ['age', 'sex_male', 'fare']
logit.fit(x_train[f4], y_train)

LogisticRegression(random_state=123)

In [69]:
y_pred = logit.predict(x_train[f4])

In [70]:
logit.score(x_train[f4], y_train)

0.7931726907630522

In [71]:
# Just fare and pclass
f5 = ['fare', 'pclass']
logit.fit(x_train[f5], y_train)
y_pred = logit.predict(x_train[f5])
logit.score(x_train[f5], y_train)

0.6666666666666666

In [72]:
# 4. Use best 3 models to predict and evaluate validate sample

In [74]:
features2 = ['age', 'pclass', 'fare', 'sex_male']
logit.fit(x_train[features2], y_train)
y_pred = logit.predict(x_validate[features2])

In [75]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214



In [76]:
# all features
logit.fit(x_train, y_train)
y_pred = logit.predict(x_validate)
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       132
           1       0.73      0.65      0.68        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



In [77]:
f3 = ['age', 'sex_male']
logit.fit(x_train[f3], y_train)
y_pred = logit.predict(x_validate[f3])

In [78]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



In [79]:
# 5. Choose best model and use it on test set (features2)

In [80]:
logit.fit(x_train[features2], y_train)

LogisticRegression(random_state=123)

In [81]:
y_pred = logit.predict(x_test[features2])

In [82]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       110
           1       0.77      0.71      0.74        69

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [83]:
# Results improved on the test set