In [1]:
import pandas as pd

In [2]:
# Read the insurance dataset into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="insurance_ML3.csv")
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,price_range
0,19,female,27.9,0,True,southwest,16884.924,expensive
1,18,male,33.77,1,False,southeast,1725.5523,cheap
2,28,male,33.0,3,False,southeast,4449.462,cheap
3,33,male,22.705,0,False,northwest,21984.47061,expensive
4,32,male,28.88,0,False,northwest,3866.8552,cheap


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

# Feature matrix and regression target.
feature_columns = ['age', 'bmi', 'children', 'smoker']
X = data[feature_columns]
y = data['charges']

# Holdout: reserve 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=6
)

# Baseline: a dummy regressor that always predicts the mean of the training target.
baseline_model = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Score the constant-mean prediction on the held-out rows.
baseline_model.score(X_test, y_test)

-0.001233635021205659

In [4]:
# Instantiate the linear regression, then fit it on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
model.score(X_test, y_test)

0.773425820295562

In [5]:
# Re-display the first five rows as a reminder of the available columns.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,price_range
0,19,female,27.9,0,True,southwest,16884.924,expensive
1,18,male,33.77,1,False,southeast,1725.5523,cheap
2,28,male,33.0,3,False,southeast,4449.462,cheap
3,33,male,22.705,0,False,northwest,21984.47061,expensive
4,32,male,28.88,0,False,northwest,3866.8552,cheap


In [6]:
from sklearn.preprocessing import LabelEncoder
# Learn the integer code for each distinct price_range label.
le = LabelEncoder().fit(data['price_range'])

# A label's position in classes_ is the integer it is encoded as; print it to confirm the mapping.
print(le.classes_)

# Store the encoded labels next to the original strings and show both for comparison.
encoded_labels = le.transform(data['price_range'])
data['price_range_encoded'] = encoded_labels
data[['price_range', 'price_range_encoded']].head()

['cheap' 'expensive']


Unnamed: 0,price_range,price_range_encoded
0,expensive,1
1,cheap,0
2,cheap,0
3,expensive,1
4,cheap,0


In [12]:
# Cross-validated probability of the positive class (encoded 1, i.e. 'expensive') per row.
#
# The imports and the classifier are stated in this cell so that it runs top-to-bottom on
# a fresh kernel: it must not depend on names bound by cells further down, and the `model`
# bound above this point is a LinearRegression, which has no predict_proba.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

cross_val_predict(LogisticRegression(),
                  X,
                  data['price_range_encoded'],
                  cv=5,
                  method='predict_proba')[:, 1]  # column 1 -> class encoded as 1

array([0.9276895 , 0.04188379, 0.08768931, ..., 0.02981288, 0.03104599,
       0.99423399])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Classifier handed to cross_val_predict below.
model = LogisticRegression()

# Predict class probabilities with 5-fold cross-validation.
# The result has one column per class, in encoded order (0 = cheap, 1 = expensive).
cv_proba = cross_val_predict(
    estimator=model,
    X=X,
    y=data['price_range_encoded'],
    cv=5,
    method='predict_proba',
)
data['proba_cheap'], data['proba_expensive'] = cv_proba.T

# Precision / recall values and the thresholds they were computed at,
# using the 'expensive' probability as the score.
pr_curve = precision_recall_curve(
    data['price_range_encoded'],
    data['proba_expensive'],
)
precision, recall, threshold = pr_curve

# Peek at the first five entries of each array.
print(f'precision- {precision[:5]}')
print(f'recall- {recall[:5]}')
print(f'threshold- {threshold[:5]}')

precision- [0.31390135 0.31413613 0.31437126 0.31460674 0.31484258]
recall- [1. 1. 1. 1. 1.]
threshold- [0.0234146  0.02359012 0.02403223 0.02427382 0.02464116]


In [15]:
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve

In [16]:
# Fit a fresh logistic regression on every row; custom_predict reads this fitted `model`.
model = LogisticRegression().fit(X, data['price_range_encoded'])

def custom_predict(X, custom_threshold, estimator=None):
    """Label samples 1 or 0 by comparing the class-1 probability to a custom threshold.

    Parameters
    ----------
    X : array-like
        Samples to classify; passed unchanged to ``predict_proba``.
    custom_threshold : float
        A sample is labelled 1 only when its class-1 probability is strictly
        greater than this value.
    estimator : object with ``predict_proba``, optional
        Fitted classifier to use. When omitted, the notebook-level ``model`` is
        used, which keeps existing two-argument calls working. Passing it
        explicitly avoids depending on whichever object ``model`` currently
        refers to (the name is rebound several times in this notebook).

    Returns
    -------
    Integer array with one 0/1 label per sample.
    """
    clf = model if estimator is None else estimator
    # Column 1 of predict_proba holds the likelihood of the class encoded as 1 ('expensive').
    expensive_probs = clf.predict_proba(X)[:, 1]
    # The boolean mask becomes 0/1 integers so it can be fed to the sklearn metric functions.
    return (expensive_probs > custom_threshold).astype(int)
    
    
# Re-label every row using the custom probability threshold instead of the default cut-off.
updated_preds = custom_predict(X=X, custom_threshold=0.305539)

# NOTE(review): these metrics are in-sample; the model was fit on the same X it predicts here.
for label, metric in (
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(data['price_range_encoded'], updated_preds))

Recall: 0.8071428571428572
Precision: 0.9287671232876712
F1 Score: 0.8636942675159236
