In [2]:
import pandas as pd

# Load the dataset from a CSV file located in the working directory
df = pd.read_csv('insurance2.csv')

# DataFrame.info() prints its summary and returns None, so it is called as a
# plain statement instead of being placed in a displayed tuple (where it would
# show up as a stray `None` next to the preview).
df.info()

# Keep the preview as the cell's final expression so it is displayed on its own.
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1338 non-null   int64  
 1   sex             1338 non-null   int64  
 2   bmi             1338 non-null   float64
 3   children        1338 non-null   int64  
 4   smoker          1338 non-null   int64  
 5   region          1338 non-null   int64  
 6   charges         1338 non-null   float64
 7   insuranceclaim  1338 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 83.8 KB


(   age  sex     bmi  children  smoker  region      charges  insuranceclaim
 0   19    0  27.900         0       1       3  16884.92400               1
 1   18    1  33.770         1       0       2   1725.55230               1
 2   28    1  33.000         3       0       2   4449.46200               0
 3   33    1  22.705         0       0       1  21984.47061               0
 4   32    1  28.880         0       0       1   3866.85520               1,
 None)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Feature matrix: every column except the target column
X = df.drop(columns=["insuranceclaim"])
# Target vector: the `insuranceclaim` column
y = df["insuranceclaim"]

# Hold out 25% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so that it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit a classifier, predict on held-out features and score the predictions.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this function.
    X_fit, y_fit : features and labels used for fitting.
    X_eval, y_eval : features and true labels used for evaluation.

    Returns
    -------
    tuple
        ``(predictions, scores)`` where ``scores`` is a dict with the keys
        ``"accuracy"`` (from ``accuracy_score``) and ``"cm"`` (from
        ``confusion_matrix``), both computed against ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    scores = {
        "accuracy": accuracy_score(y_eval, predictions),
        "cm": confusion_matrix(y_eval, predictions),
    }
    return predictions, scores


# Maps a model's display name to its {"accuracy", "cm"} scores
results = {}

# Logistic Regression -- trained and evaluated on the standardized features
lr = LogisticRegression(max_iter=1000)
lr_pred, results["Logistic Regression"] = fit_and_score(
    lr, X_train_scaled, y_train, X_test_scaled, y_test
)

# KNN -- trained and evaluated on the standardized features
knn = KNeighborsClassifier(n_neighbors=5)
knn_pred, results["KNN"] = fit_and_score(
    knn, X_train_scaled, y_train, X_test_scaled, y_test
)

# Decision Tree -- trained and evaluated on the unscaled features
dt = DecisionTreeClassifier(random_state=42)
dt_pred, results["Decision Tree"] = fit_and_score(
    dt, X_train, y_train, X_test, y_test
)

results


{'Logistic Regression': {'accuracy': 0.8626865671641791,
  'cm': array([[117,  22],
         [ 24, 172]])},
 'KNN': {'accuracy': 0.8805970149253731,
  'cm': array([[120,  19],
         [ 21, 175]])},
 'Decision Tree': {'accuracy': 0.9671641791044776,
  'cm': array([[131,   8],
         [  3, 193]])}}