In [14]:
import pandas as pd
import numpy as np
import joblib
import os

# Scikit-Learn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load the raw insurance dataset.
# The path is resolved relative to the working directory rather than a
# machine-specific absolute path. (Previously an absolute component was passed
# to os.path.join, which discards every component before it, so `current_dir`
# was silently ignored.)
# NOTE(review): assumes the notebook runs from a folder directly under the
# project root, matching the "../models" path used when saving the model —
# confirm for your layout.
current_dir = os.getcwd()
file_path = os.path.join(current_dir, "..", "data", "raw", "insurance.csv")
df = pd.read_csv(file_path)
print(f"Veri Seti Boyutu: {df.shape}")
df.head()

Veri Seti Boyutu: (1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
# Target is the 'charges' column; every other column is a candidate feature.
target_column = 'charges'
y = df[target_column]
X = df.drop(columns=target_column)

# Column groups consumed by the preprocessing step.
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

# Split into training and test sets (80% train, 20% test); the fixed seed
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Eƒüitim Seti: {X_train.shape}")
print(f"Test Seti: {X_test.shape}")


Eƒüitim Seti: (1070, 6)
Test Seti: (268, 6)


In [16]:
# Numeric columns: if missing values show up in future data, fill them with
# the column median first, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: one-hot encode, dropping the first level of each feature.
# NOTE(review): with drop='first', a category unseen during fit that is ignored
# at transform time appears to encode as all zeros, i.e. the same as the
# dropped first level — confirm against the scikit-learn docs that this is
# acceptable for inference.
onehot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [12]:
# Candidate regressors, keyed by the label used in the printed report.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
}
results = {}

# Track the best pipeline together with its name. The loop variable `name`
# must not be used after the loop: it always holds the LAST model iterated,
# which is not necessarily the best one.
best_model = None
best_name = None
best_score = -np.inf

for name, model in models.items():
    # Build the pipeline: preprocessing first, then the estimator.
    # The same `preprocessor` instance is shared by every pipeline and is
    # refit on the same training data on each iteration.
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Evaluate on the held-out test set.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = {"R2": r2, "RMSE": rmse}
    print(f"--- {name} ---")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}\n")

    # Keep the pipeline with the highest R2 seen so far.
    if r2 > best_score:
        best_score = r2
        best_model = clf
        best_name = name

print(f"üèÜ En Ba≈üarƒ±lƒ± Model: {best_name} (R2: {best_score})")



--- LinearRegression ---
R2 Score: 0.7836
RMSE: 5796.2847

--- RandomForestRegressor ---
R2 Score: 0.8656
RMSE: 4567.7565

üèÜ En Ba≈üarƒ±lƒ± Model: RandomForestRegressor (R2: 0.865606633433326)


In [None]:
# Persist the best fitted pipeline (preprocessing + estimator) with joblib.
model_path = "../models/insurance_model_pipline.joblib"

# Create the target directory if needed. exist_ok=True replaces the separate
# existence check, and the directory is derived from `model_path` so the two
# cannot drift apart.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(best_model, model_path)

['../models/insurance_model_pipline.joblib']