# Sigorta Masrafı Tahmini (Insurance Charges Prediction)

## Problem
Bir sigorta şirketi, müşterinin bazı özelliklerine bakarak yıllık sağlık/sigorta masrafını (**charges**) tahmin etmek istiyor.
Bu bir **regresyon** problemidir.

## Veri Sözlüğü (Sütunlar neyi ifade ediyor?)
- **age**: Müşterinin yaşı (tam sayı)
- **sex**: Müşterinin cinsiyeti (kategorik: `female`, `male`)
- **bmi**: Body Mass Index / Vücut Kitle İndeksi (ondalıklı sayı)
- **children**: Bakmakla yükümlü olduğu çocuk sayısı (tam sayı)
- **smoker**: Sigara içiyor mu? (kategorik: `yes`, `no`)
- **region**: Yaşadığı bölge (kategorik: `northeast`, `northwest`, `southeast`, `southwest`)
- **charges**: Yıllık sağlık/sigorta masrafı (hedef değişken — modelin tahmin edeceği değer)

In [2]:
#gerekli kütüphanelerin yüklenmesi
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import matplotlib.pyplot as plt


In [3]:
#csv yüklenip okunması
df = pd.read_csv("insurance.csv")

print("Satır, sütun:", df.shape)
df.head()


Satır, sütun: (1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
#veri kalitesi
n_rows, n_cols = df.shape
missing_cells = int(df.isna().sum().sum())
missing_rows = int(df.isna().any(axis=1).sum())
dup_rows = int(df.duplicated().sum())

#basit kpi'lar
charges = df["charges"]

kpis = {
    "records": n_rows,
    "features": n_cols - 1,           # charges hedef varsayıldı
    "missing_cells": missing_cells,
    "missing_rows": missing_rows,
    "duplicate_rows": dup_rows,
    "age_mean": float(df["age"].mean()),
    "age_min": int(df["age"].min()),
    "age_max": int(df["age"].max()),
    "bmi_mean": float(df["bmi"].mean()),
    "charges_mean": float(charges.mean()),
    "charges_median": float(charges.median()),
    "charges_std": float(charges.std()),
    "charges_p90": float(charges.quantile(0.90)),
    "charges_p95": float(charges.quantile(0.95)),
    "charges_p99": float(charges.quantile(0.99)),
    "charges_max": float(charges.max()),
    "smoker_rate": float((df["smoker"] == "yes").mean()),
    "obese_rate_bmi_ge_30": float((df["bmi"] >= 30).mean()),
}

kpi_df = (
    pd.DataFrame(kpis.items(), columns=["KPI", "Value"])
    .assign(Value=lambda d: d["Value"].round(4))
)

kpi_df

Unnamed: 0,KPI,Value
0,records,1338.0
1,features,6.0
2,missing_cells,0.0
3,missing_rows,0.0
4,duplicate_rows,1.0
5,age_mean,39.207
6,age_min,18.0
7,age_max,64.0
8,bmi_mean,30.6634
9,charges_mean,13270.4223


In [5]:
df_dash = df.copy()

# Obez mi?
df_dash["is_obese"] = (df_dash["bmi"] >= 30).astype(int)

# BMI
bmi_bins = [0, 18.5, 25, 30, 35, 40, np.inf]
bmi_labels = ["Underweight(<18.5)", "Normal(18.5-24.9)", "Overweight(25-29.9)",
              "Obese I(30-34.9)", "Obese II(35-39.9)", "Obese III(40+)"]
df_dash["bmi_bucket"] = pd.cut(df_dash["bmi"], bins=bmi_bins, labels=bmi_labels, right=False)

# Yaş bandı
age_bins = [18, 26, 36, 46, 56, 65]
age_labels = ["18-25", "26-35", "36-45", "46-55", "56-64"]
df_dash["age_band"] = pd.cut(df_dash["age"], bins=age_bins, labels=age_labels, right=False)

df_dash[["age","age_band","bmi","bmi_bucket","smoker","is_obese","charges"]].head()


Unnamed: 0,age,age_band,bmi,bmi_bucket,smoker,is_obese,charges
0,19,18-25,27.9,Overweight(25-29.9),yes,0,16884.924
1,18,18-25,33.77,Obese I(30-34.9),no,1,1725.5523
2,28,26-35,33.0,Obese I(30-34.9),no,1,4449.462
3,33,26-35,22.705,Normal(18.5-24.9),no,0,21984.47061
4,32,26-35,28.88,Overweight(25-29.9),no,0,3866.8552


In [6]:
#maliyet yoğunluğu

def top_cost_share(df_in: pd.DataFrame, top_q: float) -> float:
    """
    top_q: 0.90 -> top 10% (charges >= p90)
    """
    thr = df_in["charges"].quantile(top_q)
    return float(df_in.loc[df_in["charges"] >= thr, "charges"].sum() / df_in["charges"].sum())

concentration = pd.DataFrame({
    "segment": ["Top 5%", "Top 10%", "Top 20%"],
    "cost_share": [top_cost_share(df_dash, 0.95),
                   top_cost_share(df_dash, 0.90),
                   top_cost_share(df_dash, 0.80)]
})
concentration["cost_share"] = (concentration["cost_share"] * 100).round(2)
concentration

#segmentte bulunan yüzde -- cost_share kadar charges'ı kapsıyor

Unnamed: 0,segment,cost_share
0,Top 5%,17.58
1,Top 10%,31.98
2,Top 20%,51.69


In [7]:
total_cost = df_dash["charges"].sum()

seg_smoker = (
    df_dash.groupby("smoker", as_index=False)
    .agg(
        n=("charges", "size"),
        mean_charges=("charges", "mean"),
        sum_charges=("charges", "sum")
    )
)

seg_smoker["pct_records"] = seg_smoker["n"] / len(df_dash)
seg_smoker["cost_share"] = seg_smoker["sum_charges"] / total_cost

seg_smoker.sort_values("sum_charges", ascending=False)


Unnamed: 0,smoker,n,mean_charges,sum_charges,pct_records,cost_share
0,no,1064,8434.268298,8974061.0,0.795217,0.505415
1,yes,274,32050.231832,8781764.0,0.204783,0.494585


In [8]:
# Flag kolonlar
df_dash["is_smoker"] = (df_dash["smoker"] == "yes").astype(int)
df_dash["is_obese"]  = (df_dash["bmi"] >= 30).astype(int)

total_cost = df_dash["charges"].sum()

region_profile_simple = (
    df_dash.groupby("region", as_index=False)
    .agg(
        n=("charges", "size"),
        mean_charges=("charges", "mean"),
        sum_charges=("charges", "sum"),
        mean_bmi=("bmi", "mean"),
        smoker_rate=("is_smoker", "mean"),
        obese_rate=("is_obese", "mean")
    )
)

# cost share
region_profile_simple["cost_share"] = region_profile_simple["sum_charges"] / total_cost

# Oranları yüzdeye çevir (dashboard’da daha okunur)
for c in ["smoker_rate", "obese_rate", "cost_share"]:
    region_profile_simple[c] = region_profile_simple[c] * 100

# Yuvarla (dashboard formatı)
region_profile_simple = region_profile_simple.round({
    "mean_charges": 0,
    "sum_charges": 0,
    "mean_bmi": 2,
    "smoker_rate": 2,
    "obese_rate": 2,
    "cost_share": 2
})

# Sırala ve döndür
region_profile_simple = region_profile_simple.sort_values("mean_charges", ascending=False)

region_profile_simple


Unnamed: 0,region,n,mean_charges,sum_charges,mean_bmi,smoker_rate,obese_rate,cost_share
2,southeast,364,14735.0,5363690.0,33.36,25.0,66.76,30.21
0,northeast,324,13406.0,4343669.0,29.17,20.68,44.14,24.46
1,northwest,325,12418.0,4035712.0,29.2,17.85,45.54,22.73
3,southwest,325,12347.0,4012755.0,30.6,17.85,53.23,22.6


In [9]:
#sigara kullananların yüzdesel ifadesi
counts = df["smoker"].value_counts()
perc = (counts / len(df) * 100).round(2)

counts, perc

(smoker
 no     1064
 yes     274
 Name: count, dtype: int64,
 smoker
 no     79.52
 yes    20.48
 Name: count, dtype: float64)

In [10]:
#cinsiyet ve bölgenin yüzdesel ifadesi
print((df["sex"].value_counts(normalize=True) * 100).round(2))
print((df["region"].value_counts(normalize=True) * 100).round(2))

sex
male      50.52
female    49.48
Name: proportion, dtype: float64
region
southeast    27.20
southwest    24.29
northwest    24.29
northeast    24.22
Name: proportion, dtype: float64


In [11]:
#sigara kullanımına göre masraf değişimi
df.groupby("smoker")["charges"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,1064.0,8434.268298,5993.781819,1121.8739,3986.4387,7345.4053,11362.88705,36910.60803
yes,274.0,32050.231832,11541.547176,12829.4551,20826.244213,34456.34845,41019.207275,63770.42801


In [12]:
#sigara kullanımına göre masraf farkı
g = df.groupby("smoker")["charges"]
mean_no, mean_yes = g.mean()["no"], g.mean()["yes"]
median_no, median_yes = g.median()["no"], g.median()["yes"]

print("Ortalama fark:", round(mean_yes - mean_no, 2))
print("Ortalama oran:", round(mean_yes / mean_no, 2))

print("Medyan fark:", round(median_yes - median_no,2))
print("Medyan oran:", round(median_yes / median_no, 2))

Ortalama fark: 23615.96
Ortalama oran: 3.8
Medyan fark: 27110.94
Medyan oran: 4.69


In [13]:
#cinsiyete göre sigara kullanımı
ct = pd.crosstab(df["sex"], df["smoker"], normalize="index") * 100
ct.round(2)

smoker,no,yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,82.63,17.37
male,76.48,23.52


In [14]:
# Hedef kolon = tahmin edeceğimiz değer
target_col = "charges"

# X: özellikler (features), y: hedef (target)
X = df.drop(columns=[target_col])
y = df[target_col]

# Kategorik kolonlar
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Sayısal kolonlar (int, float)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

Categorical columns: ['sex', 'smoker', 'region']
Numeric columns: ['age', 'bmi', 'children']


In [15]:
# Train/test ayırma: modeli değerlendirmek için
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# OneHotEncoder: kategorik değişkenleri sayısala çevirir
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

In [16]:
# Linear Regression
lr = Pipeline(steps=[
    ('prep', preprocess),
    ('model', LinearRegression())
])

lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

print("LR MAE :", round(mean_absolute_error(y_test, pred_lr),1))
print("LR RMSE:", round(np.sqrt(mean_squared_error(y_test, pred_lr)),1))
print("LR R2  :", round(r2_score(y_test, pred_lr),1))

LR MAE : 4181.2
LR RMSE: 5796.3
LR R2  : 0.8


In [17]:
#random forest ile reg tahmin:
rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    min_samples_leaf=2
)

rf_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", rf)
])

rf_pipe.fit(X_train, y_train)
pred_rf = rf_pipe.predict(X_test)

print("RF MAE :", round(mean_absolute_error(y_test, pred_rf), 1))
print("RF RMSE:", round(np.sqrt(mean_squared_error(y_test, pred_rf)), 1))
print("RF R2  :", round(r2_score(y_test, pred_rf), 2))

RF MAE : 2437.8
RF RMSE: 4457.0
RF R2  : 0.87


In [18]:
def bmi_from_height_weight(height_cm, weight_kg):
    h_m = height_cm / 100
    return weight_kg / (h_m ** 2)

In [19]:
bmi = bmi_from_height_weight(170, 80)

one = pd.DataFrame([{
    "age": 31,
    "sex": "female",
    "bmi": bmi,
    "children": 0,
    "smoker": "no",
    "region": "southwest"
}])

In [20]:
round((lr.predict(one)[0]),0)

np.float64(4557.0)

In [21]:
round((rf_pipe.predict(one)[0]),0)

np.float64(5182.0)

In [22]:
print(df["charges"].mean())

13270.422265141257
