In [2]:
# Required libraries
import pandas as pd

# Load the dataset from the CSV file in the working directory
df = pd.read_csv("healthcare_dataset.csv")

# Peek at the first 5 rows
print(df.head(n=5))


            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [3]:
# Structure of the data: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Are there any missing values? (count of nulls per column)
print(df.isnull().sum())

# Basic summary statistics for every column, numeric and non-numeric alike
print(df.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [8]:
# Normalise the column names: trim surrounding whitespace, lower-case them,
# and turn inner spaces into underscores (e.g. "Blood Type" -> "blood_type").
df.columns = [
    column.strip().lower().replace(" ", "_")
    for column in df.columns
]


In [9]:
# Number of missing values in each column (isna is the same check as isnull)
print(df.isna().sum())


age                   0
gender                0
blood_type            0
medical_condition     0
date_of_admission     0
doctor                0
hospital              0
insurance_provider    0
billing_amount        0
room_number           0
admission_type        0
discharge_date        0
medication            0
test_results          0
dtype: int64


In [10]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that are replaced by integer codes.
categorical_cols = ['gender', 'blood_type', 'medical_condition', 'doctor',
                    'hospital', 'insurance_provider', 'admission_type',
                    'medication', 'test_results']

# Keep one fitted encoder per column. Refitting a single shared encoder on
# every iteration would leave only the last column's classes available, making
# it impossible to map the codes of the other columns back to their labels.
# NOTE(review): LabelEncoder is documented as a target (y) encoder and the
# integer codes impose an arbitrary order on the categories; consider
# OrdinalEncoder / OneHotEncoder for input features.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward compatibility: `le` still refers to the encoder fitted on the last
# column in `categorical_cols`, exactly as it did with the shared instance.
le = encoders[categorical_cols[-1]]


In [11]:
# Parse both date columns into datetime values
admitted_on = pd.to_datetime(df["date_of_admission"])
discharged_on = pd.to_datetime(df["discharge_date"])

# Length of stay in whole days
df["length_of_stay"] = (discharged_on - admitted_on).dt.days

# The raw date columns are no longer needed once the duration is derived
df.drop(columns=["date_of_admission", "discharge_date"], inplace=True)


In [12]:
# Target variable: the hospital billing amount
y = df['billing_amount']

# Input features: every column except the target and the free-text patient
# name. `name` is a string identifier that the regressors cannot consume; it is
# dropped here explicitly so the notebook does not depend on a cell that is no
# longer present. errors="ignore" keeps this working when `name` has already
# been removed (a missing `billing_amount` is still caught by the line above).
X = df.drop(columns=['billing_amount', 'name'], errors='ignore')


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

# The scorer yields the negated MSE because scikit-learn scoring treats a
# higher score as better.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Candidate regressors, keyed by the label used in the printed report
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

for name, model in models.items():
    # 5-fold cross-validation on the training split; scores are negated MSE
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, scoring=mse_scorer, cv=5
    )
    # Flip the sign back and take the root to get an RMSE per fold
    rmse_scores = np.sqrt(-neg_mse_per_fold)

    print(
        f"📊 {name}",
        f"Cross-validation RMSE scores: {rmse_scores}",
        f"Ortalama RMSE: {rmse_scores.mean():.2f}",
        "-" * 40,
        sep="\n",
    )


📊 Linear Regression
Cross-validation RMSE scores: [14186.91918855 14305.41095397 14163.71508346 14311.33712574
 14242.21889691]
Ortalama RMSE: 14241.92
----------------------------------------
📊 Decision Tree
Cross-validation RMSE scores: [19054.58678038 19408.55268481 18996.25367728 19011.09268549
 19180.2646611 ]
Ortalama RMSE: 19130.15
----------------------------------------


In [16]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try for the decision tree
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# negated MSE and run on all available cores; fitted on the training split.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Pull out the winning estimator and its parameter combination
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

print("🔍 En iyi hiperparametreler:", best_params, sep="\n")


🔍 En iyi hiperparametreler:
{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10}


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the held-out test split with the tuned model
y_pred = best_model.predict(X_test)

# Error metrics on the test split (RMSE is derived from the MSE)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)

print(
    "📈 Test Verisi Performansı",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Skoru: {r2:.4f}",
    sep="\n",
)


📈 Test Verisi Performansı
MAE: 12199.47
RMSE: 14117.10
R² Skoru: -0.0019


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predictions for the test split
y_pred = best_model.predict(X_test)

# Evaluation: apply each metric to the same (truth, prediction) pair
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

report_lines = [
    "📈 Test Verisi Değerlendirme Sonuçları",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.4f}",
]
for line in report_lines:
    print(line)


📈 Test Verisi Değerlendirme Sonuçları
Mean Absolute Error (MAE): 12199.47
Mean Squared Error (MSE): 199292636.73
Root Mean Squared Error (RMSE): 14117.10
R² Score: -0.0019


In [None]:
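# MAE ≈ 12199 TL: predictions miss the billed amount by about 12,199 TL on average → poor (the earlier figure of 3184 does not match the executed output)

# R² ≈ -0.0019 → the model explains essentially none of the variance in billing_amount (no better than always predicting the mean) → not successful (the earlier figure of 0.87 does not match the executed output)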