<a href="https://colab.research.google.com/github/tejatanush/Medical-Charges-Prediction/blob/main/Medical_Charges_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Load the raw dataset straight from the project repository (needs network access).
data = pd.read_csv("https://raw.githubusercontent.com/tejatanush/Medical-Charges-Prediction/refs/heads/main/Health_Price_Prediction.csv")

# Quick inspection. A bare expression in the middle of a notebook cell is
# evaluated and thrown away (only the cell's last expression is displayed),
# so the previews are printed explicitly.
print(data.head())
print(data.isnull().sum())
data.info()  # info() writes its summary to stdout by itself

# Remove exact duplicate rows in place, then show the remaining (rows, columns).
data.drop_duplicates(inplace=True)
data.shape

In [None]:
# Strip leading/trailing whitespace from every column header so that later
# name-based column selection is not tripped up by stray spaces in the CSV header.
data.columns = data.columns.str.strip()


In [None]:
# Per-column count of missing entries (after duplicates were dropped and the
# headers cleaned), printed for a quick visual check.
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Split the *feature* columns by dtype: object columns are treated as
# categorical, everything else as numeric.
#
# The last column of `data` is the prediction target and is sliced off the
# feature matrix further down (`data.iloc[:, :-1]`). It is therefore excluded
# here as well; selecting over all of `data` would put the target's name into
# `num_cols`, and the ColumnTransformer would then be asked for a column that
# the feature matrix does not contain.
# NOTE(review): assumes the target really is the last column — confirm against the CSV.
cat_cols = data.iloc[:, :-1].select_dtypes(include="object").columns
num_cols = data.iloc[:, :-1].select_dtypes(exclude="object").columns

In [None]:
# Stand-alone encoder instances.
# NOTE(review): none of the later cells visible here reference these three
# objects (the categorical pipeline constructs its own OneHotEncoder) —
# confirm they are still needed before keeping them.
ordinal_encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()


In [None]:
# Replace every categorical column with integer codes, numbering the distinct
# values in order of first appearance. The value -> code tables are kept in
# `ordinal_mapping`, keyed by column name, so the encoding can be inspected or
# reversed later.
# NOTE(review): `unique()` also returns NaN when a column has missing entries,
# so NaN would receive its own code here — confirm that is intended given the
# most-frequent imputer used for these columns downstream.
ordinal_mapping = {}
for col in cat_cols:
    distinct_values = data[col].unique()
    codes = dict(zip(distinct_values, range(len(distinct_values))))
    ordinal_mapping[col] = codes
    data[col] = data[col].map(codes)

In [None]:
# Numeric features: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)


In [None]:
# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fitting are ignored at transform
# time (handle_unknown="ignore") instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Route each column group through its own pipeline and stack the results
# side by side: numeric columns -> num_pipeline, categorical -> cat_pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ],
    # The one-hot step produces sparse output by default, and with the default
    # sparse_threshold (0.3) the stacked result becomes a SciPy sparse matrix
    # whenever its density falls below that value. The transformed data is
    # wrapped in a pandas DataFrame afterwards, which needs a dense array, so
    # always return dense output. Results are unchanged whenever the output
    # was already dense.
    sparse_threshold=0,
)


In [None]:
# Features are every column except the last one; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Sanity check: show the name of the last column, i.e. the one used as the target.
print(data.iloc[:, -1].name)  # Should print 'charges'


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Show which feature columns ended up in the training split.
print("Columns in X_train:", X_train.columns)


In [None]:
# The split keeps the original (shuffled) row labels; renumber both feature
# frames from 0 and discard the old labels.
# Only the feature frames are renumbered in this cell — y_train / y_test keep
# their original index.
X_train, X_test = (
    frame.reset_index(drop=True) for frame in (X_train, X_test)
)


In [None]:
# Make sure both splits are pandas DataFrames before preprocessing, since the
# ColumnTransformer selects its columns by name.
# (pandas is already imported as `pd` at the top of the file, so it is not
# re-imported here.)
# NOTE(review): both objects are presumably DataFrames already at this point
# (they originate from `data.iloc` via train_test_split), which would make
# this a no-op guard — confirm.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)


In [None]:
# Fit the preprocessor on the training rows only, then apply that already
# fitted transformer to the test rows without refitting, so nothing is learnt
# from the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [None]:
# Wrap the transformed matrices in DataFrames again, labelling the columns
# with the output feature names reported by the fitted preprocessor.
X_train, X_test = (
    pd.DataFrame(matrix, columns=preprocessor.get_feature_names_out())
    for matrix in (X_train, X_test)
)

In [None]:
# Train a 40-tree random forest (seeded for reproducibility) on the
# preprocessed training data; fit() returns the fitted estimator itself.
regressor = RandomForestRegressor(
    n_estimators=40,
    random_state=42,
).fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = regressor.predict(X_test)

In [None]:
def evaluate_model(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned position by position
            with ``y_true``.

    Returns:
        A ``(mae, mse, r2)`` tuple: mean absolute error, mean squared
        error and R^2 score, in that order.
    """
    metrics = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(metric(y_true, y_pred) for metric in metrics)

# Score the held-out predictions and report the three metrics on one line.
mae, mse, r2 = evaluate_model(y_test, y_pred)
print(f"MAE: {mae}, MSE: {mse}, R2 Score: {r2}")