# Medical Cost Personal Insurance Project

### Step 1: Load the Dataset

In [None]:
import pandas as pd

# Load the dataset.
# NOTE: a GitHub "/blob/" URL serves the HTML viewer page rather than the CSV
# itself, so pd.read_csv cannot parse it. The raw.githubusercontent.com form
# of the same path returns the file contents directly.
url = 'https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Medical%20Cost%20Insurance/medical_cost_insurance.csv'
insurance_data = pd.read_csv(url)

# Display the first few rows of the dataset
insurance_data.head()

### Step 2: Data Preprocessing

In [None]:
# Check for missing values (count of nulls per column). Printed explicitly:
# a notebook cell only auto-displays its final expression, so a bare
# expression in the middle of the cell would be silently discarded.
print(insurance_data.isnull().sum())

# Convert categorical variables to numeric via one-hot encoding.
# drop_first=True drops the first level of each category so the indicator
# columns are not redundant. dtype=int forces 0/1 integer indicators; newer
# pandas versions default to bool, which describe() below would leave out of
# its numeric summary.
insurance_data = pd.get_dummies(insurance_data, drop_first=True, dtype=int)

# Statistical summary of the dataset
insurance_data.describe()

### Step 3: Feature Engineering (Correlation Analysis)

In [None]:
# Plotting libraries used by this cell; they are not imported in the cells
# above, so import them here to avoid a NameError on plt / sns.
import matplotlib.pyplot as plt
import seaborn as sns

# Check the correlation matrix: pairwise correlations between all columns,
# rendered as an annotated heatmap.
correlation_matrix = insurance_data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

### Step 4: Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def _report_metrics(title, actual, predicted):
    """Print a title line followed by MAE, MSE and R2 for one set of predictions."""
    print(title)
    print("MAE:", mean_absolute_error(actual, predicted))
    print("MSE:", mean_squared_error(actual, predicted))
    print("R2 Score:", r2_score(actual, predicted))


# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = insurance_data['charges']
X = insurance_data.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear Regression: fit on the training split, score on the test split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
_report_metrics("Linear Regression Performance:", y_test, y_pred_lr)

# Random Forest Regressor: same procedure, seeded for reproducibility
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
_report_metrics("Random Forest Regressor Performance:", y_test, y_pred_rf)