In [25]:
from google.colab import drive
import os

# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

# Drive folder used for this project's files.
ai_project_files_path = '/content/gdrive/MyDrive/AI_Project_Files'

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(ai_project_files_path, exist_ok=True)

print("Project folder ready at:", ai_project_files_path)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Project folder ready at: /content/gdrive/MyDrive/AI_Project_Files


In [26]:
import pandas as pd

# Prefer the CSV stored in the Drive project folder. The original cell built
# this path but then read a hard-coded session path instead.
dataset_path = os.path.join(ai_project_files_path, "insurance.csv")

# Fall back to the session-local path so the cell still works when the CSV
# is only present under /content.
if not os.path.exists(dataset_path):
    dataset_path = '/content/insurance.csv'

dataset = pd.read_csv(dataset_path)
print(dataset.head())


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [27]:

# Report how many values are missing in each column before any cleaning.
print(dataset.isna().sum())

# Fill gaps (median for bmi, most frequent value for sex), then discard
# exact duplicate rows.
fill_values = {
    'sex': dataset['sex'].mode()[0],
    'bmi': dataset['bmi'].median(),
}
dataset = dataset.fillna(fill_values).drop_duplicates()

# Keep only rows whose bmi lies within 1.5 * IQR of the quartiles
# (bounds are inclusive).
q1, q3 = dataset['bmi'].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
dataset = dataset[dataset['bmi'].between(lower, upper)]


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [28]:
# Target is the charges column; every other column is a predictor.
y = dataset["charges"]

# One-hot encode the categorical predictors, dropping the first level of each
# so the dummy columns are not perfectly collinear.
X = pd.get_dummies(dataset.drop(columns="charges"), drop_first=True)


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [30]:
import joblib
from sklearn.linear_model import LinearRegression

lin_model_path = os.path.join(ai_project_files_path, "linear_regression_model.pkl")

# Reuse the saved model only if it was fitted on the same feature columns as
# the current training matrix. Otherwise a stale pickle (e.g. from before a
# preprocessing change) would silently produce mismatched predictions.
linear_reg = None
if os.path.exists(lin_model_path):
    # NOTE: joblib.load unpickles arbitrary objects; only load files you created.
    cached_model = joblib.load(lin_model_path)
    cached_features = getattr(cached_model, "feature_names_in_", None)
    if cached_features is not None and list(cached_features) == list(X_train.columns):
        linear_reg = cached_model
        print("Loaded saved Linear Regression model.")
    else:
        print("Saved Linear Regression model does not match current features; retraining.")

# Train and cache a fresh model when nothing usable was loaded.
if linear_reg is None:
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)
    joblib.dump(linear_reg, lin_model_path)
    print("Trained and saved Linear Regression model.")



Loaded saved Linear Regression model.


In [31]:
# Show the fitted parameters of the linear model: one weight per feature
# column, followed by the intercept term.
for label, value in (
    ("Weights (coefficients):", linear_reg.coef_),
    ("Intercept:", linear_reg.intercept_),
):
    print(label, value)

Weights (coefficients): [ 2.56975706e+02  3.37092552e+02  4.25278784e+02 -1.85916916e+01
  2.36511289e+04 -3.70677326e+02 -6.57864297e+02 -8.09799354e+02]
Intercept: -11931.21905032666


In [32]:
from sklearn.tree import DecisionTreeRegressor

tree_model_path = os.path.join(ai_project_files_path, "decision_tree_model.pkl")

# Reuse the saved tree only if it was fitted on the same feature columns as
# the current training matrix. Otherwise a stale pickle would silently
# produce mismatched predictions.
decision_tree = None
if os.path.exists(tree_model_path):
    # NOTE: joblib.load unpickles arbitrary objects; only load files you created.
    cached_tree = joblib.load(tree_model_path)
    cached_tree_features = getattr(cached_tree, "feature_names_in_", None)
    if cached_tree_features is not None and list(cached_tree_features) == list(X_train.columns):
        decision_tree = cached_tree
        print("Loaded saved Decision Tree model.")
    else:
        print("Saved Decision Tree model does not match current features; retraining.")

# Train and cache a fresh tree when nothing usable was loaded.
if decision_tree is None:
    decision_tree = DecisionTreeRegressor(random_state=42)
    decision_tree.fit(X_train, y_train)
    joblib.dump(decision_tree, tree_model_path)
    print("Trained and saved Decision Tree model.")




Loaded saved Decision Tree model.


In [33]:
from sklearn.metrics import mean_squared_error, r2_score

# Score both models on the held-out test split.
y_pred_lin = linear_reg.predict(X_test)
y_pred_tree = decision_tree.predict(X_test)

# Report R2 and MSE for each model. The MSE lines were missing from the code
# even though mean_squared_error was imported and the recorded output shows them.
print("\nLinear Regression R2:", r2_score(y_test, y_pred_lin))
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_lin))

print("\nDecision Tree R2:", r2_score(y_test, y_pred_tree))
print("Decision Tree MSE:", mean_squared_error(y_test, y_pred_tree))



Linear Regression R2: 0.7485412200359154
Linear Regression MSE: 34952451.27397823

Decision Tree R2: 0.89552559231495
Decision Tree MSE: 14521810.073647073
