In [2]:
import pandas as pd
url = "https://raw.githubusercontent.com/ogut77/DataScience/master/insurance.csv"
df = pd.read_csv(url)


In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Context in Insurance Data
This dataset is often used to predict charges based on the other variables (age, sex, bmi, children, smoker, region). For example:

Input Variables (X): age, sex, bmi, children, smoker, region (features used to make predictions).

Output Variable (y): charges (what you’re trying to predict).

Describtion of variables
1. Age
Description: The age of the individual (the insured person).
Type: Numerical (integer).
Example Values: 19, 45, 62, etc.
Role in Insurance: Age is a key factor in determining insurance charges. Older individuals often have higher medical costs (and thus higher charges) due to increased health risks.
2. Sex
Description: The gender of the individual.
Type: Categorical (text or binary).
Example Values: "male," "female"
Role in Insurance: Gender can influence insurance charges because health risks and medical expenses may differ between males and females (e.g., pregnancy-related costs for females).
3. BMI (Body Mass Index)
Description: A measure of body fat based on height and weight (calculated as weight in kg divided by height in meters squared).
Type: Numerical (float).
Example Values: 25.3, 30.1, 18.5, etc.
Role in Insurance: Higher BMI often correlates with increased health risks (e.g., obesity-related conditions like diabetes or heart disease), leading to higher insurance charges.
4. Children
Description: The number of children (dependents) covered under the individual’s insurance plan.
Type: Numerical (integer).
Example Values: 0, 1, 3, etc.
Role in Insurance: More children can increase insurance costs slightly, as it may reflect additional healthcare needs, though the effect is often less pronounced than other factors like smoking or age.
5. Smoker
Description: Indicates whether the individual smokes tobacco.
Type: Categorical (text or binary).
Example Values: "yes," "no" .
Role in Insurance: Smoking is a major factor in insurance charges. Smokers typically have much higher medical costs due to risks like lung disease or cancer, so their charges are significantly elevated.
6. Region
Description: The geographic region where the individual lives.
Type: Categorical (text).
Example Values: "northeast," "southeast," "southwest," "northwest" (common in U.S.-based datasets).
Role in Insurance: Charges can vary by region due to differences in healthcare costs, lifestyle factors, or local insurance regulations.
7. Charges
Description: The insurance charges (or premiums/costs) billed to the individual, typically in a currency like USD.
Type: Numerical (float).
Example Values: 1684.52, 11234.89, 32050.23, etc.
Role in Insurance: This is usually the target variable (output) in predictive modeling. It represents the amount the insurance company charges, influenced by all the other columns (age, sex, BMI, etc.).



In [4]:
#1. Check if there is null value in dataset df (5 pt)
pd.isnull(df).sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [6]:
#2. Assign charges to y  and others to X using df. y is output variable and X is input variables (5 pt)
y = df["charges"]
X = df.drop("charges", axis=1)
print(X)
print(y)


      age     sex     bmi  children smoker     region
0      19  female  27.900         0    yes  southwest
1      18    male  33.770         1     no  southeast
2      28    male  33.000         3     no  southeast
3      33    male  22.705         0     no  northwest
4      32    male  28.880         0     no  northwest
...   ...     ...     ...       ...    ...        ...
1333   50    male  30.970         3     no  northwest
1334   18  female  31.920         0     no  northeast
1335   18  female  36.850         0     no  southeast
1336   21  female  25.800         0     no  southwest
1337   61  female  29.070         0    yes  northwest

[1338 rows x 6 columns]
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [7]:
#3. Use  get_dummies() function from the pandas library to convert categorical variables in a DataFrame (X).
# Drop first drops the first category’s dummy variable to avoid multicollinearity (5 pt)
X = pd.get_dummies(X, drop_first=True)
print(X)

      age     bmi  children  sex_male  smoker_yes  region_northwest  \
0      19  27.900         0     False        True             False   
1      18  33.770         1      True       False             False   
2      28  33.000         3      True       False             False   
3      33  22.705         0      True       False              True   
4      32  28.880         0      True       False              True   
...   ...     ...       ...       ...         ...               ...   
1333   50  30.970         3      True       False              True   
1334   18  31.920         0     False       False             False   
1335   18  36.850         0     False       False             False   
1336   21  25.800         0     False       False             False   
1337   61  29.070         0     False        True              True   

      region_southeast  region_southwest  
0                False              True  
1                 True             False  
2                 

In [None]:
#Use following methods for the evaluation on test and train data
def evalmetric(y,ypred):
 from scipy.stats import pearsonr
 import numpy as np
 e = y - ypred
 mse_f = np.mean(e**2)
 rmse_f = np.sqrt(mse_f)
 mae_f = np.mean(abs(e))
 mape_f = 100*np.mean(abs(e/y))
 crl, _ = pearsonr(y, ypred)
 r2_f = crl*crl
 print("MSE:", mse_f)
 print("RMSE:", rmse_f)
 print("MAE:",mae_f)
 print("MAPE:",mape_f)
 print("R-Squared:", round(r2_f, 4))


In [8]:
#4.Get the correlation between X variables and y variables.(5 pt)
X.corrwith(y)



Unnamed: 0,0
age,0.299008
bmi,0.198341
children,0.067998
sex_male,0.057292
smoker_yes,0.787251
region_northwest,-0.039905
region_southeast,0.073982
region_southwest,-0.04321


In [10]:
#5.Split a dataset into 25%  of data as test data  and 75% of data as training data ( pt)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(1003, 8)
(335, 8)
(1003,)
(335,)


In [12]:
#6. Using Decision Tree and Linear Regression methods, compare the performance results on both test and training data
#to determine which one is more likely to overfit and which is more likely to underfit.
# Do you think that Lasso and Ridge regularization are more likely to improve the results of Linear model test data,
# or would Random Forest or Boosting methods are more likely to improve the results of Decison tree test data?
#Explain your reasoning.(35 pt)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

def evalmetric(y,ypred):
 from scipy.stats import pearsonr
 import numpy as np
 e = y - ypred
 mse_f = np.mean(e**2)
 rmse_f = np.sqrt(mse_f)
 mae_f = np.mean(abs(e))
 mape_f = 100*np.mean(abs(e/y))
 crl, _ = pearsonr(y, ypred)
 r2_f = crl*crl
 print("MSE:", mse_f)
 print("RMSE:", rmse_f)
 print("MAE:",mae_f)
 print("MAPE:",mape_f)
 print("R-Squared:", round(r2_f, 4))

y_pred_dt = dt_model.predict(X_test)
print("Decision Tree Regression:")
evalmetric(y_test, y_pred_dt)

y_pred_lr = lr_model.predict(X_test)
print("\nLinear Regression:")
evalmetric(y_test, y_pred_lr)




Decision Tree Regression:
MSE: 36592010.020444416
RMSE: 6049.132997417433
MAE: 2687.4353771761193
MAPE: 29.792562242839544
R-Squared: 0.7799

Linear Regression:
MSE: 35117755.73613632
RMSE: 5926.023602394469
MAE: 4243.654116653137
MAPE: 44.468185116980976
R-Squared: 0.7676


Overfitting/Underfitting:

Decision Tree: Prone to overfitting, capturing noise in training data, leading to poor generalization to new data. High training R-squared, lower test R-squared.
Linear Regression: Prone to underfitting, failing to capture complex relationships, resulting in lower performance overall.
Lasso/Ridge vs. Random Forest/Boosting:

Lasso/Ridge: Reduce overfitting in linear models by penalizing complex coefficients, improving generalization to test data.
Random Forest/Boosting: Reduce overfitting in decision trees by combining multiple trees, improving generalization and performance.
Reasoning:

Lasso/Ridge: Directly constrain complexity in linear models.
Random Forest/Boosting: Combine multiple trees to create more robust, less overfit models.
In summary:

Decision Trees: Overfit.
Linear Regression: Underfit.
Lasso/Ridge: Improve Linear Regression.
Random Forest/Boosting: Improve Decision Trees.


In [1]:
#7. Explain performance of linear regressin on test data
# using  Root mean squared error, mean absolute error, mean absolute percentage error and R2 metric (10 pt)


Root Mean Squared Error (RMSE): Represents the average difference between the predicted and actual values in the same units as the target variable (charges in this case). A lower RMSE indicates better model performance.

Mean Absolute Error (MAE): Similar to RMSE, but it calculates the average absolute difference between predicted and actual values. It's less sensitive to outliers compared to RMSE.

Mean Absolute Percentage Error (MAPE): Represents the average percentage difference between predicted and actual values. It's useful for understanding the relative error of the model.

R-squared (R2): Represents the proportion of variance in the target variable that is explained by the model. It ranges from 0 to 1, with higher values indicating better model fit. A low R-squared might suggest underfitting.

Interpreting the results:

Higher RMSE, MAE, and MAPE: Indicate larger errors and potentially poorer performance.
Lower R-squared: Suggests that the model might be underfitting and not capturing the complexity of the data well.

In [None]:
#8. Use Random Forest and Boosting methods (XGBoost, LightGBM, and CatBoost)
#to obtain the evaluation scores on  test data.
#Which Boosting technique yielded the best performance on the test data based on the R² metric?
#Did you achieve a better result compared to Random Forest on the test data based on the R² metric?
#If there is improvement on Random forest or boosting methods over decison tree, explain  (30 pt)

In [5]:
!pip install xgboost
!pip install lightgbm
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [10]:
#8. Use Random Forest and Boosting methods (XGBoost, LightGBM, and CatBoost)
#to obtain the evaluation scores on  test data.
#Which Boosting technique yielded the best performance on the test data based on the R² metric?
#Did you achieve a better result compared to Random Forest on the test data based on the R² metric?
#If there is improvement on Random forest or boosting methods over decison tree, explain  (30 pt)

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split


url = "https://raw.githubusercontent.com/ogut77/DataScience/master/insurance.csv"
df = pd.read_csv(url)

y = df["charges"]
X = df.drop("charges", axis=1)

X = pd.get_dummies(X, drop_first=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

lgbm_model = LGBMRegressor(random_state=42)
lgbm_model.fit(X_train, y_train)
y_pred_lgbm = lgbm_model.predict(X_test)

cat_model = CatBoostRegressor(random_state=42, verbose=0)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)

def evalmetric(y,ypred):
 from scipy.stats import pearsonr
 import numpy as np
 e = y - ypred
 mse_f = np.mean(e**2)
 rmse_f = np.sqrt(mse_f)
 mae_f = np.mean(abs(e))
 mape_f = 100*np.mean(abs(e/y))
 crl, _ = pearsonr(y, ypred)
 r2_f = crl*crl
 print("MSE:", mse_f)
 print("RMSE:", rmse_f)
 print("MAE:",mae_f)
 print("MAPE:",mape_f)
 print("R-Squared:", round(r2_f, 4))


print("Random Forest Regression:")
evalmetric(y_test, y_pred_rf)

print("\nXGBoost Regression:")
evalmetric(y_test, y_pred_xgb)

print("\nLightGBM Regression:")
evalmetric(y_test, y_pred_lgbm)

print("\nCatBoost Regression:")
evalmetric(y_test, y_pred_cat)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 1003, number of used features: 8
[LightGBM] [Info] Start training from score 13267.935814
Random Forest Regression:
MSE: 23114410.14038345
RMSE: 4807.7448081593775
MAE: 2653.61478095801
MAPE: 30.27386314379988
R-Squared: 0.8506

XGBoost Regression:
MSE: 26433443.13176504
RMSE: 5141.346431798293
MAE: 2957.213261792119
MAPE: 34.57266690344694
R-Squared: 0.8301

LightGBM Regression:
MSE: 22005117.949021608
RMSE: 4690.961303296117
MAE: 2700.720352413941
MAPE: 33.101463593940274
R-Squared: 0.8553

CatBoost Regression:
MSE: 21340838.308785602
RMSE: 4619.614519501124
MAE: 2608.117710576109
MAPE: 30.82075085150296
R-Squared: 0.8589
