<a href="https://colab.research.google.com/github/suryakiran594/DS/blob/main/fcc_predict_health_costs_with_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
# Download the insurance CSV into the working directory, load it into a
# DataFrame and preview its last rows.
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Summarise column names, non-null counts and dtypes.
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

In [None]:
# Preview the first rows. `head` has to be called: a bare `dataset.head`
# only displays the bound-method repr instead of the data.
dataset.head()

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
# Preview the first rows of the dataset.
dataset.head()

In [None]:
# Column labels of the dataset as a plain Python list.
dataset.columns.tolist()

In [None]:
# Column names, non-null counts and dtypes.
dataset.info()

In [None]:
# First rows of the dataset.
dataset.head()

In [None]:
# Partition the column labels by dtype: columns stored as `object` are
# treated as categorical, every other column as numerical.
is_object_column = dataset.dtypes == 'object'

dataset_numeric = dataset.columns[~is_object_column]
dataset_categorial = dataset.columns[is_object_column]
print("Numerical Columns:", dataset_numeric)
print("categorial Columns:", dataset_categorial)

In [None]:
# Number of records per distinct value of `smoker`.
dataset['smoker'].value_counts()

In [None]:
# Display the `age` column.
dataset['age']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution (histogram + KDE) of every numerical column.
# The figure is created inside the loop: a single plt.figure() placed before
# the loop is consumed by the first plt.show(), so only the first plot would
# get the requested 8x6 size.
for column in dataset_numeric:
  plt.figure(figsize=(8,6))
  sns.histplot(data=dataset,x=column,kde=True)
  plt.title(f'Distribution of {column}')
  print('\n')
  plt.show()



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Frequency of each label for every categorical column.
# countplot is used instead of histplot(kde=True): a kernel density estimate
# over unordered text labels has no meaning.
# The figure is created inside the loop so every plot gets the 8x6 size.
for column in dataset_categorial:
  plt.figure(figsize=(8,6))
  sns.countplot(data=dataset,x=column)
  plt.title(f'Record count per {column}')
  print('\n')
  plt.show()



## CHECK FOR DUPLICATED RECORDS

In [None]:
# Number of rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

## CORRELATION

In [None]:
# Correlation heatmap restricted to numeric columns. The categorical columns
# still hold text labels at this point, and DataFrame.corr() raises a
# ValueError on pandas >= 2.0 when it meets non-numeric data.
sns.heatmap(dataset.select_dtypes(include='number').corr(),annot=True)

In [None]:
# First rows of the dataset.
dataset.head()

In [None]:
# Distinct values present in `age`.
dataset['age'].unique()

In [None]:
# Distinct values present in `smoker`.
dataset['smoker'].unique()

In [None]:
# Distinct values present in `children`.
dataset['children'].unique()

In [None]:
# Show the records ordered from highest to lowest BMI. The result is only
# displayed; `dataset` itself is not reassigned or reordered.
dataset.sort_values(by='bmi',ascending=False)

In [None]:
# Lookup table turning the smoker label into a 0/1 flag.
smoker_map=dict(zip(['yes','no'],[1,0]))

In [None]:
# Lookup table assigning the integer codes 1..4 to the region labels.
region_map={label:code for code,label in enumerate(['northeast','northwest','southeast','southwest'],start=1)}

In [None]:
# Lookup table assigning the integer codes 1 and 2 to the sex labels.
sex_map={label:code for code,label in enumerate(['male','female'],start=1)}

In [None]:
# Replace the text labels of smoker / region / sex with their numeric codes.
# A value that is not a key of the lookup table is passed through unchanged,
# so running this cell a second time (when the columns already hold codes)
# is a no-op. A plain .map(dict) would turn every value into NaN on re-run.
for column,codes in (('smoker',smoker_map),('region',region_map),('sex',sex_map)):
  dataset[column]=dataset[column].map(lambda label,codes=codes: codes.get(label,label))


In [None]:
# Display `dataset` to inspect its current contents.
dataset

In [None]:
# Split the records into four age bands. Series.between() is inclusive on
# both ends, so consecutive bands must not share a boundary value: the last
# band starts at 56 (not 55), otherwise every 55-year-old would appear in
# both group3_age and group4_age.
group1_age = dataset[dataset['age'].between(0,35)]
group2_age = dataset[dataset['age'].between(36,45)]
group3_age = dataset[dataset['age'].between(46,55)]
group4_age = dataset[dataset['age'].between(56,65)]


In [None]:
# Split the records into BMI bands. The first three bands are half-open
# [low, high) so that the shared cut points 18.5, 25.0 and 30.0 each land in
# exactly one band; with the default inclusive="both" a record sitting on a
# cut point would be counted in two bands. The last band keeps its upper
# bound inclusive.
underweight_bmi=dataset[dataset['bmi'].between(0.0,18.5,inclusive='left')]
healthyweight_bmi=dataset[dataset['bmi'].between(18.5,25.0,inclusive='left')]
overweight_bmi=dataset[dataset['bmi'].between(25.0,30.0,inclusive='left')]
obesity_bmi=dataset[dataset['bmi'].between(30.0,55.5)]

In [None]:
# Separate the records by the encoded smoker flag (1 = smoker, 0 = non-smoker).
yes_smoker=dataset.loc[dataset['smoker'].eq(1)]
non_smoker=dataset.loc[dataset['smoker'].eq(0)]

In [None]:
# Display the smoker subset.
yes_smoker



In [None]:
# Display the underweight BMI subset.
underweight_bmi

In [None]:
# Display the first age-band subset.
group1_age

In [None]:
# Feature matrix: every column except the `expenses` target.
X=dataset.drop(columns=['expenses'])

In [None]:
# Display the feature matrix.
X

In [None]:
# Target vector: the `expenses` column.
y=dataset['expenses']

In [None]:
# Display the target vector.
y

In [None]:
# Define which columns should be ordinal-encoded and which should be scaled
# Object-dtype feature columns go to the categorical pipeline, all other
# columns to the numerical pipeline. This rebinds the names that were first
# assigned from `dataset` in the earlier dtype-split cell.
# NOTE(review): smoker/region/sex are mapped to numeric codes in an earlier
# cell, so `dataset_categorial` is presumably empty here — confirm that is
# intended.
dataset_categorial = X.select_dtypes(include='object').columns
dataset_numeric = X.select_dtypes(exclude='object').columns


In [None]:
#Define the custom ranking for each ordinal variable
# NOTE(review): these strings are the names of DataFrame subsets built in
# earlier cells (group1_age, yes_smoker, underweight_bmi, ...), not values
# that occur in any column of X. An encoder given these lists as `categories`
# would not find them in the data — confirm whether binned age / bmi columns
# were meant to be created first.
# NOTE(review): bmi_categories lists 'healthyweight_bmi' before
# 'underweight_bmi' — verify that this is the intended rank order.
age_categories=["group1_age","group2_age","group3_age","group4_age"]
smoker_categories=['non_smoker','yes_smoker']
bmi_categories=['healthyweight_bmi','underweight_bmi','overweight_bmi','obesity_bmi']

In [None]:
from sklearn.impute import SimpleImputer  # Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Odinal Encoding
##PIPELINES
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
## Numerical Pipeline
# Fill missing numbers with the column median, then standardise.
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Categorical Pipeline
# Fill missing labels with the most frequent one, ordinal-encode, then
# standardise. The encoder infers its categories from the training data: the
# hand-written category lists name DataFrame variables rather than values
# found in the columns, and their count does not depend on how many
# categorical columns are actually selected, so fitting with them would fail.
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder()),
        ('scaler',StandardScaler())
    ]
)

# Route each column group to its pipeline. `dataset_categorial` is the name
# the notebook actually defines; the previous `dataset_cat` raised NameError.
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,dataset_numeric),
    ('cat_pipeline',cat_pipeline,dataset_categorial)
])

In [None]:
# Train test split
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits. The original row index is carried over so
# that X_train / X_test stay index-aligned with y_train / y_test; without it
# the new frames get a fresh 0..n-1 index that no longer matches the targets.
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out(),index=X_train.index)
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out(),index=X_test.index)

In [None]:
# Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
regression=LinearRegression()
regression.fit(X_train,y_train)

In [None]:
# Coefficients learned by the fitted linear model.
regression.coef_

In [None]:
# Intercept term learned by the fitted linear model.
regression.intercept_

In [None]:
import numpy as np
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and coefficient of determination.
    """
    mae=mean_absolute_error(true,predicted)
    # The MSE is computed once and only used to derive the RMSE.
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,rmse,r2_square


In [None]:
## Train multiple models
# Candidate linear estimators, all with their default hyper-parameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

trained_model_list=[]  # fitted estimators, in the same order as model_list
model_list=[]          # model names
r2_list=[]             # R2 score of each model on the test split

# Iterate over the name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name,model in models.items():
    model.fit(X_train,y_train)
    trained_model_list.append(model)

    # Predict on the held-out split and score the predictions.
    y_pred=model.predict(X_test)
    mae,rmse,r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)
    # The metrics below come from X_test / y_test, so they are labelled as
    # test performance rather than training performance.
    print('Model Test Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')
    

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# NOTE(review): `test_dataset` and `test_labels` are not defined in any cell
# visible above, and `model` is whatever the preceding training loop bound
# last. `.evaluate(...)` returning (loss, mae, mse) looks like the Keras model
# API — confirm that a compiled Keras model and these two variables exist
# before running this cell.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass threshold checked by this cell: mean absolute error below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
test_predictions = model.predict(test_dataset).flatten()

# Equal-aspect scatter of predictions against true values; the line drawn
# from (0, 0) to (50000, 50000) marks where prediction equals truth.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims,lims)
