## Regression Analysis using Least Squares 
### Libraries used 
- matplotlib
- pandas
- numpy

In [1]:
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np 

# Load the insurance dataset from a CSV file in the working directory.
df = pd.read_csv('insurance.csv')
# Bare last expression: the notebook renders the frame as a table.
df 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Information about the data

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Converting string data to numerical data
Regression models cannot work with non-numerical data directly, so we encode the string columns into numbers using _sklearn.preprocessing_.

In [3]:
from sklearn import preprocessing

# Learn the distinct 'sex' labels, then replace each string with its integer code.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['sex'])
df['sex'] = label_encoder.transform(df['sex'])
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462


In [4]:
# Learn the distinct 'smoker' labels, then replace each string with its integer code.
label_encoder_smoking = preprocessing.LabelEncoder()
label_encoder_smoking.fit(df['smoker'])
df['smoker'] = label_encoder_smoking.transform(df['smoker'])
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462


In [5]:
# Encode the categorical 'region' column as integer codes.
# Use the dedicated region encoder: calling fit_transform on the smoker
# encoder here would refit it on the region names, overwriting the smoker
# classes it had learned and leaving the region encoder unfitted.
# NOTE(review): integer labels impose an arbitrary order on a nominal
# feature; one-hot encoding may suit a linear model better — confirm.
label_encoder_region = preprocessing.LabelEncoder()
df['region'] = label_encoder_region.fit_transform(df['region'])
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462


## Implementing Linear Regression using sklearn

In [6]:
from sklearn.linear_model import LinearRegression

# Create an unfitted linear regression estimator with default settings.
model = LinearRegression()

In [7]:
# Target: the insurance charges column.
y = df['charges']
# Features: every remaining column of the frame.
X = df.drop('charges', axis=1)

In [8]:
# Preview the first three rows of the feature matrix.
X.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2


In [9]:
# Preview the first three target values.
y.head(3)

0    16884.9240
1     1725.5523
2     4449.4620
Name: charges, dtype: float64

### Fitting a line to the data 
![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

In this case the fitted "line" is a hyperplane over 6 input dimensions, as there are 6 features (age, sex, bmi, children, smoker, region).

In [10]:
# Fit on the full dataset and display the learned coefficients
# (one per column of X); fit() returns the estimator itself.
model.fit(X, y).coef_

array([  257.28807486,  -131.11057962,   332.57013224,   479.36939355,
       23820.43412267,  -353.64001656])

### Looking at how the model scores using mean absolute error

![image.png](attachment:image.png)

In [11]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions on the same rows the model was
# fitted on, i.e. an in-sample (training) error rather than a held-out one.
mean_absolute_error(y_true=y, y_pred=model.predict(X))

4172.487114944051

## Further implementation
- Implement the linear regression model from scratch
- Do better data pre-processing
- Implement a polynomial regression model 
- Apply regularization techniques 