# Linear Regression without lib



+ #### Import lib
+ ###### panda to data processing
+ ###### numpy to handel with matrix
+ ###### categori_encoder to handel with object type
+ ###### split_dataset to spit dataset to test and train file
+ ###### scaler to standardize transform dataset
+ ###### r2_score to calc R-squared score


In [1]:
import pandas as pd
import numpy as np
import categori_encoder as ce # local file
from numpy.linalg import inv
import split_dataset as sp #local file
import scaler #local file
import r2_score #local file
from LinearRegression import MultipleLinearRegression
import accuracy_metric #local file


#### Simple data processing(transfome category data to numberic)

In [2]:
medicine = pd.read_csv('insurance.csv')

df = pd.DataFrame(medicine)
category_map = {'sex' :{'female' : 0, 'male':1},
                'smoker':{'yes':1, 'no':0},
                'region': {'southwest':1, 'northeast':2, 'southeast':3, 'northwest':4 }
                }
df_encoded = ce.category_transform(df, category_map)
print(df_encoded)


      age  sex     bmi  children  smoker  region      charges
0      19    0  27.900         0       1       1  16884.92400
1      18    1  33.770         1       0       3   1725.55230
2      28    1  33.000         3       0       3   4449.46200
3      33    1  22.705         0       0       4  21984.47061
4      32    1  28.880         0       0       4   3866.85520
...   ...  ...     ...       ...     ...     ...          ...
1333   50    1  30.970         3       0       4  10600.54830
1334   18    0  31.920         0       0       2   2205.98080
1335   18    0  36.850         0       0       3   1629.83350
1336   21    0  25.800         0       0       1   2007.94500
1337   61    0  29.070         0       1       4  29141.36030

[1338 rows x 7 columns]


#### Check Multicollinearity collum to drop

In [3]:
print(df_encoded.corr(method='pearson'))

               age       sex       bmi  children    smoker    region   charges
age       1.000000 -0.020856  0.109272  0.042469 -0.025019 -0.008891  0.299008
sex      -0.020856  1.000000  0.046371  0.017163  0.076185 -0.000142  0.057292
bmi       0.109272  0.046371  1.000000  0.012759  0.003750  0.005598  0.198341
children  0.042469  0.017163  0.012759  1.000000  0.007673  0.001460  0.067998
smoker   -0.025019  0.076185  0.003750  0.007673  1.000000  0.013246  0.787251
region   -0.008891 -0.000142  0.005598  0.001460  0.013246  1.000000  0.015588
charges   0.299008  0.057292  0.198341  0.067998  0.787251  0.015588  1.000000


#### Splits the dataset into 80% train data and 20% test data

In [4]:
import categori_encoder as ce
train_data, test_data = sp.train_test_split(df_encoded, test_ratio=0.2)
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 267 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1071 non-null   int64  
 1   sex       1071 non-null   int64  
 2   bmi       1071 non-null   float64
 3   children  1071 non-null   int64  
 4   smoker    1071 non-null   int64  
 5   region    1071 non-null   int64  
 6   charges   1071 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 58.7 KB
None


#### standardize dataset transform

In [5]:
train_data= np.array(train_data)
test_data= np.array(test_data)
means = scaler.column_means(train_data)
stdevs = scaler.column_stdevs(train_data, means)
scaler.standardize_dataset(train_data, means, stdevs)

array([[ 1.40724927, -1.0174253 ,  0.27466065, ..., -0.49363418,
        -0.45627776,  0.11713223],
       [ 0.40314269,  0.98195543, -0.07996985, ..., -0.49363418,
        -1.36290759, -0.47807104],
       [ 0.69003029,  0.98195543, -0.78438395, ..., -0.49363418,
        -0.45627776, -0.32477198],
       ...,
       [-1.53334855, -1.0174253 ,  0.99442322, ..., -0.49363418,
         0.45035207, -0.96185573],
       [-1.31818286, -1.0174253 , -0.79084647, ..., -0.49363418,
        -1.36290759, -0.93037791],
       [ 1.55069306, -1.0174253 , -0.26253589, ...,  2.02390015,
         1.3569819 ,  1.32848203]])

In [6]:
means1 = scaler.column_means(test_data)
stdevs1 = scaler.column_stdevs(test_data, means1)
scaler.standardize_dataset(test_data, means1, stdevs1)

array([[-1.34801403, -0.97960588, -0.46044718, ...,  1.77763757,
        -1.39853396,  0.26105102],
       [-1.41708325,  1.01699541,  0.56462262, ..., -0.56043746,
         0.39239442, -0.95048809],
       [-0.72639098,  1.01699541,  0.43015861, ..., -0.56043746,
         0.39239442, -0.73279285],
       ...,
       [ 1.00033971, -0.97960588,  1.31377925, ..., -0.56043746,
         0.39239442,  0.54701116],
       [ 0.51685512,  1.01699541,  2.06293589, ...,  1.77763757,
         0.39239442,  2.60000986],
       [ 0.10243975,  1.01699541, -1.87493873, ...,  1.77763757,
         0.39239442,  0.28459533]])

In [11]:
train_data= np.array(train_data)
test_data= np.array(test_data)
X_train, y_train = train_data[:,0:6], train_data[:,6]
X_test, y_test = test_data[:,0:6], test_data[:,6]


[ 2.61051015e-01 -9.50488095e-01 -7.32792850e-01  6.68607495e-01
 -7.79354867e-01 -7.88164752e-01 -4.29805401e-01 -5.06455529e-01
 -5.76393236e-01  1.22314668e+00 -8.70906137e-01  1.13408273e+00
 -9.42392928e-01 -2.02022756e-01  2.07738275e+00 -9.41562238e-01
 -2.25469855e-01 -8.96972031e-01 -2.41050382e-01  1.85566105e+00
 -3.11431749e-02 -7.56746994e-01 -9.97524398e-01  1.92474480e+00
 -5.92577779e-01  3.05780984e-02  6.65982134e-02 -1.07883668e-01
 -8.66600731e-01  2.00539407e+00  1.75560974e+00 -9.12714836e-01
 -7.13745083e-01  1.21136769e-02  3.00308186e+00 -9.58489583e-01
  1.59334203e-01 -9.04394338e-01  2.09037126e+00  2.76162728e+00
 -8.44952775e-01 -6.92809068e-01 -5.87097220e-01 -5.83797973e-01
 -6.02506278e-01  5.60380672e-01 -8.17196962e-01 -8.04124748e-01
 -7.90113628e-02  2.00524829e+00 -9.11680597e-01 -8.02294065e-01
  7.95185124e-01  1.92799746e+00 -4.44263794e-01  2.70753201e+00
 -8.91659574e-04  1.65311945e+00  7.69332413e-01 -6.09710907e-01
 -4.00584376e-01 -7.28381

### Simple use matrix to caclculate coefficients and predict
###

In [8]:
a= MultipleLinearRegression()
a.fit(X_train, y_train)
y_pred = a.predict(X_test)

### Score R2 both with and without lib

In [9]:
r2 = r2_score.calculate_r2_score(y_test, y_pred)
print("R-squared (R2) without sklearn:", r2)


R-squared (R2) without sklearn: 0.7618456425991907


In [10]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

print("R-squared (R2) with sklearn:",reg.score(X_test, y_test))

R-squared (R2) with sklearn: 0.7618456425991906
