# Linear Regression Models

### 1. Simple Linear Regression

In [56]:
import numpy as np
import pandas as pd
import statsmodels
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [57]:
# Load the 'faithful' sheet of the workbook and preview its first rows.
df = pd.read_excel(
    'CDAC_DataBook.xlsx',
    sheet_name='faithful',
)
df.head()

Unnamed: 0,eruptions,waiting
0,3.6,79
1,1.8,54
2,3.333,74
3,2.283,62
4,4.533,85


In [60]:
# Predictor: 'waiting', kept as a one-column DataFrame.
x_train = df.loc[:, ['waiting']]

In [61]:
# Response: 'eruptions', kept as a one-column DataFrame.
y_train = df.loc[:, ['eruptions']]

In [62]:
x_train.head()

Unnamed: 0,waiting
0,79
1,54
2,74
3,62
4,85


In [63]:
y_train.head()

Unnamed: 0,eruptions
0,3.6
1,1.8
2,3.333
3,2.283
4,4.533


In [64]:
# Append an intercept column to the predictors; prepend=False places it
# after the existing column(s) instead of in front.
x_train = sm.add_constant(
    x_train,
    prepend=False,
)

In [65]:
# Fit ordinary least squares: eruptions ~ waiting + intercept.
mod = sm.OLS(endog=y_train, exog=x_train).fit()

In [66]:
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:              eruptions   R-squared:                       0.811
Model:                            OLS   Adj. R-squared:                  0.811
Method:                 Least Squares   F-statistic:                     1162.
Date:                Thu, 29 Jun 2023   Prob (F-statistic):          8.13e-100
Time:                        14:40:13   Log-Likelihood:                -194.51
No. Observations:                 272   AIC:                             393.0
Df Residuals:                     270   BIC:                             400.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
waiting        0.0756      0.002     34.089      0.0

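- The p-value of `waiting` is effectively 0 (t = 34.089), which is less than 0.05.
- So the null hypothesis that the coefficient of `waiting` is zero is rejected; `waiting` is a statistically significant predictor.
- `eruptions` is dependent on `waiting`.
- The coefficient (0.0756) is positive: as `waiting` increases, `eruptions` also increases.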

### 2. Multiple Linear Regression

In [96]:
# Load the 'stackloss' sheet of the workbook and preview its first rows.
df = pd.read_excel(
    'CDAC_DataBook.xlsx',
    sheet_name='stackloss',
)
df.head()

Unnamed: 0,AirFlow,WaterTemp,AcidConc,StackLoss
0,80,27,89,42
1,80,27,88,37
2,75,25,90,37
3,62,24,87,28
4,62,22,87,18


In [97]:
# Predictors: every column except the response 'StackLoss'.
x_train = df.drop(columns='StackLoss')

In [98]:
# Response: 'StackLoss', kept as a one-column DataFrame.
y_train = df.loc[:, ['StackLoss']]

In [99]:
# Explicit predictor selection. Given the columns shown by df.head(), this is
# the same frame the earlier df.drop('StackLoss', axis=1) cell produced, so
# x_train is simply reassigned here.
x_train = df.loc[:, ['AirFlow', 'WaterTemp', 'AcidConc']]

In [100]:
x_train.head()

Unnamed: 0,AirFlow,WaterTemp,AcidConc
0,80,27,89
1,80,27,88
2,75,25,90
3,62,24,87
4,62,22,87


In [101]:
y_train.head()

Unnamed: 0,StackLoss
0,42
1,37
2,37
3,28
4,18


In [102]:
# Append an intercept column after the three predictors.
x_train = sm.add_constant(
    x_train,
    prepend=False,
)

In [103]:
# Fit ordinary least squares: StackLoss ~ AirFlow + WaterTemp + AcidConc + intercept.
mod = sm.OLS(endog=y_train, exog=x_train).fit()

In [104]:
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:              StackLoss   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.898
Method:                 Least Squares   F-statistic:                     59.90
Date:                Thu, 29 Jun 2023   Prob (F-statistic):           3.02e-09
Time:                        14:52:11   Log-Likelihood:                -52.288
No. Observations:                  21   AIC:                             112.6
Df Residuals:                      17   BIC:                             116.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
AirFlow        0.7156      0.135      5.307      0.0

In [20]:
# NOTE(review): x_test (and y_test, used further down) are never defined in the
# visible cells, and this cell's execution count (In [20]) predates the model
# fit above -- presumably left over from an earlier train_test_split session.
# Confirm and restore the split cell so the notebook survives Restart & Run All.
x_test = sm.add_constant(x_test,prepend = False) # add the intercept column to the test predictors

In [21]:
y_pred = mod.predict(x_test) #predicted output

In [22]:
y_pred[:5] #prediction

4     19.497851
0     38.117397
11     9.473814
1     38.269171
18     9.887887
dtype: float64

In [23]:
y_test[:5] #actual output

4     18
0     42
11    13
1     37
18     9
Name: StackLoss, dtype: int64

### 3. Categorical Regression Model

In [106]:
# Load the 'salaries' sheet of the workbook and preview its first rows.
df = pd.read_excel(
    'CDAC_DataBook.xlsx',
    sheet_name='salaries',
)
df.head()

Unnamed: 0,rank,discipline,yrs_phd,yrs_service,gender,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [107]:
# Keep only the two predictors (rank, yrs_service) and the response (salary).
model_columns = ['rank', 'yrs_service', 'salary']
df = df[model_columns]
df.head()

Unnamed: 0,rank,yrs_service,salary
0,Prof,18,139750
1,Prof,16,173200
2,AsstProf,3,79750
3,Prof,39,115000
4,Prof,41,141500


In [108]:
# One-hot encode `rank`; drop_first=True drops the first level so it serves as
# the baseline category (rows where both remaining indicators are 0).
# dtype=int pins the indicators to numeric 0/1. pandas >= 2.0 defaults to bool,
# and a frame mixing bool dummies with the integer yrs_service column is
# converted to an object array, which sm.OLS refuses to fit.
rank_dummy = pd.get_dummies(df['rank'], drop_first=True, dtype=int)
rank_dummy.head(10)

Unnamed: 0,AsstProf,Prof
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
5,0,0
6,0,1
7,0,1
8,0,1
9,0,1


In [109]:
# Remove the raw categorical column; its dummy encoding is attached next.
df = df.drop(columns='rank')

In [110]:
# Attach the rank indicator columns side by side with the remaining columns.
frames = [df, rank_dummy]
df = pd.concat(frames, axis='columns')
df.head()

Unnamed: 0,yrs_service,salary,AsstProf,Prof
0,18,139750,0,1
1,16,173200,0,1
2,3,79750,1,0
3,39,115000,0,1
4,41,141500,0,1


In [111]:
# Predictors: every column except the response 'salary'.
x_train = df.drop(columns='salary')

In [112]:
# Response: 'salary', kept as a one-column DataFrame.
y_train = df.loc[:, ['salary']]

In [113]:
# Explicit predictor selection. Given the columns shown by df.head(), this is
# the same frame the earlier df.drop('salary', axis=1) cell produced, so
# x_train is simply reassigned here.
x_train = df.loc[:, ['yrs_service', 'AsstProf', 'Prof']]

In [114]:
# Append an intercept column after yrs_service and the rank indicators.
x_train = sm.add_constant(
    x_train,
    prepend=False,
)

In [115]:
# Fit ordinary least squares: salary ~ yrs_service + rank indicators + intercept.
mod = sm.OLS(endog=y_train, exog=x_train).fit()

In [116]:
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     86.30
Date:                Thu, 29 Jun 2023   Prob (F-statistic):           6.48e-43
Time:                        14:53:55   Log-Likelihood:                -4558.8
No. Observations:                 397   AIC:                             9126.
Df Residuals:                     393   BIC:                             9142.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
yrs_service  -158.1353    114.952     -1.376      