In [9]:
import pandas as pd
import statsmodels.formula.api as sfa

In [2]:
# Read the salary dataset from a CSV file in the working directory.
Salary_Data = pd.read_csv(
    filepath_or_buffer="Salary_Data.csv",
)

In [3]:
# Preview only the first ten rows rather than rendering the entire frame.
Salary_Data.head(10)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


## Problem Statement: Salary_hike — build a model that predicts salary from years of experience

From the problem statement:
1. Input: years of experience (`YearsExperience`)
2. Output: salary (`Salary`)

## Data understanding 

In [4]:
# Dimensions of the loaded frame as (rows, columns).
Salary_Data.shape

(30, 2)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
Salary_Data.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [7]:
# Count the missing values in each column.
Salary_Data.isna().sum()

YearsExperience    0
Salary             0
dtype: int64

In [8]:
# Data type of each column in the loaded frame.
Salary_Data.dtypes

YearsExperience    float64
Salary             float64
dtype: object

## Model Building 

In [10]:
# Fit an ordinary least squares regression of Salary on YearsExperience
# using the statsmodels formula API, then show the full fit report.
lr_model = sfa.ols(
    formula='Salary~YearsExperience',
    data=Salary_Data,
).fit()
lr_model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.955
Method:,Least Squares,F-statistic:,622.5
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,1.14e-20
Time:,21:44:59,Log-Likelihood:,-301.44
No. Observations:,30,AIC:,606.9
Df Residuals:,28,BIC:,609.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.579e+04,2273.053,11.347,0.000,2.11e+04,3.04e+04
YearsExperience,9449.9623,378.755,24.950,0.000,8674.119,1.02e+04

0,1,2,3
Omnibus:,2.14,Durbin-Watson:,1.648
Prob(Omnibus):,0.343,Jarque-Bera (JB):,1.569
Skew:,0.363,Prob(JB):,0.456
Kurtosis:,2.147,Cond. No.,13.2


In [11]:
# Fitted coefficients: the intercept and the slope on YearsExperience.
lr_model.params

Intercept          25792.200199
YearsExperience     9449.962321
dtype: float64

## Model Testing

In [15]:
# New experience values to feed to the fitted model; the column name must
# match the predictor used in the model formula.
Test_data = pd.DataFrame(
    data={
        'YearsExperience': [1.9, 5.2, 5, 4, 3.6, 4.8, 5.6, 7, 2.2, 1.2],
    },
)

In [16]:
# Show the prediction inputs.
Test_data

Unnamed: 0,YearsExperience
0,1.9
1,5.2
2,5.0
3,4.0
4,3.6
5,4.8
6,5.6
7,7.0
8,2.2
9,1.2


In [18]:
# Predicted salaries for the test inputs, rounded to two decimal places.
lr_model.predict(Test_data).round(2)

0    43747.13
1    74932.00
2    73042.01
3    63592.05
4    59812.06
5    71152.02
6    78711.99
7    91941.94
8    46582.12
9    37132.15
dtype: float64

## Model Evaluation 

In [19]:
# Fit-quality metrics of the model: (AIC, R-squared).
(
    lr_model.aic,
    lr_model.rsquared,
)

(606.882316930432, 0.9569566641435086)

## Model Deployment

In [20]:
from pickle import dump

In [21]:
# Persist the fitted model to disk. The context manager closes (and therefore
# flushes) the file even if pickling raises, so the file is complete before
# it is read back.
with open('Salary Hike.pkl', 'wb') as model_file:
    dump(lr_model, model_file)

In [22]:
from pickle import load

In [23]:
# Reload the pickled model, closing the file handle as soon as it is read.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open('Salary Hike.pkl', 'rb') as model_file:
    loaded_model = load(model_file)

In [24]:
# Predictions from the unpickled model on Test_data, for comparison with the
# predictions of the in-memory model.
loaded_model.predict(Test_data)

0    43747.128609
1    74932.004270
2    73042.011806
3    63592.049484
4    59812.064556
5    71152.019342
6    78711.989199
7    91941.936449
8    46582.117306
9    37132.154984
dtype: float64