# Use-case: An HR company has hired you as a DS/ML engineer. Your job is to create a model that can predict the salary of an employee based on his/her years of experience

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the salary dataset from the CSV file sitting next to this notebook.
salaryData = pd.read_csv(filepath_or_buffer='Salary_Data.csv')

In [9]:
# Preview the first five rows to sanity-check the columns that were loaded.
salaryData.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [5]:
# Drop rows that contain missing values.
# Rebinding the name (instead of inplace=True) avoids mutating the frame object
# that earlier cells loaded, while later cells keep using `salaryData`.
salaryData = salaryData.dropna()

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
salaryData.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [8]:
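# For scikit-learn users:
# Rules for regression (as followed in this notebook):
# 1. Features and label must be in the form of NumPy arrays
# 2. Features must be a 2D array
# 3. Label must be a 2D array (scikit-learn also accepts a 1D label; a column vector is used here)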

In [14]:
# Separate the data into features and label.
# Features: first column, kept 2D by slicing (0:1) rather than picking a single index.
features = salaryData.iloc[:, 0:1].to_numpy()
# Label: the Salary column reshaped into a single-column 2D array.
label = salaryData['Salary'].to_numpy().reshape(-1, 1)

In [17]:
#ML coding begins

# Before you initiate the coding, you must know two things from your data scientists:
# 1. Approved Significance level for the project
# 2. Timeline to develop and deploy the model



# 1. Create Train Test Split
# 2. Build the model
# 3. Check the Quality of the Model
# 4. If Satisfied, perform Deployment ; else go to step 2

In [18]:
# 1. Create Train Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

In [19]:
# 2. Build the model

from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
model  # bare last expression so the notebook still displays the estimator

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
# 3. Check the Quality of the Model
#Assume : SL = 0.05
# Rule by Prashant Nair:
# The best way to check the quality of the model is:
# 1. Ensure your test score > train score (the model must perform best on UNKNOWN DATA !!!)
# 2. Ensure your test score >= (1 - SL)

In [21]:
# Print the model's score on the training split first, then on the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(model.score(X_part, y_part))

0.9494673013344644
0.9816423482070255


In [23]:
# Challenge !!!! ---> Try to get the best model with a target of minimum 99% accuracy
# Data randomization always gives different results !!!
# NOTE(review): picking the seed by looking at the test score makes that score an
# optimistic estimate of performance on genuinely unseen data -- confirm this is acceptable.

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Try seeds 1..99; for each, re-split, refit and compare train vs. test score.
for seed in range(1, 100):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=seed
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Only report seeds where the model scores higher on the test split.
    if not test_score > train_score:
        continue
    print("Test S {}, Train Score {}, RandomSeed {}".format(test_score, train_score, seed))



Test S 0.9695039421049821, Train Score 0.9545249190394052, RandomSeed 3
Test S 0.9631182154839475, Train Score 0.9528197369259258, RandomSeed 8
Test S 0.9816423482070255, Train Score 0.9494673013344644, RandomSeed 10
Test S 0.9606215790278543, Train Score 0.9527636176933665, RandomSeed 14
Test S 0.9835849730044817, Train Score 0.9460054870434312, RandomSeed 26
Test S 0.9636425773684422, Train Score 0.9527636606684406, RandomSeed 27
Test S 0.9944092048209744, Train Score 0.9400496694274888, RandomSeed 30
Test S 0.9778242092591887, Train Score 0.9486350116716654, RandomSeed 37
Test S 0.9724794487377619, Train Score 0.9473317052697812, RandomSeed 38
Test S 0.9928344802911049, Train Score 0.9492886917497556, RandomSeed 39
Test S 0.9802519469633169, Train Score 0.9491742100347064, RandomSeed 41
Test S 0.9789129767378081, Train Score 0.948821675263085, RandomSeed 46
Test S 0.98399193890564, Train Score 0.9486450781125914, RandomSeed 47
Test S 0.980277279178695, Train Score 0.9500780390200971

In [24]:
# Final model: rebuild the split with the seed chosen from the search (30) and refit.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=30
)
finalModel = LinearRegression().fit(X_train, y_train)

# Print the train score first, then the test score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(finalModel.score(X_part, y_part))

0.9400496694274888
0.9944092048209744


In [25]:
# 4. If Satisfied, perform Deployment

import pickle

# Serialize the fitted model to disk. The `with` block guarantees the file handle
# is flushed and closed even if pickling raises, which the bare open() call did not.
with open('modelSalaryPredictor.nair', 'wb') as model_file:
    pickle.dump(finalModel, model_file)