<h1><center>SGDRegressor with Pipeline on "Salary Dataset"</center></h1>

In [1]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw employee records from the CSV file (relative path).
DATA_PATH = "../../Data/regression_employee_data.csv"
employee_data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded.
employee_data.head()

Unnamed: 0,Emp #,Designation,Experience,Salary,AggrBehavScore,ActivityScore,LinesOfCode,Qualification,Degree,YearsOfStudy,EducationalInstitute,Gender,Empl_Band
0,e1,Director,16.0,85861,0.526928,5,1678,Grad,,15.0,clg,M,A
1,e2,Jr.Dev,3.0,25376,6.420421,18,6082,Grad,Btech,16.0,clg,F,C
2,e3,Sr.Dev,6.0,38193,2.802337,7,2927,PostGrad,Msc,17.0,unv,M,B
3,e4,Jr.Dev,3.0,22807,9.374888,23,10249,,Mtech,18.0,,F,C
4,e5,Sr.Dev,4.0,37624,3.336564,10,3159,,Btech,16.0,,F,B


#### Split data into Train and Test

In [4]:
# Hold out 20% of the rows as a test split; random_state makes the split repeatable.
shuffleSplit = ShuffleSplit(n_splits=1, test_size=0.2, random_state=33)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(shuffleSplit.split(employee_data))

# split() returns row positions, so select with .iloc rather than label-based .loc.
# .copy() gives each split its own data, because both are modified in later cells.
train_set = employee_data.iloc[train_index].copy()
test_set = employee_data.iloc[test_index].copy()

In [5]:
# (rows, columns) of the training split.
train_set.shape

(1209, 13)

In [6]:
# (rows, columns) of the test split.
test_set.shape

(303, 13)

#### Data Pre-Processing

In [7]:
# Allowed labels for each categorical column. The order of every list matters:
# convertCategorisToCodes uses a label's position in its list as its integer code.
cat_list_dict = {
    'Designation': [
        'Sr.Dev', 'Analyst', 'Sr.Analyst', 'Manager', 'Jr.Dev', 'Director', 'Sr.Manager',
    ],
    'Qualification': ['Grad', 'PostGrad', 'Phd', 'postdoc'],
    'Degree': ['Btech', 'Bsc', 'Msc', 'Mba', 'IntMsc', 'B.A', 'Mtech'],
    'EducationalInstitute': ['unv', 'iit', 'clg', 'cunv'],
    'Gender': ['M', 'F'],
    'Empl_Band': ['B', 'C', 'A'],
}

In [8]:
def convertCategorisToCodes(columns, df, categories_by_column=None):
    """Replace categorical columns of ``df`` with integer category codes, in place.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame
        Frame to modify; every listed column is overwritten with its codes.
    categories_by_column : dict, optional
        Maps a column name to the ordered list of labels the column may hold.
        When omitted, the notebook-level ``cat_list_dict`` is used.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df`` or from the category mapping.

    Notes
    -----
    A label's code is its position in the category list. Missing values and
    labels that are not in the list are encoded as -1.
    """
    if categories_by_column is None:
        # Fall back to the shared mapping so existing two-argument calls keep working.
        categories_by_column = cat_list_dict
    for column in columns:
        encoded = pd.Categorical(df[column], categories=categories_by_column[column])
        df[column] = encoded.codes

In [9]:
# Most frequent value (mode) of each column that gets imputed. The values are
# computed on the training split and reused unchanged for the test split.
YearsOfStudy_impute_value = train_set['YearsOfStudy'].mode().iloc[0]
Qualification_impute_value = train_set['Qualification'].mode().iloc[0]
Degree_impute_value = train_set['Degree'].mode().iloc[0]
EducationalInstitute_impute_value = train_set['EducationalInstitute'].mode().iloc[0]

In [10]:
# Clean the training split. The frame is rebuilt and re-assigned instead of being
# mutated with `inplace=True`: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series and does not update `df` when pandas Copy-on-Write is active.
train_set = (
    train_set
    # errors='ignore' keeps this cell re-runnable after 'Emp #' has been removed.
    .drop(columns='Emp #', errors='ignore')
    # Fill gaps with the modes computed on the training split.
    .fillna({
        'YearsOfStudy': YearsOfStudy_impute_value,
        'Qualification': Qualification_impute_value,
        'Degree': Degree_impute_value,
        'EducationalInstitute': EducationalInstitute_impute_value,
    })
)

# Negative line counts are reset to 0.
train_set.loc[train_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

# Encode the remaining text columns as integer codes (modifies train_set in place).
convertCategorisToCodes(train_set.select_dtypes(['object']).columns.values, train_set)

In [11]:
# Separate the target column 'Salary' from the predictor columns.
# NOTE(review): 'Unnamed: 0' looks like a row index saved in the CSV and is still
# among the predictors here; confirm that using it as a feature is intended.
train_y = train_set['Salary']
train_X = train_set.drop(columns=['Salary'])

#### SGDRegressor - model

In [12]:
# Standardise the features. The scaler is fitted on the training features only;
# the same fitted object is reused later to transform the test features.
sc = StandardScaler()
std_train_X = sc.fit_transform(train_X)

In [13]:
# Baseline: SGDRegressor with its default hyperparameters on the standardised features.
sgd = SGDRegressor(random_state=42).fit(std_train_X, train_y)

# Score the fitted model on the same data it was trained on.
train_prediction = sgd.predict(std_train_X)
r2_train = r2_score(y_true=train_y, y_pred=train_prediction)
train_mse = mean_squared_error(y_true=train_y, y_pred=train_prediction)

print("Train R-Square score: ", r2_train)
print("Train Root Mean Squared Error : ", np.sqrt(train_mse))

Train R-Square score:  0.6281080908599284
Train Root Mean Squared Error :  11780.243649839898


#### SGDRegressor - Hyperparameter tuning

For reference, the `SGDRegressor` signature with its default values:

`SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)`

In [22]:
# Experiments to try by hand in this cell:
#   * Is max_iter=1000 really needed? Re-fit with fewer iterations and compare.
#   * Does the model converge at eta0=0.01? Also try a larger value (0.1) and
#     smaller ones (0.001 or 0.0001).
sgd = SGDRegressor(max_iter=100, eta0=0.01, random_state=42).fit(std_train_X, train_y)

# Score the re-fitted model on the training data.
train_prediction = sgd.predict(std_train_X)
r2_train = r2_score(y_true=train_y, y_pred=train_prediction)
train_mse = mean_squared_error(y_true=train_y, y_pred=train_prediction)

print("Train R-Square score: ", r2_train)
print("Train Root Mean Squared Error : ", np.sqrt(train_mse))

Train R-Square score:  0.6281080908599284
Train Root Mean Squared Error :  11780.243649839898


In [23]:
# Clean the test split with the same steps and the same impute values as the
# training split. The frame is rebuilt and re-assigned instead of being mutated
# with `inplace=True`: `df[col].fillna(..., inplace=True)` operates on a temporary
# Series and does not update `df` when pandas Copy-on-Write is active.
test_set = (
    test_set
    # errors='ignore' keeps this cell re-runnable after 'Emp #' has been removed.
    .drop(columns='Emp #', errors='ignore')
    # The impute values were computed on the training split.
    .fillna({
        'YearsOfStudy': YearsOfStudy_impute_value,
        'Qualification': Qualification_impute_value,
        'Degree': Degree_impute_value,
        'EducationalInstitute': EducationalInstitute_impute_value,
    })
)

# Negative line counts are reset to 0.
test_set.loc[test_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

# Encode the remaining text columns as integer codes (modifies test_set in place).
convertCategorisToCodes(test_set.select_dtypes(['object']).columns.values, test_set)

# Separate the predictors from the target column 'Salary'.
test_X = test_set.loc[:, [x for x in test_set.columns if x not in ['Salary']]]
test_y = test_set['Salary']

# Reuse the scaler fitted on the training features; it is not re-fitted here.
std_test_X = sc.transform(test_X)

In [24]:
# Score the SGD model on the held-out test split.
test_prediction = sgd.predict(std_test_X)
r2_test = r2_score(y_true=test_y, y_pred=test_prediction)
test_mse = mean_squared_error(y_true=test_y, y_pred=test_prediction)

print("Test R-Square score: ", r2_test)
print("Test Root Mean Squared Error : ", np.sqrt(test_mse))

Test R-Square score:  0.660359036619117
Test Root Mean Squared Error :  11553.388596344645


#### We know that a 3rd-degree polynomial is the best model, so let's run K-Fold Cross-Validation with `GridSearchCV` for model selection

In [28]:
# Pipeline: expand the features polynomially, standardise them, then fit SGD.
# Scaling is a pipeline step, so it is re-fitted on the training part of every fold.
poly_lr_reg_pipeline = Pipeline([
        ("poly_features", PolynomialFeatures()),
        ("std_scaling", StandardScaler()),
        ("sgd_reg", SGDRegressor(random_state=42))
    ])

# Search space: 20 learning rates x 2 iteration limits at a fixed polynomial degree.
param_grid = {
        "poly_features__degree": [3],
        "sgd_reg__eta0": np.linspace(0.01, 0.001, 20),
        "sgd_reg__max_iter": [300, 400]
    }

grid_search = GridSearchCV(poly_lr_reg_pipeline, param_grid, cv=20, scoring='r2')

# Silence warnings only while the search runs. A global
# warnings.filterwarnings("ignore") would also hide every warning raised by the
# cells executed after this one.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(train_X, train_y)

print("Best params : ", grid_search.best_params_)
print("Best estimator : ", grid_search.best_estimator_)

Best params :  {'poly_features__degree': 3, 'sgd_reg__eta0': 0.003842105263157895, 'sgd_reg__max_iter': 300}
Best estimator :  Pipeline(memory=None,
         steps=[('poly_features',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('std_scaling',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('sgd_reg',
                 SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
                              epsilon=0.1, eta0=0.003842105263157895,
                              fit_intercept=True, l1_ratio=0.15,
                              learning_rate='invscaling', loss='squared_loss',
                              max_iter=300, n_iter_no_change=5, penalty='l2',
                              power_t=0.25, random_state=42, shuffle=True,
                              tol=0.001, validation_fraction=0.1, verbose=0,
                  

In [29]:
# Score the best pipeline found by the search on the training split.
# grid_search.predict forwards to the refitted best_estimator_.
train_prediction = grid_search.predict(train_X)
r2_train = r2_score(y_true=train_y, y_pred=train_prediction)
train_mse = mean_squared_error(y_true=train_y, y_pred=train_prediction)

print("Train R-Square score: ", r2_train)
print("Train Root Mean Squared Error : ", np.sqrt(train_mse))

Train R-Square score:  0.9219325751234801
Train Root Mean Squared Error :  5397.353150771882


#### Apply BEST model to TRAIN and TEST

In [30]:
# Score the best pipeline found by the search on the held-out test split.
# grid_search.predict forwards to the refitted best_estimator_.
test_prediction = grid_search.predict(test_X)
r2_test = r2_score(y_true=test_y, y_pred=test_prediction)
test_mse = mean_squared_error(y_true=test_y, y_pred=test_prediction)

print("Test R-Square score: ", r2_test)
print("Test Root Mean Squared Error : ", np.sqrt(test_mse))

Test R-Square score:  0.9133776003342751
Test Root Mean Squared Error :  5834.645229649296
