<h1><center>Model Selection - GridSearchCV Approach</center></h1>

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [24]:
# Load the employee records; the path is relative to this notebook's folder.
employee_csv_path = "../../Data/regression_employee_data.csv"
employee_data = pd.read_csv(employee_csv_path)

In [25]:
# Preview the first five rows to check column names and spot missing values.
employee_data.head(n=5)

Unnamed: 0,Emp #,Designation,Experience,Salary,AggrBehavScore,ActivityScore,LinesOfCode,Qualification,Degree,YearsOfStudy,EducationalInstitute,Gender,Empl_Band
0,e1,Director,16.0,85861,0.526928,5,1678,Grad,,15.0,clg,M,A
1,e2,Jr.Dev,3.0,25376,6.420421,18,6082,Grad,Btech,16.0,clg,F,C
2,e3,Sr.Dev,6.0,38193,2.802337,7,2927,PostGrad,Msc,17.0,unv,M,B
3,e4,Jr.Dev,3.0,22807,9.374888,23,10249,,Mtech,18.0,,F,C
4,e5,Sr.Dev,4.0,37624,3.336564,10,3159,,Btech,16.0,,F,B


#### Split data into Train and Test

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
shuffleSplit = ShuffleSplit(n_splits=1, test_size=0.2, random_state=33)

# With n_splits=1 the splitter yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(shuffleSplit.split(employee_data))

# split() returns *positional* indices, so select with .iloc: label-based .loc only
# gives the same rows while the frame still has its default 0..n-1 index.
# .copy() makes each subset an independent frame for the cleaning cells that follow.
train_set = employee_data.iloc[train_index].copy()
test_set = employee_data.iloc[test_index].copy()

In [27]:
# (rows, columns) of the training split.
train_set.shape

(1209, 13)

In [28]:
# (rows, columns) of the held-out test split.
test_set.shape

(303, 13)

#### Data Pre-Processing

In [29]:
# Allowed values for each categorical column. The position of a value in its list
# is the integer code it is replaced with, so the order of every list matters.
cat_list_dict = {
    'Designation': [
        'Sr.Dev', 'Analyst', 'Sr.Analyst', 'Manager',
        'Jr.Dev', 'Director', 'Sr.Manager',
    ],
    'Qualification': ['Grad', 'PostGrad', 'Phd', 'postdoc'],
    'Degree': [
        'Btech', 'Bsc', 'Msc', 'Mba',
        'IntMsc', 'B.A', 'Mtech',
    ],
    'EducationalInstitute': ['unv', 'iit', 'clg', 'cunv'],
    'Gender': ['M', 'F'],
    'Empl_Band': ['B', 'C', 'A'],
}

In [30]:
def convertCategorisToCodes(columns, df, category_map=None):
    """Replace categorical columns of ``df`` with integer category codes, in place.

    Args:
        columns: Iterable of column names in ``df`` to encode.
        df: DataFrame to modify. It is changed in place; nothing is returned.
        category_map: Optional mapping of column name -> ordered list of
            allowed categories. When omitted, the notebook-level
            ``cat_list_dict`` is used, which keeps existing two-argument
            calls working unchanged.

    Each value is replaced by its position in the column's category list.
    Missing values and values that are not in the list are encoded as -1.

    Raises:
        KeyError: If a column has no entry in the category mapping.
    """
    if category_map is None:
        category_map = cat_list_dict
    for column in columns:
        # Build the categorical with the fixed category order and keep only its codes.
        df[column] = pd.Categorical(df[column], categories=category_map[column]).codes

In [31]:
# Most frequent value of each column that has gaps, taken from the training rows only.
# The same values are reused later to fill the test rows.
(
    YearsOfStudy_impute_value,
    Qualification_impute_value,
    Degree_impute_value,
    EducationalInstitute_impute_value,
) = (
    train_set[column_name].mode().values[0]
    for column_name in ('YearsOfStudy', 'Qualification', 'Degree', 'EducationalInstitute')
)

In [32]:
# Clean the training rows. Each step rebinds train_set or assigns through a single
# .loc call; chained `df[col].fillna(..., inplace=True)` is avoided because it acts
# on a temporary column object and is not guaranteed to update the frame.

# Drop the employee identifier. errors='ignore' lets the cell be re-run safely.
train_set = train_set.drop(columns='Emp #', errors='ignore')

# Negative line counts are replaced with zero.
train_set.loc[train_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

# Fill missing values with the per-column modes computed from the training split.
train_set = train_set.fillna({
    'YearsOfStudy': YearsOfStudy_impute_value,
    'Qualification': Qualification_impute_value,
    'Degree': Degree_impute_value,
    'EducationalInstitute': EducationalInstitute_impute_value,
})

# Encode every remaining string column as integer category codes (modifies train_set).
convertCategorisToCodes(train_set.select_dtypes(['object']).columns.values, train_set)

In [33]:
# Predictor columns fed to the model; Salary is the regression target.
feature_columns = [
    'Designation', 'Experience', 'AggrBehavScore', 'ActivityScore',
    'LinesOfCode', 'Qualification', 'Degree', 'YearsOfStudy',
    'EducationalInstitute', 'Gender', 'Empl_Band',
]
train_X = train_set.loc[:, feature_columns]
train_y = train_set['Salary']

### Model Selection Process - GridSearchCV

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#### How does cross-validation work?

In [35]:
# Candidate model: expand the inputs into polynomial terms, then fit ordinary least squares.
poly_lr_reg_pipeline = Pipeline(steps=[
    ("poly_features", PolynomialFeatures()),
    ("lr", LinearRegression()),
])

# "<step name>__<parameter>" addresses a parameter inside a pipeline step;
# here the search tries polynomial degrees 1 through 4.
param_grid = {"poly_features__degree": [1, 2, 3, 4]}

# Score each degree by R^2 with 3-fold cross-validation on the training data.
grid_search = GridSearchCV(
    estimator=poly_lr_reg_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
)
grid_search.fit(train_X, train_y)

# grid_search.best_params_ holds only the winning degree; the whole pipeline is printed here.
print("Best estimator : ", grid_search.best_estimator_)

Best estimator :  Pipeline(memory=None,
         steps=[('poly_features',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('lr',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)


In [36]:
# Score the selected pipeline on the rows it was fitted on. This is an optimistic
# figure, useful mainly for comparison with the test-set score.
best_model = grid_search.best_estimator_
train_prediction = best_model.predict(train_X)

r2_train = r2_score(train_y, train_prediction)
train_mse = mean_squared_error(train_y, train_prediction)
train_rmse = np.sqrt(train_mse)  # RMSE is in the same units as Salary

print("Train R-Square score: ", r2_train)
print("Train Root Mean Squared Error : ", train_rmse)

Train R-Square score:  0.986900328761218
Train Root Mean Squared Error :  2210.935668970049


#### Apply BEST model to TRAIN and TEST

In [37]:
# Apply the training-time cleaning to the held-out rows. Each step rebinds test_set
# or assigns through a single .loc call; chained `df[col].fillna(..., inplace=True)`
# is avoided because it acts on a temporary column object and is not guaranteed to
# update the frame.

# Drop the employee identifier. errors='ignore' lets the cell be re-run safely.
test_set = test_set.drop(columns='Emp #', errors='ignore')

# Negative line counts are replaced with zero.
test_set.loc[test_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

# Fill gaps with the modes learned from the TRAINING split, so no statistic is
# computed from the test rows.
test_set = test_set.fillna({
    'YearsOfStudy': YearsOfStudy_impute_value,
    'Qualification': Qualification_impute_value,
    'Degree': Degree_impute_value,
    'EducationalInstitute': EducationalInstitute_impute_value,
})

# Encode every remaining string column as integer category codes (modifies test_set).
convertCategorisToCodes(test_set.select_dtypes(['object']).columns.values, test_set)

# Same predictor columns, in the same order, as used for training.
test_X = test_set.loc[:, ['Designation', 'Experience', 'AggrBehavScore', 'ActivityScore', 'LinesOfCode', 'Qualification',
 'Degree', 'YearsOfStudy', 'EducationalInstitute', 'Gender', 'Empl_Band']]
test_y = test_set['Salary']

In [38]:
# Score the selected pipeline on the held-out rows it never saw during fitting.
best_model = grid_search.best_estimator_
test_prediction = best_model.predict(test_X)

r2_test = r2_score(test_y, test_prediction)
test_mse = mean_squared_error(test_y, test_prediction)
test_rmse = np.sqrt(test_mse)  # RMSE is in the same units as Salary

print("Test R-Square score: ", r2_test)
print("Test Root Mean Squared Error : ", test_rmse)

Test R-Square score:  0.9817350649589138
Test Root Mean Squared Error :  2679.22023738637
