In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit

In [2]:
# Load the raw employee records. The preview of this frame shows an 'Unnamed: 0'
# column, which looks like a row index that was saved into the CSV -- TODO confirm.
employee_data = pd.read_csv("../../Data/regression_employee_data.csv")

In [3]:
# Preview the first rows to check the column names and spot missing values.
employee_data.head()

Unnamed: 0,Emp #,Designation,Experience,Salary,AggrBehavScore,ActivityScore,LinesOfCode,Qualification,Degree,YearsOfStudy,EducationalInstitute,Gender,Empl_Band
0,e1,Director,16.0,85861,0.526928,5,1678,Grad,,15.0,clg,M,A
1,e2,Jr.Dev,3.0,25376,6.420421,18,6082,Grad,Btech,16.0,clg,F,C
2,e3,Sr.Dev,6.0,38193,2.802337,7,2927,PostGrad,Msc,17.0,unv,M,B
3,e4,Jr.Dev,3.0,22807,9.374888,23,10249,,Mtech,18.0,,F,C
4,e5,Sr.Dev,4.0,37624,3.336564,10,3159,,Btech,16.0,,F,B


* Step 1: Split data into Train and Test

In [4]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
shuffleSplit = ShuffleSplit(n_splits=1, test_size=0.2, random_state=33)

# ShuffleSplit yields *positional* row numbers, so select with .iloc rather than
# the label-based .loc (the two only agree while the index is a default RangeIndex).
# Only one split is requested, so take it directly instead of looping.
train_index, test_index = next(shuffleSplit.split(employee_data))

# Explicit copies: both splits are cleaned in place later on, and they must be
# independent frames rather than selections derived from employee_data.
train_set = employee_data.iloc[train_index].copy()
test_set = employee_data.iloc[test_index].copy()

In [5]:
# (rows, columns) of the training split.
train_set.shape

(1209, 13)

In [6]:
# (rows, columns) of the held-out test split.
test_set.shape

(303, 13)

* Fix invalid values and impute nulls with the training-set mode

In [7]:
# Replace every negative LinesOfCode value with 0, selecting the rows with a
# boolean mask.
train_set.loc[train_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

In [8]:
# Most frequent YearsOfStudy value in the training split; stored in a variable so
# the same value can be reused when other data is imputed.
YearsOfStudy_impute_value = train_set['YearsOfStudy'].mode().values[0]

# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is chained assignment: it warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
train_set['YearsOfStudy'] = train_set['YearsOfStudy'].fillna(YearsOfStudy_impute_value)

In [9]:
# Most frequent Qualification value in the training split; stored in a variable so
# the same value can be reused when other data is imputed.
Qualification_impute_value = train_set['Qualification'].mode().values[0]

# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is chained assignment: it warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
train_set['Qualification'] = train_set['Qualification'].fillna(Qualification_impute_value)

In [10]:
# Most frequent Degree value in the training split; stored in a variable so the
# same value can be reused when other data is imputed.
Degree_impute_value = train_set['Degree'].mode().values[0]

# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is chained assignment: it warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
train_set['Degree'] = train_set['Degree'].fillna(Degree_impute_value)

In [11]:
# Most frequent EducationalInstitute value in the training split; stored in a
# variable so the same value can be reused when other data is imputed.
EducationalInstitute_impute_value = train_set['EducationalInstitute'].mode().values[0]

# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is chained assignment: it warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
train_set['EducationalInstitute'] = train_set['EducationalInstitute'].fillna(
    EducationalInstitute_impute_value
)

* Convert all categorical features to numbers

In [12]:
# 'Emp #' is removed from the training frame before the remaining columns are used.
train_set = train_set.drop(columns=['Emp #'])

* Let's convert categorical columns to numbers (Label Encoding)

In [13]:
# Allowed categories for each categorical column. A value's position in its list
# is the integer code it receives when the column is label-encoded.
cat_list_dict = dict(
    Designation=['Sr.Dev', 'Analyst', 'Sr.Analyst', 'Manager', 'Jr.Dev', 'Director', 'Sr.Manager'],
    Qualification=['Grad', 'PostGrad', 'Phd', 'postdoc'],
    Degree=['Btech', 'Bsc', 'Msc', 'Mba', 'IntMsc', 'B.A', 'Mtech'],
    EducationalInstitute=['unv', 'iit', 'clg', 'cunv'],
    Gender=['M', 'F'],
    Empl_Band=['B', 'C', 'A'],
)

In [14]:
def convertCategorisToCodes(columns, df, category_map=None):
    """Label-encode the given columns of ``df`` in place.

    Each listed column is replaced by integer codes, where a value's code is its
    position in that column's category list. Values that are missing or not
    present in the category list are encoded as -1.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    category_map : dict of str -> list, optional
        Ordered category list per column. Defaults to the notebook-level
        ``cat_list_dict`` so existing two-argument calls behave as before.

    Raises
    ------
    KeyError
        If a column is missing from ``df`` or has no entry in the category map.
    """
    if category_map is None:
        # Backward-compatible default: the mapping defined earlier in the notebook.
        category_map = cat_list_dict
    for column in columns:
        # Fixing the category order makes the codes stable across data splits,
        # regardless of which values happen to appear in a given frame.
        encoded = pd.Categorical(df[column], categories=category_map[column])
        df[column] = encoded.codes

In [15]:
# Encode every column of the training frame whose dtype is still 'object'.
convertCategorisToCodes(
    columns=train_set.select_dtypes(include=['object']).columns.values,
    df=train_set,
)

* Let's build a Polynomial Regression model

In [16]:
# Feature matrix (11 predictor columns) and the Salary target for training.
train_X = train_set[[
    'Designation', 'Experience', 'AggrBehavScore', 'ActivityScore', 'LinesOfCode',
    'Qualification', 'Degree', 'YearsOfStudy', 'EducationalInstitute', 'Gender',
    'Empl_Band',
]]
train_y = train_set.loc[:, 'Salary']

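* When we expand a dataset with $n$ features to a degree-$d$ polynomial, the number of columns becomes $\binom{n+d}{d} = \frac{(n+d)!}{n!\,d!}$ (including the bias column). Here $n = 11$ and $d = 2$, giving $\frac{13!}{11!\,2!} = 78$ columns.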

In [17]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: a constant (bias) column, the original features, their
# squares and all pairwise products.
poly_features = PolynomialFeatures(degree=2, include_bias=True)

# Learn the expansion from the training features, then apply it to them.
poly_features.fit(train_X)
train_X_poly = poly_features.transform(train_X)
train_X_poly.shape

(1209, 78)

In [18]:
# Ordinary least squares on the polynomial features. fit() returns the estimator
# itself, so the model can be created and trained in one expression.
poly_model = LinearRegression().fit(train_X_poly, train_y)
poly_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# Predictions on the same data the model was fitted on (used for the training metrics).
train_prediction = poly_model.predict(train_X_poly)

In [20]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [21]:
# Goodness of fit on the training data: R^2 and mean squared error.
train_r2_score = r2_score(y_true=train_y, y_pred=train_prediction)
train_mse = mean_squared_error(y_true=train_y, y_pred=train_prediction)

# RMSE is reported as well because it is in the same units as Salary.
print("Train R2 Score : ", train_r2_score)
print("Train Mean Squared Error : ", train_mse)
print("Train Root Mean Squared Error : ", np.sqrt(train_mse))

Train R2 Score :  0.9089509097675514
Train Mean Squared Error :  33975622.82261561
Train Root Mean Squared Error :  5828.861194317086


* Let's test the model on TEST set

In [22]:
# Apply to the test split the same cleaning steps used on the training split.

# Drop the 'Emp #' column. Rebinding to the result of drop() also gives a fresh
# frame that is safe to modify below.
test_set = test_set.drop(columns=['Emp #'])

# Replace every negative LinesOfCode value with 0.
test_set.loc[test_set['LinesOfCode'] < 0, 'LinesOfCode'] = 0

# Fill nulls with the values computed from the TRAINING split, so no test-set
# statistics are used. Filling on the frame and assigning the result back avoids
# column-level fillna(..., inplace=True), which is chained assignment: it warns on
# recent pandas and leaves the frame unchanged under Copy-on-Write.
test_set = test_set.fillna(value={
    'YearsOfStudy': YearsOfStudy_impute_value,
    'Qualification': Qualification_impute_value,
    'Degree': Degree_impute_value,
    'EducationalInstitute': EducationalInstitute_impute_value,
})

# Label-encode the remaining 'object' columns with the same category lists.
convertCategorisToCodes(test_set.select_dtypes(['object']).columns.values, test_set)

In [23]:
# Same 11 predictor columns, in the same order, as used for training, plus the
# Salary target for evaluation.
test_X = test_set[[
    'Designation', 'Experience', 'AggrBehavScore', 'ActivityScore', 'LinesOfCode',
    'Qualification', 'Degree', 'YearsOfStudy', 'EducationalInstitute', 'Gender',
    'Empl_Band',
]]
test_y = test_set.loc[:, 'Salary']

In [24]:
# Only transform() here: the expansion fitted on the training features is reused,
# not re-fitted on the test data.
test_X_poly = poly_features.transform(test_X)
test_X_poly.shape

(303, 78)

In [25]:
# Predictions on the held-out test data (used for the test metrics).
test_prediction = poly_model.predict(test_X_poly)

In [26]:
# Same metrics as for training, now on the held-out data.
test_r2_score = r2_score(y_true=test_y, y_pred=test_prediction)
test_mse = mean_squared_error(y_true=test_y, y_pred=test_prediction)

# RMSE is reported as well because it is in the same units as Salary.
print("TEST R2 Score : ", test_r2_score)
print("TESt Mean Squared Error : ", test_mse)
print("TEST Root Mean Squared Error : ", np.sqrt(test_mse))

TEST R2 Score :  0.9103571772225845
TESt Mean Squared Error :  35230128.041615605
TEST Root Mean Squared Error :  5935.497286800459


https://services.math.duke.edu/education/ccp/materials/mvcalc/surfaces/surf2.html