In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error



#Models
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor



In [2]:
df = pd.read_csv("clean_data.csv")

In [3]:
df.head()

Unnamed: 0,working_hour,sprint,sprint_tasks_awaiting,tasks_completed,tasks_delayed,tasks_incomplete,team_score_avg,monthly_salary,title
0,2055,20,0,37,1,1,8,5248,Human Resources
1,2015,23,3,36,2,3,9,3072,Software Developer
2,2133,26,2,26,0,4,6,3158,Quality Analysis and Testing
3,1930,23,0,36,1,2,9,5625,Software Developer
4,1907,26,1,31,0,2,7,3852,Database Admin


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   working_hour           720 non-null    int64 
 1   sprint                 720 non-null    int64 
 2   sprint_tasks_awaiting  720 non-null    int64 
 3   tasks_completed        720 non-null    int64 
 4   tasks_delayed          720 non-null    int64 
 5   tasks_incomplete       720 non-null    int64 
 6   team_score_avg         720 non-null    int64 
 7   monthly_salary         720 non-null    int64 
 8   title                  506 non-null    object
dtypes: int64(8), object(1)
memory usage: 50.8+ KB


In [5]:
df.head()

Unnamed: 0,working_hour,sprint,sprint_tasks_awaiting,tasks_completed,tasks_delayed,tasks_incomplete,team_score_avg,monthly_salary,title
0,2055,20,0,37,1,1,8,5248,Human Resources
1,2015,23,3,36,2,3,9,3072,Software Developer
2,2133,26,2,26,0,4,6,3158,Quality Analysis and Testing
3,1930,23,0,36,1,2,9,5625,Software Developer
4,1907,26,1,31,0,2,7,3852,Database Admin


## Preprocessing

In [6]:
lbe = LabelEncoder()
df["title"] = lbe.fit_transform(df["title"])

In [7]:
X = df.loc[:, df.columns != 'monthly_salary']
y = df.loc[:,'monthly_salary']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.10, 
                                                    random_state=42)

In [9]:
#Outlier
cols = X_train.select_dtypes(["int64"]).columns
for row in cols:
    Q1 = X_train[row].quantile(0.25)
    Q3 = X_train[row].quantile(0.75)
    IQR = Q3-Q1
    lower_band = Q1- 1.5*IQR
    upper_band = Q3 + 1.5*IQR
    filter_iqr = (X_train[row] < lower_band) | ((X_train[row] > upper_band))
    filter_iqr.sum()
    print(filter_iqr.sum())

0
0
0
0
0
0
0


In [10]:
X_train.head()

Unnamed: 0,working_hour,sprint,sprint_tasks_awaiting,tasks_completed,tasks_delayed,tasks_incomplete,team_score_avg,title
81,1904,21,1,34,1,3,5,9
382,2090,20,4,36,2,4,7,3
148,2023,21,3,32,1,0,8,4
536,1959,21,0,37,2,4,5,3
581,2107,26,4,27,2,1,6,4


In [11]:
scaler = StandardScaler()
columns = X_train.columns
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = columns)

## Models

In [12]:
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

883.2413308816434


In [14]:
knn_model = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

1039.6522878489855

In [15]:
svr_model = SVR().fit(X_train, y_train)
y_pred = svr_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

884.4515372275679

In [16]:
cart_model = DecisionTreeRegressor()
cart_model.fit(X_train, y_train)
y_pred = cart_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

1343.535758114883

In [17]:
rf_model = RandomForestRegressor(random_state = 42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

889.9744189402612

In [18]:
lgbm = LGBMRegressor()
lgbm_model = lgbm.fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))


1003.5229704540069

## Fine Tuning Best Algorithm

In [19]:
X_train

Unnamed: 0,working_hour,sprint,sprint_tasks_awaiting,tasks_completed,tasks_delayed,tasks_incomplete,team_score_avg,title
0,-1.739194,-0.982993,-0.616965,0.679661,0.050458,0.620729,-1.471163,0.890265
1,0.439032,-1.481022,1.458088,1.013819,1.261440,1.325164,-0.311414,-0.924811
2,-0.345598,-0.982993,0.766403,0.345503,0.050458,-1.492576,0.268460,-0.622299
3,-1.095095,-0.982993,-1.308650,1.180898,1.261440,1.325164,-1.471163,-0.924811
4,0.638117,1.507153,1.458088,-0.489892,1.261440,-0.788141,-0.891289,-0.622299
...,...,...,...,...,...,...,...,...
643,-1.551819,0.511095,0.766403,-1.158208,-1.160525,0.620729,-0.891289,-0.924811
644,-0.579816,1.507153,0.074719,-0.991129,0.050458,-0.788141,0.848335,0.587752
645,-1.001408,-0.484964,0.074719,-0.991129,-1.160525,1.325164,-0.311414,-0.924811
646,1.036287,-0.484964,-1.308650,0.345503,1.261440,0.620729,0.268460,1.192778


In [20]:
svr_params = {"degree":[2,3,4,5] ,"C": np.arange(0.1,2,0.1)}
svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 10).fit(X_train,y_train)
print(pd.Series(svr_cv_model.best_params_))
svr_tuned = SVR(kernel ="linear", 
                C = pd.Series(svr_cv_model.best_params_)[0],degree = 2).fit(X_train, y_train)
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

C         0.1
degree    2.0
dtype: float64


883.9519980889748

## Model Saving

In [21]:
import pickle

In [22]:
pickle.dump(svr_tuned, open("SVRmodel", 'wb'))
pickle.dump(scaler, open("scaler", 'wb'))
pickle.dump(lbe, open("laberEncoder", 'wb'))