In [194]:
from numpy import mean,std 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import os
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from skmultilearn.adapt import MLkNN
from IPython.display import display 
import xgboost as xgb 
import plotly.express as px
import plotly.graph_objects as go 
import sklearn.metrics as metrics 
from sklearn.linear_model import LinearRegression

# Data preprocessing 
# Identify outliers
def findOutlier(list_name): 
    s1 = [list_name[i][6] for i in range(len(list_name))]
    data_mean = mean(s1)
    data_std = std(s1)
    cut_off = data_std * 3
    lower = data_mean - cut_off 
    upper = data_mean + cut_off    
    df = pd.DataFrame(list_name, columns=['row_id', 'c_labor', 'begin_date', 'end_date', 'education', 'years_experience', 'current_year', 'next_year', 'second_year', 'allwyn_job'])
    df = df.drop(df[(df.current_year > upper) | (df.current_year < lower)].index)
    return df


conn = sqlite3.connect('calc.db')
c = conn.cursor()

# Get data 
test = c.execute("SELECT * FROM predictive_modeling_tbl;").fetchall()
s1 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'IT Project Manager I';").fetchall()
s2 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'IT Project Manager III';").fetchall()
s3 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Senior Computer Security Systems Specialist';").fetchall()
s4 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Senior Security Analyst';").fetchall()
s5 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Cloud Engineer';").fetchall()
s6 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Senior Data Scientist';").fetchall()
s7 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'User Experience (UX) Developer';").fetchall()
s8 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Software Developer I';").fetchall()
s9 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Test Automation Engineer ';").fetchall()
s10 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Senior Java Developer';").fetchall()
s11 = c.execute("SELECT * FROM predictive_modeling_tbl WHERE jd_matching_allwyn = 'Software Developer III';").fetchall()


d1 = findOutlier(s1)
d2 = findOutlier(s2)
d3 = findOutlier(s3)
d4 = findOutlier(s4)
d5 = findOutlier(s5)
d6 = findOutlier(s6)
d7 = findOutlier(s7)
d8 = findOutlier(s8)
d9 = findOutlier(s9)
d10 = findOutlier(s10)
d11 = findOutlier(s11)

df = pd.concat([d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11])


df['next_year'] = pd.to_numeric(df['next_year'], errors='coerce')
df['next_year'] = df['next_year'].fillna(df['next_year'].median())
df['second_year'] = pd.to_numeric(df['second_year'], errors='coerce')
df['second_year'] = df['second_year'].fillna(df['second_year'].median()) 

In [195]:
d1.head(5)

Unnamed: 0,row_id,c_labor,begin_date,end_date,education,years_experience,current_year,next_year,second_year,allwyn_job
0,32,Assistant Project Coordinator,6/7/18,6/6/23,Bachelors,2,27.21,,,IT Project Manager I
1,181,Department Manager I,7/5/18,7/4/23,Bachelors,8,36.3,37.1,37.92,IT Project Manager I
2,247,Project Administrator I,6/7/18,6/6/23,Bachelors,4,38.07,,,IT Project Manager I
3,502,Buyer III,3/13/12,3/12/23,Bachelors,5,43.29,,,IT Project Manager I
4,554,Engineer I,10/23/19,10/22/24,Bachelors,0,44.1,44.98,45.88,IT Project Manager I


In [196]:
# Modeling
X = d1[['allwyn_job','years_experience','education']]
y = d1['current_year']

X = pd.get_dummies(X)
np.random.seed(2)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)

#'next_year','second_year'
#'allwyn_job','years_experience','education',

#X = X.drop(['allwyn_job_Senior Java Developer'], axis=1) 

In [197]:
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('Intercept: ', lm.intercept_)
print('Coefficients: ', lm.coef_)


print('RMSE: ', np.sqrt(np.mean((lm.predict(X_test) - y_test)**2)), '$/hr')
errors = abs(lm.predict(X_test) - y_test)
print('MAE', round(np.mean(errors),2), '$/hr')
mape = 100 * (errors / y_test)

accuracy = 100 - np.mean(mape)
print('accuracy: ', round(accuracy, 2), '%')
print('R-squared: ', lm.score(X_test, y_test))

Intercept:  115.34119510702719
Coefficients:  [ 5.54509307e+00  7.99360578e-14 -2.88713544e+01 -1.31710544e+01
  4.20424088e+01]
RMSE:  37.22423039472113 $/hr
MAE 29.3 $/hr
accuracy:  74.1 %
R-squared:  0.39393956414277054


In [198]:
coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
years_experience,5.545093
allwyn_job_IT Project Manager I,7.993606e-14
education_Bachelors,-28.87135
education_Masters,-13.17105
education_Ph.D.,42.04241


In [199]:
ic = lm.intercept_
ic 

115.34119510702719

Get input 

In [200]:
X.shape, y.shape

((448, 5), (448,))

In [201]:
new_input = [[3,1,1,0,0]]

In [202]:
new_output = lm.predict(new_input)
print('current year:',new_output, '$/hr')

current year: [103.10511993] $/hr


Future prediction 

In [203]:
nd1 = df[df['allwyn_job'] == 'IT Project Manager I']

In [204]:
nd1 = nd1[['current_year','next_year','second_year']]

In [205]:
nd1.head(5)

Unnamed: 0,current_year,next_year,second_year
0,27.21,111.28,112.84
1,36.3,37.1,37.92
2,38.07,111.28,112.84
3,43.29,111.28,112.84
4,44.1,44.98,45.88


In [206]:
nd1.dtypes

current_year    float64
next_year       float64
second_year     float64
dtype: object

In [207]:
corr = nd1.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,current_year,next_year,second_year
current_year,1.0,0.94,0.87
next_year,0.94,1.0,0.92
second_year,0.87,0.92,1.0


In [208]:
X1 = nd1['current_year']
y1 = nd1['next_year'] 
X1 = X1.values.reshape(-1,1)
y1 = y1.values.reshape(-1,1)

In [209]:
np.random.seed(2)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1,test_size = 0.3) 

In [210]:
lm1 = linear_model.LinearRegression()
lm1.fit(X1_train, y1_train)

print('Intercept: ', lm1.intercept_)
print('Coefficients: ', lm1.coef_)

print('RMSE: ', np.sqrt(np.mean((lm1.predict(X1_test) - y1_test)**2)), '$/hr')
errors = abs(lm1.predict(X1_test) - y1_test)
print('MAE', round(np.mean(errors),2), '$/hr')
mape = 100 * (errors / y1_test)

accuracy = 100 - np.mean(mape)
print('accuracy: ', round(accuracy, 2), '%')
print('R-squared: ', lm1.score(X1_test, y1_test))  

Intercept:  [11.55183454]
Coefficients:  [[0.90694511]]
RMSE:  12.363740340015319 $/hr
MAE 7.17 $/hr
accuracy:  94.4 %
R-squared:  0.9319135494088913


In [211]:
new_input1 = new_output.reshape(-1,1)
final_output = lm1.predict(new_input1)
print('second year:',final_output, '$/hr')

second year: [[105.06251913]] $/hr


In [212]:
X2 = nd1['second_year']
y2 = nd1['next_year'] 
X2 = X2.values.reshape(-1,1)
y2 = y2.values.reshape(-1,1)

In [213]:
np.random.seed(2)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2,test_size = 0.3) 

In [214]:
lm2 = linear_model.LinearRegression()
lm2.fit(X2_train, y2_train)

print('Intercept: ', lm2.intercept_)
print('Coefficients: ', lm2.coef_)

print('RMSE: ', np.sqrt(np.mean((lm2.predict(X2_test) - y2_test)**2)), '$/hr')
errors = abs(lm2.predict(X2_test) - y2_test)
print('MAE', round(np.mean(errors),2), '$/hr')
mape = 100 * (errors / y2_test)

accuracy = 100 - np.mean(mape)
print('accuracy: ', round(accuracy, 2), '%')
print('R-squared: ', lm2.score(X2_test, y2_test))  

Intercept:  [2.13110018]
Coefficients:  [[0.97179223]]
RMSE:  17.813278750632332 $/hr
MAE 6.6 $/hr
accuracy:  94.52 %
R-squared:  0.8586653422714686


In [215]:
new_input2 = final_output.reshape(-1,1)
final_output1 = lm2.predict(new_input2)
print('third year:',final_output1, '$/hr')

third year: [[104.23003972]] $/hr


In [216]:
def linear_model(new_output,X1,y1): 
    X1 = X1.values.reshape(-1,1)
    y1 = y1.values.reshape(-1,1)
    np.random.seed(2)
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1,test_size = 0.3) 
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    y_pred = lm.predict(X_test)

    print('Intercept: ', lm.intercept_)
    print('Coefficients: ', lm.coef_)


    print('RMSE: ', np.sqrt(np.mean((lm.predict(X_test) - y_test)**2)), '$/hr')
    errors = abs(lm.predict(X_test) - y_test)
    print('MAE', round(np.mean(errors),2), '$/hr')
    mape = 100 * (errors / y_test)

    accuracy = 100 - np.mean(mape)
    print('accuracy: ', round(accuracy, 2), '%')
    print('R-squared: ', lm.score(X_test, y_test)) 
    
    new_input1 = new_output.reshape(-1,1)
    final_output = lm1.predict(new_input1)
    print(final_output)
    return final_output

In [217]:
Xsec = nd1['current_year']
ysec = nd1['next_year'] 
linear_model(new_output,Xsec,ysec)

AttributeError: 'function' object has no attribute 'LinearRegression'