In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [75]:
data= pd.read_csv("maaslar_yeni.csv")
data.head()

Unnamed: 0,Calisan ID,unvan,UnvanSeviyesi,Kidem,Puan,maas
0,1,Cayci,1,5,70,2250
1,2,Sekreter,2,5,70,2500
2,3,Uzman Yardimcisi,3,5,70,3000
3,4,Uzman,4,5,70,4000
4,5,Proje Yoneticisi,5,5,70,5500


Preprocessing

In [76]:
data.drop(columns=["Calisan ID"], inplace=True)

#applying one-hot encoding for the categorical variable "unvan"
#data=pd.get_dummies(data,columns=["unvan"])
#data.head()

X=data.drop(columns=["maas","unvan"])#selecting independent variables as x
y=data["maas"]


In [77]:
#feature selection depending on the p-value
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X,y)

ols=sm.OLS(model.predict(X),X)
print(ols.fit().summary())
print('Linear R2 degeri')
print(r2_score(y, model.predict(X)))

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.903
Model:                            OLS   Adj. R-squared (uncentered):              0.892
Method:                 Least Squares   F-statistic:                              83.89
Date:                Sat, 09 Mar 2024   Prob (F-statistic):                    8.38e-14
Time:                        22:15:11   Log-Likelihood:                         -295.74
No. Observations:                  30   AIC:                                      597.5
Df Residuals:                      27   BIC:                                      601.7
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

Polynomial Regression

In [78]:
from sklearn.preprocessing import PolynomialFeatures
poly_features=PolynomialFeatures(degree=2)
X_poly=poly_features.fit_transform(X)
model.fit(X_poly,y)
print('poly OLS')
model2=sm.OLS(model.predict(X_poly),X)
print(model2.fit().summary())

print('Polynomial R2 degeri')
print(r2_score(y, model.predict(X_poly)))


poly OLS
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.729
Model:                            OLS   Adj. R-squared (uncentered):              0.698
Method:                 Least Squares   F-statistic:                              24.16
Date:                Sat, 09 Mar 2024   Prob (F-statistic):                    8.31e-08
Time:                        22:15:11   Log-Likelihood:                         -314.41
No. Observations:                  30   AIC:                                      634.8
Df Residuals:                      27   BIC:                                      639.0
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

Feature Scaling

In [81]:
from sklearn.preprocessing import StandardScaler
#scaling for hyperparameter sensitivity
sc1=StandardScaler()
x_scaled= sc1.fit_transform(X)
sc2=StandardScaler()
y_scaled = np.ravel(sc2.fit_transform(y.values.reshape(-1,1)))

Support Vector Regression

In [82]:
from sklearn.svm import SVR

svr_reg = SVR(kernel='rbf')
svr_reg.fit(x_scaled,y_scaled)


model3=sm.OLS(svr_reg.predict(x_scaled),x_scaled)
print(model3.fit().summary())


print('R2 score for svr')
print(r2_score(y_scaled, svr_reg.predict(x_scaled)))

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.782
Model:                            OLS   Adj. R-squared (uncentered):              0.758
Method:                 Least Squares   F-statistic:                              32.37
Date:                Sat, 09 Mar 2024   Prob (F-statistic):                    4.34e-09
Time:                        22:17:30   Log-Likelihood:                        -0.92453
No. Observations:                  30   AIC:                                      7.849
Df Residuals:                      27   BIC:                                      12.05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Desicion Tree

In [83]:
#Decision Tree Regresyon
from sklearn.tree import DecisionTreeRegressor
r_dt = DecisionTreeRegressor(random_state=0)
r_dt.fit(X,y)


print('Decision Tree OLS')
model4=sm.OLS(r_dt.predict(X),X)
print(model4.fit().summary())

print('Decision Tree R2 degeri')
print(r2_score(y, r_dt.predict(X)))


Decision Tree OLS
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.679
Model:                            OLS   Adj. R-squared (uncentered):              0.644
Method:                 Least Squares   F-statistic:                              19.08
Date:                Sat, 09 Mar 2024   Prob (F-statistic):                    7.62e-07
Time:                        22:17:35   Log-Likelihood:                         -317.95
No. Observations:                  30   AIC:                                      641.9
Df Residuals:                      27   BIC:                                      646.1
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------

Random Forest

In [84]:
from sklearn.ensemble import RandomForestRegressor
rf_reg=RandomForestRegressor(n_estimators = 10,random_state=0)
rf_reg.fit(X,y.ravel())



print('Random Forest OLS')
model5=sm.OLS(rf_reg.predict(X),X)
print(model5.fit().summary())



print('Random Forest R2 degeri')
print(r2_score(y, rf_reg.predict(X)))

Random Forest OLS
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.713
Model:                            OLS   Adj. R-squared (uncentered):              0.681
Method:                 Least Squares   F-statistic:                              22.31
Date:                Sat, 09 Mar 2024   Prob (F-statistic):                    1.79e-07
Time:                        22:17:40   Log-Likelihood:                         -316.07
No. Observations:                  30   AIC:                                      638.1
Df Residuals:                      27   BIC:                                      642.3
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------