# Data Categories

The data is seperated in these categories; 
 
`ResponseId | MainBranch | Employment | RemoteWork| EdLevel| YearsCode | OrgSize | Country | Age | Gender | Trans | Sexuality | Ethnicity | Accessibility | MentalHealth | WorkExp | ConvertedCompYearly | C# | C | HTML/CSS,
  | Python | Dart | Bash/Shell | JavaScript | Java | Haskell | Assembly | Go | Groovy | Crystal | PHP | Delphi | C++ | Clojure | Kotlin | APL | Rust | TypeScript | COBOL | PowerShell | Scala | Elixir | F# | LISP | Ruby | Julia | MATLAB | Objective-C | Erlang | Swift | Fortran | OCaml | R | SQL | Perl | Lua | VBA`
  
Our Inputs for the model are;

`MainBranch | Employment | RemoteWork| EdLevel| YearsCode | OrgSize | Country | Age | Gender | Trans | Sexuality | Ethnicity | Accessibility | WorkExp`
  
Our Outputs for the model will be;

`ConvertedCompYearly`

# Importing data and libraries

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl

sns.set()

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix, classification_report

In [65]:
df = pd.read_csv("CleanedData.csv")
df

Unnamed: 0.1,Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,OrgSize,Country,Age,...,Objective-C,Erlang,Swift,Fortran,OCaml,R,SQL,Perl,Lua,VBA
0,11,12,1,1,1,1,3,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1,12,13,2,2,2,1,3,2,1,2,...,0,0,1,0,0,0,1,0,0,0
2,14,15,2,1,1,2,3,3,1,2,...,0,0,0,0,0,1,0,0,0,0
3,21,22,2,2,2,2,2,1,2,2,...,0,0,0,0,0,0,1,0,0,0
4,22,23,2,2,1,3,4,4,3,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21285,73110,73111,2,2,1,1,3,1,3,2,...,0,0,0,0,0,0,0,0,0,0
21286,73112,73113,2,2,2,1,2,5,24,2,...,0,0,0,0,0,0,0,0,0,0
21287,73113,73114,2,2,1,2,2,5,1,2,...,0,0,0,0,0,0,0,0,0,0
21288,73116,73117,2,2,2,1,4,9,1,1,...,0,0,0,0,0,0,0,0,0,0


In [66]:
df['ConvertedCompYearly']

0        194400
1         65000
2        110000
3         34126
4         97605
          ...  
21285     60906
21286     52255
21287     94000
21288    115000
21289     70000
Name: ConvertedCompYearly, Length: 21290, dtype: int64

# Define Input and Output

In [67]:
# Input data 
X = df[['MainBranch','Employment','RemoteWork','EdLevel','YearsCode','OrgSize','Country','Age','Gender','Trans','Sexuality','Ethnicity','Accessibility','WorkExp']]

# Output data 
y = df["ConvertedCompYearly"]

X = np.asarray(X).astype('float32')
y = np.asarray(y).astype('float32')



# Split data into training and test sets, 70% and 30%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Modeling


In [68]:
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV,ElasticNet,ElasticNetCV,LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import neighbors
from sklearn.svm import SVR

## Chosing a model we will use

In [69]:
knn=KNeighborsRegressor().fit(X_train,y_train)
ada=AdaBoostRegressor().fit(X_train,y_train)
svm=SVR().fit(X_train,y_train)
ridge=Ridge().fit(X_train,y_train)
lasso=Lasso().fit(X_train,y_train)
rf=RandomForestRegressor().fit(X_train,y_train)
gbm=GradientBoostingRegressor().fit(X_train,y_train)

models=[ridge,lasso,knn,ada,svm,rf,gbm]

def ML(Y,models):
    y_pred=models.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)
    rmse=np.sqrt(mean_squared_error(y_test,y_pred))
    r2=r2_score(y_test,y_pred)*100
    
    return mse,rmse,r2

for i in models:
    print("\n",i,"\n\nDifferent models success rate :",ML("salary_in_usd",i))


 Ridge() 

Different models success rate : (3050713900.0, 55233.27, 28.32680377971578)

 Lasso() 

Different models success rate : (3050701000.0, 55233.152, 28.32711658514182)

 KNeighborsRegressor() 

Different models success rate : (2776862700.0, 52695.945, 34.76064709813911)

 AdaBoostRegressor() 

Different models success rate : (3263360319.2028055, 57125.82882727222, 23.33091211620357)

 SVR() 

Different models success rate : (4518816877.12214, 67222.14573429013, -6.164669051161864)

 RandomForestRegressor() 

Different models success rate : (2641608982.098438, 51396.58531554833, 37.93828097639968)

 GradientBoostingRegressor() 

Different models success rate : (2250212944.359357, 47436.409480054, 47.13370357138847)


# Hyperparameter Tuning of Gradient Boosting

In [70]:
from sklearn.ensemble import GradientBoostingRegressor

In [71]:
gbr_best = GradientBoostingRegressor()

In [72]:
search_grid={ 'n_estimators': [500], 'learning_rate':[0.001,0.01,1],
            'max_depth':[1,2,4], 'subsample':[0.5,0.75,1], 'random_state' : [1]}

search = GridSearchCV(estimator = gbr_best, param_grid = search_grid,
                     scoring = 'neg_mean_squared_error', cv = 2, n_jobs = 1)

In [73]:
search.fit(X_train, y_train)
print(search.best_params_)


{'learning_rate': 1, 'max_depth': 1, 'n_estimators': 500, 'random_state': 1, 'subsample': 1}


In [74]:
gbm_best=GradientBoostingRegressor(learning_rate = 1, max_depth = 1, n_estimators = 3000, random_state = 1, subsample = 1).fit(X_train,y_train)

y_pred=gbm_best.predict(X_test)
mse=mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)*100


print("\n GradientBoostingRegressor \n\nDifferent models success rate :", r2)


 GradientBoostingRegressor 

Different models success rate : 48.2440120070609


# Saving Model

In [76]:
import pickle

file = open('Salary_prediction_model.pkl', 'wb')
pkl.dump(gbm_best, file)

# Load the saved model
model = pickle.load(open('Salary_prediction_model.pkl', 'rb'))