In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("v2_Latest_Data_Science_Salaries.csv")

In [3]:
df.head()

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,Machine Learning Manager,Full-Time,Senior,Expert,120000,Euro,United States,129562,Germany,Large,2023
1,BI Data Analyst,Full-Time,Entry,Junior,50000,United States Dollar,Kenya,50000,Kenya,Small,2023
2,AI Engineer,Full-Time,Senior,Expert,227850,United States Dollar,United States,227850,United States,Medium,2023
3,AI Engineer,Full-Time,Senior,Expert,180500,United States Dollar,United States,180500,United States,Medium,2023
4,Data Analyst,Full-Time,Mid,Intermediate,90000,United States Dollar,United States,90000,United States,Medium,2023


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3470 entries, 0 to 3469
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           3470 non-null   object
 1   Employment Type     3470 non-null   object
 2   Experience Level    3470 non-null   object
 3   Expertise Level     3470 non-null   object
 4   Salary              3470 non-null   int64 
 5   Salary Currency     3470 non-null   object
 6   Company Location    3470 non-null   object
 7   Salary in USD       3470 non-null   int64 
 8   Employee Residence  3470 non-null   object
 9   Company Size        3470 non-null   object
 10  Year                3470 non-null   int64 
dtypes: int64(3), object(8)
memory usage: 298.3+ KB


In [5]:
df.drop("Salary in USD",axis=1,inplace=True)

In [6]:
X = df.drop(["Salary"],axis=1)
y = df["Salary"].astype("float64")

In [33]:
y.head()

0    120000.0
1     50000.0
2    227850.0
3    180500.0
4     90000.0
Name: Salary, dtype: float64

In [7]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [8]:
def col_div(df):
    cat_col = []
    num_col = []
    for x in df.columns:
        if df[x].dtype == "object":
            cat_col.append(x)
        else:
            num_col.append(x)
    return cat_col,num_col
cat_col,num_col = col_div(X)

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [10]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [11]:
processor = ColumnTransformer([
    ("num_col", num_pipeline, num_col),
    ("cat_col", cat_pipeline, cat_col)
])

In [12]:
X_train = processor.fit_transform(X_train)
X_test = processor.transform(X_test)

In [13]:
# mod = RandomForestRegressor()
# mod.fit(X_train,y_train)
# y_test_pred = mod.predict(X_test)
# print(mod.score(X_train,y_train))

In [14]:
# r2_score(y_test_pred,y_test)

In [15]:
models = {"RandomForestRegressor":RandomForestRegressor(),'Decision Tree':DecisionTreeRegressor(),'SVR':SVR()}

In [16]:
for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_pred=y_pred,y_true=y_test)
    print(name, ": ", acc)

RandomForestRegressor :  0.8422294499419642
Decision Tree :  0.7885810513566788
SVR :  -0.009844738083811366


In [17]:
regressor = RandomForestRegressor()
pram = {'max_depth':[3,5,10,None],
              'n_estimators':[100,200,300],
               'criterion':['poisson','friedman_mse','absolute_error','squared_error']
              }

In [23]:
cv=RandomizedSearchCV(regressor,param_distributions=pram,scoring='r2',cv=5,verbose=3)
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=absolute_error, max_depth=5, n_estimators=300;, score=0.687 total time= 3.5min
[CV 2/5] END criterion=absolute_error, max_depth=5, n_estimators=300;, score=0.922 total time= 3.5min
[CV 3/5] END criterion=absolute_error, max_depth=5, n_estimators=300;, score=0.048 total time= 3.1min
[CV 4/5] END criterion=absolute_error, max_depth=5, n_estimators=300;, score=0.146 total time= 3.7min
[CV 5/5] END criterion=absolute_error, max_depth=5, n_estimators=300;, score=0.553 total time= 3.1min
[CV 1/5] END criterion=squared_error, max_depth=10, n_estimators=100;, score=0.691 total time=   0.4s
[CV 2/5] END criterion=squared_error, max_depth=10, n_estimators=100;, score=0.918 total time=   0.4s
[CV 3/5] END criterion=squared_error, max_depth=10, n_estimators=100;, score=0.047 total time=   0.5s
[CV 4/5] END criterion=squared_error, max_depth=10, n_estimators=100;, score=0.150 total time=   0.4s
[CV 5/5] END criterio

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'criterion': ['poisson', 'friedman_mse',
                                                      'absolute_error',
                                                      'squared_error'],
                                        'max_depth': [3, 5, 10, None],
                                        'n_estimators': [100, 200, 300]},
                   scoring='r2', verbose=3)

In [24]:
cv.best_params_

{'n_estimators': 300, 'max_depth': None, 'criterion': 'friedman_mse'}

In [25]:
cv_pred = cv.predict(X_test)

In [26]:
r2_score(cv_pred,y_test)

0.7928021815749706

In [28]:
import pickle
with open("./model/random_forest_regressor.pkl", "wb") as f:
    pickle.dump(cv,file=f)
    f.close()
with open("./model/column_transformer.pkl", "wb") as f:
    pickle.dump(processor,file=f)