In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("./data/2021.csv")

df.columns = df.columns.str.strip()

df = df[["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedCompYearly" , "Gender" , "DevType" , "OrgSize" , "Age1stCode","LearnCode","YearsCode","LanguageHaveWorkedWith" , "DatabaseHaveWorkedWith" , "PlatformHaveWorkedWith" , "OpSys"]]
df = df.rename({"ConvertedCompYearly": "Salary"}, axis=1)

def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

def clean_organization(x):
    if '2 to 9 employees' in x or '10 to 19 employees' in x or 'Fewer than 10 employees' in x:
        return 'Micro'
    if '100 to 499 employees' in x or '20 to 99 employees' in x :
        return 'Small'
    if '500 to 999 employees' in x  :
        return 'Medium'
    if '1,000 to 4,999 employees' in x:
        return 'Large'
    if '10,000 or more employees' in x or '5,000 to 9,999 employees' in x:
        return 'Enterprise'
    return 'Self Employed'

df = df[df["OrgSize"].notnull()]
df['OrgSize'] = df['OrgSize'].apply(clean_organization) 
df['OrgSize'].value_counts()

df = df.loc[df['Gender'].isin(['Man' , 'Woman'])]

country_map = shorten_categories(df.DevType.value_counts(), 200)
df["DevType"] = df["DevType"].map(country_map)
df['DevType'] = df['DevType'].str.split(';')
df = df.explode('DevType')
df = df[df["DevType"].notnull()]

def clean_types(x):
    if 'Developer, full-stack'  in x :
        return 'Full Stack Developer'
    if 'Developer, back-end' in x  :
        return 'Back-End Developer'
    if 'Developer, front-end' in x :
        return 'Front-End Developer'
    if 'Developer, mobile' in x:
        return 'Mobile Developer'
    return x

df['DevType'] = df['DevType'].apply(clean_types)
df = df[df["DevType"] != "Other"]
df = df[df["DevType"] != "Other (please specify):"]
df['DevType'].value_counts()

def clean_experience(x):
    if x ==  'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

def clean_education(x):
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' in x or 'Other doctoral' in x:
        return 'Post grad'
    return 'Less than a Bachelors'

df = df[df["EdLevel"].notnull()]
df['EdLevel'] = df['EdLevel'].apply(clean_education)
df['EdLevel'].value_counts()

df = df[df["Age1stCode"].notnull()]
df['Age1stCode'].value_counts()

def clean_learn_code(x): 
    if 'Books / Physical media' in x : 
        return 'Books'
    
    if 'Other online resources (ex: videos, blogs, etc)' in x :
        return 'Online resources (Videos , blogs)'
    return x

learn_code_map = shorten_categories(df.LearnCode.value_counts(), 100)
df["LearnCode"] = df["LearnCode"].map(learn_code_map)
df['LearnCode'] = df['LearnCode'].str.split(';')
df = df.explode('LearnCode')
df = df[df["LearnCode"].notnull()]
df = df[df['LearnCode'] != 'Other']
df = df[df['LearnCode'] != 'Other (please specify):']
df['LearnCode'] = df['LearnCode'].apply(clean_learn_code)
df["LearnCode"].value_counts()

def clean_years_coding(x):
    if x ==  'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCode'] = df['YearsCode'].apply(clean_years_coding)
df = df[df["YearsCode"].notnull()]

program_lang_map = shorten_categories(df.LanguageHaveWorkedWith.value_counts(), 500)
df["LanguageHaveWorkedWith"] = df["LanguageHaveWorkedWith"].map(program_lang_map)
df['LanguageHaveWorkedWith'] = df['LanguageHaveWorkedWith'].str.split(';')
df = df.explode('LanguageHaveWorkedWith')
df = df[df["LanguageHaveWorkedWith"].notnull()]
df = df[df['LanguageHaveWorkedWith'] != 'Other']
df["LanguageHaveWorkedWith"].value_counts()

database_lang_map = shorten_categories(df.DatabaseHaveWorkedWith.value_counts(), 1000)
df["DatabaseHaveWorkedWith"] = df["DatabaseHaveWorkedWith"].map(database_lang_map)
df['DatabaseHaveWorkedWith'] = df['DatabaseHaveWorkedWith'].str.split(';')
df = df.explode('DatabaseHaveWorkedWith')
df = df[df["DatabaseHaveWorkedWith"].notnull()]
df = df[df['DatabaseHaveWorkedWith'] != 'Other']
df["DatabaseHaveWorkedWith"].value_counts()

platform_lang_map = shorten_categories(df.PlatformHaveWorkedWith.value_counts(), 500)
df["PlatformHaveWorkedWith"] = df["PlatformHaveWorkedWith"].map(platform_lang_map)
df['PlatformHaveWorkedWith'] = df['PlatformHaveWorkedWith'].str.split(';')
df = df.explode('PlatformHaveWorkedWith')
df = df[df["PlatformHaveWorkedWith"].notnull()]
df = df[df['PlatformHaveWorkedWith'] != 'Other']
df["PlatformHaveWorkedWith"].value_counts()

ops_map = shorten_categories(df.OpSys.value_counts(), 50)
df["OpSys"] = df["OpSys"].map(ops_map)
# df['OpSys'] = df['OpSys'].str.split(';')
# df = df.explode('OpSys')
df = df[df["OpSys"].notnull()]
df = df[df['OpSys'] != 'Other']
df = df[df['OpSys'] != 'Other (please specify):']
df["OpSys"].value_counts()

df = df[df["Salary"].notnull()]

df = df.dropna()
df.isnull().sum()

df = df[df["Employment"] == "Employed full-time"]
df = df.drop("Employment", axis=1)

country_map = shorten_categories(df.Country.value_counts(), 200)
df['Country'] = df['Country'].map(country_map)

df = df[df["Salary"] <= 250000]
df = df[df["Salary"] >= 10000]
df = df[df['Country'] != 'Other']

from sklearn.preprocessing import LabelEncoder
le_education = LabelEncoder()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])
df["EdLevel"].unique()

le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

le_devtype = LabelEncoder()
df['DevType'] = le_devtype.fit_transform(df['DevType'])

le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])

le_orgsize = LabelEncoder()
df['OrgSize'] = le_orgsize.fit_transform(df['OrgSize'])

le_Age1stCode = LabelEncoder()
df['Age1stCode'] = le_Age1stCode.fit_transform(df['Age1stCode'])

le_LearnCode = LabelEncoder()
df['LearnCode'] = le_LearnCode.fit_transform(df['LearnCode'])

le_YearsCode = LabelEncoder()
df['YearsCode'] = le_YearsCode.fit_transform(df['YearsCode'])

le_LanguageHaveWorkedWith = LabelEncoder()
df['LanguageHaveWorkedWith'] = le_LanguageHaveWorkedWith.fit_transform(df['LanguageHaveWorkedWith'])

le_DatabaseHaveWorkedWith = LabelEncoder()
df['DatabaseHaveWorkedWith'] = le_DatabaseHaveWorkedWith.fit_transform(df['DatabaseHaveWorkedWith'])

le_PlatformHaveWorkedWith = LabelEncoder()
df['PlatformHaveWorkedWith'] = le_PlatformHaveWorkedWith.fit_transform(df['PlatformHaveWorkedWith'])

le_OpSys = LabelEncoder()
df['OpSys'] = le_OpSys.fit_transform(df['OpSys'])

X = df.drop("Salary", axis=1)
y = df["Salary"]


TRAINING DATA

In [3]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=4)

METRICS

In [4]:
from sklearn.metrics import  r2_score, mean_squared_error, mean_absolute_error

LINEAR REGRESSION

In [5]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
# linear_reg.fit(X_train, y_train)
linear_reg.fit(X_train, y_train)
# y_pred = linear_reg.predict(X_test)
y_pred = linear_reg.predict(X_test)

linear_reg_r2_score = r2_score(y_test, y_pred)
linear_reg_ab_error = mean_absolute_error(y_test, y_pred)
linear_reg_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))

print("r2-score: %2f" % (linear_reg_r2_score))
print("Mean absolute error: %2f" % (linear_reg_ab_error))
print("Mean squared error: %2f" % (linear_reg_sqrt_error))

r2-score: 0.428817
Mean absolute error: 23635.676371
Mean squared error: 30674.691306


DECISION TREE REGRESSION

In [6]:
from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X_train, y_train)
y_pred = dec_tree_reg.predict(X_test)

In [7]:
dec_tree_reg_ab_error = mean_absolute_error(y_test, y_pred)
dec_tree_reg_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))
# score = r2_score(y_test, y_pred)
dec_tree_reg_r2_score = r2_score(y_test, y_pred)
# dec_tree_reg_r2_score
print("Mean absolute error: %2f" % (dec_tree_reg_ab_error))
print("Mean squared error: %2f" % (dec_tree_reg_sqrt_error))
print("r2 score: %2f" % (dec_tree_reg_r2_score))

Mean absolute error: 59.787028
Mean squared error: 1270.900314
r2 score: 0.999020


RANDOM FOREST REGRESSION

In [8]:
from sklearn.ensemble import RandomForestRegressor
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X_train, y_train.values)
y_pred = random_forest_reg.predict(X_test)


In [9]:
# score = r2_score(y_test, y_pred)
random_forest_reg_r2_score = r2_score(y_test, y_pred)
# random_forest_reg_r2_score
random_forest_reg_ab_error = mean_absolute_error(y_test, y_pred)
random_forest_reg_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))

print("Mean absolute error: %2f" % (random_forest_reg_ab_error))
print("Mean squared error: %2f" % (random_forest_reg_sqrt_error))
print("r2 score: %2f" % (random_forest_reg_r2_score))

Mean absolute error: 259.327686
Mean squared error: 2045.356980
r2 score: 0.997460


SHOW ACTUAL VALUES AND PREDICTED VALUES

In [10]:
check_df= pd.DataFrame(data={"actual values" : y_test,
                       "predicted values" : y_pred})
check_df

Unnamed: 0,actual values,predicted values
8345,120000.0,120000.00
74986,200000.0,200000.00
73308,28437.0,28437.00
22896,11169.0,13410.18
71413,105000.0,105000.00
...,...,...
7129,78371.0,77955.20
74305,88000.0,88000.00
56674,69800.0,69800.00
71890,105000.0,105000.00


GRID SEARCH CV

In [11]:
from sklearn.model_selection import GridSearchCV

max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X_train, y_train.values)
regressor = gs.best_estimator_

regressor.fit(X_train, y_train.values)
y_pred = regressor.predict(X_test)

In [12]:
gs_ab_error = mean_absolute_error(y_test, y_pred)
gs_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))
gs_score = r2_score(y_test, y_pred)
print("Mean absolute error: %2f" % (gs_ab_error))
print("Mean squared error: %2f" % (gs_sqrt_error))
print("r2 score: %2f" % (gs_score))

Mean absolute error: 59.787028
Mean squared error: 1270.900314
r2 score: 0.999020


XG BOOST REGRESSOR 

In [13]:
from xgboost import XGBRegressor
xg_reg = XGBRegressor()
xg_reg.fit(X_train,y_train.values)
y_pred = xg_reg.predict(X_test)

In [14]:
xg_reg_ab_error = mean_absolute_error(y_test, y_pred)
xg_reg_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))
xg_reg_score = r2_score(y_test, y_pred)
print("Mean absolute error: %2f" % (xg_reg_ab_error))
print("Mean squared error: %2f" % (xg_reg_sqrt_error))
print("r2 score: %2f" % (xg_reg_score))

Mean absolute error: 232.373495
Mean squared error: 1614.796137
r2 score: 0.998417


GRADIENT BOOSTINB REGRESSOR 

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
grad_boost = GradientBoostingRegressor()
grad_boost.fit(X_train, y_train.values)
y_pred = grad_boost.predict(X_test)
# r2_score(y_test, y_pred)

In [16]:
grad_boost_ab_error = mean_absolute_error(y_test, y_pred)
grad_boost_sqrt_error = np.sqrt(mean_squared_error(y_test, y_pred))
grad_boost_score = r2_score(y_test, y_pred)
print("Mean absolute error: %2f" % (grad_boost_ab_error))
print("Mean squared error: %2f" % (grad_boost_sqrt_error))
print("r2 score: %2f" % (grad_boost_score))

Mean absolute error: 7346.468705
Mean squared error: 11829.423658
r2 score: 0.915054


EXPLORING TO PICKLE FILE

In [17]:
X = np.array([["United States of America", 'Master’s degree', 15 , "Man" , "Full Stack Developer" , "Small" , "11 - 17 years" , "Books" , "5" , "HTML/CSS" , "Microsoft SQL Server" , "AWS" , "Windows"]])

X[:, 0] = le_country.transform(X[:,0])
X[:, 1] = le_education.transform(X[:,1])
X[:, 3] = le_gender.transform(X[:,3])
X[:, 4] = le_devtype.transform(X[:,4])
X[:, 5] = le_orgsize.transform(X[:,5])
X[:, 6] = le_Age1stCode.transform(X[:,6])
X[:, 7] = le_LearnCode.transform(X[:,7])
# X[:, 8] = le_YearsCode.transform(X[:,8])
X[:, 9] = le_LanguageHaveWorkedWith.transform(X[:,9])
X[:, 10] = le_DatabaseHaveWorkedWith.transform(X[:,10])
X[:, 11] = le_PlatformHaveWorkedWith.transform(X[:,11])
X[:, 12] = le_OpSys.transform(X[:,12])
X = X.astype(float)
# y_pred = regressor.predict(X)
import pickle
data = {"model": regressor, "le_country": le_country, "le_education": le_education , "le_gender": le_gender , "le_devtype": le_devtype , "le_orgsize" : le_orgsize , "le_Age1stCode":le_Age1stCode , "le_LearnCode" : le_LearnCode , "le_YearsCode":le_YearsCode , 'le_LanguageHaveWorkedWith' : le_LanguageHaveWorkedWith ,
"le_DatabaseHaveWorkedWith":le_DatabaseHaveWorkedWith ,
"le_PlatformHaveWorkedWith" : le_PlatformHaveWorkedWith ,
"le_OpSys":le_OpSys
        }
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [18]:
import numpy as np
import joblib

# Sample input
X = np.array([[
    "United States of America", 'Master’s degree', 15, "Man",
    "Full Stack Developer", "Small", "11 - 17 years", "Books", "5",
    "HTML/CSS", "Microsoft SQL Server", "AWS", "Windows"
]])

# Encode the inputs
X[:, 0] = le_country.transform(X[:, 0])
X[:, 1] = le_education.transform(X[:, 1])
X[:, 3] = le_gender.transform(X[:, 3])
X[:, 4] = le_devtype.transform(X[:, 4])
X[:, 5] = le_orgsize.transform(X[:, 5])
X[:, 6] = le_Age1stCode.transform(X[:, 6])
X[:, 7] = le_LearnCode.transform(X[:, 7])
# X[:, 8] = le_YearsCode.transform(X[:, 8])  # Skipped as in your original code
X[:, 9] = le_LanguageHaveWorkedWith.transform(X[:, 9])
X[:, 10] = le_DatabaseHaveWorkedWith.transform(X[:, 10])
X[:, 11] = le_PlatformHaveWorkedWith.transform(X[:, 11])
X[:, 12] = le_OpSys.transform(X[:, 12])

X = X.astype(float)

# Save the model and encoders using joblib
data = {
    "model": regressor,
    "le_country": le_country,
    "le_education": le_education,
    "le_gender": le_gender,
    "le_devtype": le_devtype,
    "le_orgsize": le_orgsize,
    "le_Age1stCode": le_Age1stCode,
    "le_LearnCode": le_LearnCode,
    "le_YearsCode": le_YearsCode,
    "le_LanguageHaveWorkedWith": le_LanguageHaveWorkedWith,
    "le_DatabaseHaveWorkedWith": le_DatabaseHaveWorkedWith,
    "le_PlatformHaveWorkedWith": le_PlatformHaveWorkedWith,
    "le_OpSys": le_OpSys
}

joblib.dump(data, 'saved_steps.joblib')


['saved_steps.joblib']