In [81]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
import pickle

# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [82]:
#READ THE DATA FILE
df1 = pd.read_csv("Datasets/survey_results_public.csv")

In [83]:
#ADD COLUMNS HERE

#Employment Column Additions
column_name = "Employment"
st = set()
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        st.add(ele)
for ele in st:
    df1[ele] = 0
    
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        df1.loc[i,ele] = 1

#Countries filtered out if not India
df1 = df1[df1["Industry"] == "Information Services, IT, Software Development, or other Technology"]
#Industries filtered out if not Information Services, IT, Software Development, or other Technology
df1 = df1[df1["Country"] == "India"]

#Number of Languages known
column_name = "LanguageHaveWorkedWith"
def languagecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfLanguagesKnown"] = df1.apply(languagecount,axis = 1)

# Number of Databases known
column_name = "DatabaseHaveWorkedWith"
def languagecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfDatabasesKnown"] = df1.apply(languagecount,axis = 1)

#Number of Languages known
column_name = "PlatformHaveWorkedWith"
def platformcount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfPlatformsKnown"] = df1.apply(platformcount,axis = 1)

#Number of Languages known
column_name = "WebframeHaveWorkedWith"
def webframecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfWebframesKnown"] = df1.apply(webframecount,axis = 1)

#Number of Languages known
column_name = "MiscTechHaveWorkedWith"
def misctechcount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfMiscTechsKnown"] = df1.apply(misctechcount,axis = 1)

#Number of Operating Systems familiar with
column_name = "OpSysPersonal use"
def opsyscount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfOpSys"] = df1.apply(opsyscount,axis = 1)

#Number of Sources from which Coding was learnt
column_name = "LearnCode"
def learncodecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfLearningSources"] = df1.apply(learncodecount,axis = 1)

# df1 = df1[df1["Currency"] == "INR\tIndian rupee"]

In [84]:
columns_selected =  [
   'Age',
   'AISelect',
   'OrgSize',
   'DevType',
   'YearsCode',
   'WorkExp', 
   'YearsCodePro',
   "RemoteWork",
   'Currency',
   "EdLevel",
   "ConvertedCompYearly",
   "NumberOfDatabasesKnown",
   "NumberOfLanguagesKnown",
   # "NumberOfPlatformsKnown",
   # "NumberOfWebframesKnown",
   # "NumberOfMiscTechsKnown",
   # "NumberOfOpSys",
   "NumberOfLearningSources"
]

train_columns = [
   'Age',
   'AISelect',
   'OrgSize',
   'DevType',
   "RemoteWork",
   'Currency',
   "EdLevel",
   "ExperienceCategory",
   "YearsCodeCategory",
   "YearsCodeProCategory",
   "NumberOfDatabasesKnown",
   "NumberOfLanguagesKnown",
   # "NumberOfPlatformsKnown",
   # "NumberOfWebframesKnown",
   # "NumberOfMiscTechsKnown",
   # "NumberOfOpSys",
   "NumberOfLearningSources"
]

df1 = df1[columns_selected]

df1 = df1.dropna()

In [85]:
#CATEGORISE COLUMNS  INTO MAJORITY VALUES AND 'OTHER'
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map


currency_map = shorten_categories(df1.Currency.value_counts(), 400)
df1['Currency'] = df1['Currency'].map(currency_map)

In [86]:
#CATEGORISE THE WORK EXPERIENCE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1['ExperienceCategory'] = pd.cut(df1['WorkExp'], bins=bins, labels=labels)

In [87]:
#CATEGORISE LESS THAN 1 YEAR AS 0 AND MORE THAN 50 AS 51 FOR YEARS OF CODE
df1['YearsCode'] = df1['YearsCode'].replace("Less than 1 year", 0)
df1['YearsCode'] = df1['YearsCode'].replace("More than 50 years", 51)

df1['YearsCodePro'] = df1['YearsCodePro'].replace("Less than 1 year", 0)
df1['YearsCodePro'] = df1['YearsCodePro'].replace("More than 50 years", 51)

In [88]:
#CATEGORISE YEARS OF CODE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1["YearsCode"] = df1["YearsCode"].astype(int)
df1["YearsCodePro"] = df1["YearsCodePro"].astype(int)
df1['YearsCodeCategory'] = pd.cut(df1['YearsCode'], bins=bins, labels=labels)
df1['YearsCodeProCategory'] = pd.cut(df1['YearsCodePro'], bins=bins, labels=labels)

In [89]:
#LABEL ENCODE THE COLUMNS
label_encoders = {}
df_LE = df1.copy()
df_LE = df_LE.dropna()

for i in train_columns:
    if i == "ConvertedCompYearly":
        continue
        
    label_encoders[i] = LabelEncoder()
    df_LE[i] = label_encoders[i].fit_transform(df_LE[i])
    
    
X = df_LE[train_columns]

Y = df_LE["ConvertedCompYearly"]

In [90]:
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [91]:
classifiers = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=1.0),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Neural Network": MLPRegressor(),
    "Gaussian Process": GaussianProcessRegressor(),
}

In [92]:
#CHECK IMPORTANCE
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, Y)
feature_importances = model.feature_importances_

map_ = {}

for i, a in enumerate(X):
    map_[a] = feature_importances[i]
    
sorted(map_.items(), key = lambda x: x[1])

[('EdLevel', 0.027933978481814108),
 ('Currency', 0.03151135763070278),
 ('RemoteWork', 0.03352575631283625),
 ('Age', 0.03903558581202177),
 ('YearsCodeProCategory', 0.04924458092925605),
 ('AISelect', 0.05470236035142072),
 ('NumberOfDatabasesKnown', 0.07575271658463671),
 ('NumberOfLearningSources', 0.07662352199892539),
 ('YearsCodeCategory', 0.0805929843020911),
 ('OrgSize', 0.10229964251405518),
 ('DevType', 0.11445550620297819),
 ('ExperienceCategory', 0.14850525947982293),
 ('NumberOfLanguagesKnown', 0.16581674939943877)]

In [93]:
# Create an empty DataFrame to store the results
columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']
df_models = pd.DataFrame(columns=columns)

# Loop through your regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    
    #SAVE THE TRAINED MODEL
    classifiers[key] = clf
    
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)  # Calculate RMSE
    r2 = r2_score(y_test, predictions)

    row = {'Model': key,
           'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
           'MAE': mae,
           'MSE': mse,
           'RMSE': rmse,
           'R2': r2
           }

    df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)

                    Model  Run Time (minutes)           MAE           MSE  \
2                   Lasso                0.00   9358.227263  1.580163e+08   
0       Linear Regression                0.00   9357.984769  1.580255e+08   
1                   Ridge                0.00   9359.929522  1.580290e+08   
5       Gradient Boosting                0.00  10734.998613  2.097617e+08   
4           Random Forest                0.01  11255.854940  2.528811e+08   
6  Support Vector Machine                0.00  12751.953567  2.677026e+08   
9        Gaussian Process                0.00  13490.502422  3.842854e+08   
7     K-Nearest Neighbors                0.00  13362.301205  3.938724e+08   
8          Neural Network                0.01  19880.853714  6.579875e+08   
3           Decision Tree                0.00  19534.385542  1.540327e+09   

           RMSE        R2  
2  12570.453953  0.409733  
0  12570.817783  0.409699  
1  12570.957300  0.409686  
5  14483.152259  0.216439  
4  15902.235



In [95]:
#CREATE A PICKLE FILE
dictionary = {}

dictionary["LABEL_ENCODERS"] = label_encoders

dictionary["MODEL"] = classifiers["Linear Regression"]


with open("model_3.pkl", "wb") as file:
    pickle.dump(dictionary, file)
