In [38]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
# SCIKIT-LEARN / GRADIENT-BOOSTING IMPORTS (SPLITTING, METRICS, PREPROCESSING, MODELS)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [16]:
# Survey fields kept for the analysis. The final entry, "ConvertedCompYearly",
# is the column later used as the regression target.
columns = [
    "EdLevel", "Country", "AIBen", "WorkExp",
    "PurchaseInfluence", "YearsCode", "YearsCodePro", "Industry",
    "DevType", "OrgSize", "RemoteWork",
    "ConvertedCompYearly",
]

In [17]:
def shorten_categories(categories, cutoff):
    """Build a label -> label mapping that folds rare categories into 'Other'.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label (e.g. the result of
        ``Series.value_counts()``).
    cutoff : number
        Minimum count a label needs in order to be kept as-is.

    Returns
    -------
    dict
        Every label in ``categories.index`` mapped to itself when its count is
        ``>= cutoff`` and to the string ``'Other'`` otherwise.
    """
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in categories.items()
    }

In [18]:
# Load the survey export, keep only rows with a known ConvertedCompYearly,
# restrict to the fields listed in `columns`, then drop every row that still
# has a missing value in any of them.
df1 = (
    pd.read_csv("Datasets/survey_results_public.csv")
    .loc[lambda frame: frame["ConvertedCompYearly"].notnull(), columns]
    .dropna()
)

# Countries with fewer than 400 remaining rows are relabelled 'Other'.
country_map = shorten_categories(df1.Country.value_counts(), 400)
df1['Country'] = df1['Country'].map(country_map)
# currency_map = shorten_categories(df1.Currency.value_counts(), 400)
# df1["Currency"] = df1["Currency"].map(currency_map)

In [79]:
# Sanity check: show the largest ConvertedCompYearly value currently in df1.
max(df1["ConvertedCompYearly"])

74351432.0

In [19]:
# Keep rows whose yearly compensation lies in the closed range [100, 250000]
# and whose country was not folded into 'Other'.
df1 = df1[
    df1["ConvertedCompYearly"].between(100, 250000)
    & (df1['Country'] != 'Other')
]

In [8]:
# Persist the filtered frame to "Filtered Dataset.csv" in the working directory.
# No `index` argument is passed, so pandas' default index handling applies.
df1.to_csv("Filtered Dataset.csv")

In [20]:
# Label-encode every predictor column on an independent copy of the filtered
# frame. Using `df1.copy()` instead of aliasing `df1` keeps `df1` in its
# un-encoded form, so this cell can be re-run safely and pandas is never asked
# to write into a frame that was derived from earlier boolean filtering.
# NOTE(review): WorkExp / YearsCode / YearsCodePro look like quantities;
# label-encoding replaces their values with integer codes - confirm that this
# is intended rather than a numeric conversion.
df_LE = df1.copy()
for col in columns:
    if col == "ConvertedCompYearly":
        # The regression target keeps its numeric values; only predictors are encoded.
        continue
    le = LabelEncoder()
    df_LE[col] = le.fit_transform(df_LE[col])

In [31]:
# Predictors used by the models (a subset of the encoded columns) and the
# regression target.
feature_names = [
    "EdLevel", "Country", "WorkExp", "YearsCode",
    "YearsCodePro", "DevType", "OrgSize",
]
X = df_LE.loc[:, feature_names]

Y = df_LE.loc[:, "ConvertedCompYearly"]

In [32]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [33]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
# Rebind the "*_scaled" names to the unscaled splits.
# NOTE(review): this discards the StandardScaler output assigned to these
# names in the preceding cell, so the models below are fitted on unscaled
# features - confirm this is intentional.
X_train_scaled, X_test_scaled = X_train, X_test

In [35]:
# Candidate regressors keyed by the display name used in the results table.
# All models use their library-default hyperparameters.
classifiers = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "XGBoost": XGBRegressor(),
    # Disabled candidates:
    # "LightGBM": LGBMRegressor(),
    # "CatBoost": CatBoostRegressor(silent=True),
}

In [36]:
# Fit every regressor in `classifiers`, score it on the held-out split, and
# collect one row of metrics per model.
#
# The result column names are stored in `result_columns` rather than `columns`
# so that the feature list named `columns` defined earlier in the notebook is
# not overwritten (overwriting it breaks any re-run of the cells that select
# or encode those columns).
result_columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']

# Rows are accumulated in a plain list and turned into a DataFrame once,
# instead of concatenating onto an empty DataFrame on every iteration.
records = []

# Loop through your regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    # RMSE is derived from MSE directly; the `squared=False` keyword of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    records.append({
        'Model': key,
        # Elapsed wall-clock time covers fitting, prediction and metric computation.
        'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
    })

df_models = pd.DataFrame(records, columns=result_columns)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)


              Model  Run Time (minutes)           MAE           MSE  \
3           XGBoost                0.00  26985.205298  1.361976e+09   
2      RandomForest                0.03  27259.742823  1.376934e+09   
0  LinearRegression                0.00  36204.631622  2.100329e+09   
1      DecisionTree                0.00  36841.959942  2.646121e+09   

           RMSE        R2  
3  36904.963371  0.589967  
2  37107.063535  0.585464  
0  45829.342248  0.367681  
1  51440.464748  0.203366  


In [18]:
# Frequency of each distinct ConvertedCompYearly value in the encoded frame.
# NOTE(review): the recorded output below lists small integer-looking values,
# while a later cell reports a maximum of 250000.0 for the same column - the
# saved output may be stale from an earlier run; re-execute to confirm.
df_LE["ConvertedCompYearly"].value_counts()

ConvertedCompYearly
2161    212
1871    167
3501    163
2416    155
2635    152
       ... 
672       1
3726      1
1401      1
3698      1
858       1
Name: count, Length: 3755, dtype: int64

In [37]:
# Sanity check: show the largest ConvertedCompYearly value in the encoded frame.
max(df_LE["ConvertedCompYearly"])

250000.0

In [39]:
#CHECK IMPORTANCE
# Fit a default random forest on the full X / Y (not only the training split)
# and list each predictor's importance in ascending order.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, Y)
feature_importances = model.feature_importances_

# Pair every column name of X with its importance score.
map_ = dict(zip(X.columns, feature_importances))

sorted(map_.items(), key=lambda pair: pair[1])

[('EdLevel', 0.0433210615599559),
 ('YearsCodePro', 0.06852856958122609),
 ('OrgSize', 0.07044839677350732),
 ('DevType', 0.07751551867310329),
 ('YearsCode', 0.09431940568586594),
 ('WorkExp', 0.17765718199858985),
 ('Country', 0.4682098657277517)]