In [2]:
from lussi.stackoverflow import *
from lussi.ziprecruiter import *

"""
Run like this: 
(.venv) hurricane:krijudato afraser$ python ./src/lussi/run.py
"""

# run this to build out your caches.
nogit_data_dir = "622data_nogit"
#build_stack(data_dir=nogit_data_dir)
#build_zip(data_dir=nogit_data_dir)

# this is how you load.
raw_stack  = load_stack(data_dir = nogit_data_dir, stack_type=StackType.RAW)
wide_stack = load_stack(data_dir = nogit_data_dir, stack_type=StackType.WIDE)
ziprecruiter = load_zip(data_dir = nogit_data_dir)

#print(raw_stack.head())
#print(wide_stack.head())
#print(ziprecruiter.head())

In [26]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# encoding / preprocessing
# filter for US only
us_data = wide_stack[wide_stack['Country'] == 'United States']

# filter for only 2022
us_data = us_data[us_data['Year'] == 2022]

# Function to categorize gender based on the provided rules
def group_gender(gender):
    gender = str(gender).lower()  # Convert to lowercase 
    if 'female' in gender or 'woman' in gender:
        return 'Female'
    elif 'male' in gender or 'man' in gender:
        return 'Male'
    else:
        return 'Other'

# Apply the function to the 'Gender' column to create 'gender_grouped'
us_data.loc[:, 'gender_grouped'] = us_data['Gender'].apply(group_gender)

# columns to drop 
cols_to_drop = ['Year', 'OrgSize', 'Country', 'Employment', 'Gender', 'EdLevel',
       'US_State', 'Age', 'DevType', 'Sexuality', 'Ethnicity',
       'DatabaseWorkedWith', 'LanguageWorkedWith', 'PlatformWorkedWith',
       'YearsCodePro','python', 'sql', 'java', 'javascript', 'ruby', 'php', 'c',
       'swift', 'scala', 'r', 'rust', 'julia',  'microsoftsqlserver',
       'mongodb', 'postgresql', 'oracle', 'ibmdb2', 'redis', 'sqlite',
       'mariadb', 'microsoftazure', 'googlecloud', 'ibmcloudorwatson',
       'kubernetes', 'linux', 'windows']

# new dataframe with columns dropped
df = us_data.drop(columns=cols_to_drop)

df_new=df

In [40]:
# XGboost

# Convert categorical columns to 'category' dtype
#categorical_cols = ['mysql', 'aws', 'sexuality_grouped', 'ethnicity_grouped', 'gender_grouped']  # Add any other categorical columns you have
#for col in categorical_cols:
#    df_new[col] = df_new[col].astype('category')

# define variables - select all variables except the target variable
X = df_new.drop(columns=['AnnualSalary'])

# define target variable
Y = df_new['AnnualSalary']

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

# Identify all boolean columns
bool_cols = X_encoded.select_dtypes(include=['bool']).columns

# Convert boolean columns to integers (True -> 1, False -> 0)
X_encoded[bool_cols] = X_encoded[bool_cols].astype(int)

# Split the dataset into 80% training and 20% testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y, test_size=0.2, random_state=42)

# Impute missing values with the mean 
Y_train_imputed = Y_train.fillna(Y_train.mean()) 

# Impute numeric columns using 'mean' strategy
numeric_cols = ['YearsCodeProAvg', 'OrgSizeAvg', 'AgeAvg']
numeric_imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

# Applied StandardScalar to numerical features
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Select the columns to scale
columns_to_scale = ['YearsCodeProAvg', 'OrgSizeAvg', 'AgeAvg']

# Fit and transform the scaler
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

In [41]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error as MSE
# Instantiation 
xgb_r = xgb.XGBRegressor(objective='reg:squarederror', 
                         n_estimators=100, 
                         learning_rate=0.1, 
                         max_depth=6, 
                         enable_categorical=True,  # Enable categorical feature handling
                         seed=123) 
  
# Fitting the model 
xgb_r.fit(X_train, Y_train_imputed) 
  
# Predict the model 
pred = xgb_r.predict(X_test) 

# Assuming Y_test is a pandas Series
Y_test_imputed = Y_test.copy()  # Create a copy to avoid modifying the original
Y_test_imputed.fillna(Y_test.mean(), inplace=True)  # Mean imputation

# compute RMSE
rmse = np.sqrt(mean_squared_error(Y_test_imputed, pred)) 
print("RMSE : % f" % (rmse))

RMSE :  938992.816527


In [42]:
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': [0.001, 0.01, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'min_child_weight': [1, 5, 10]
}

# Instantiate the model and RandomizedSearchCV
xgb_r = xgb.XGBRegressor(objective='reg:squarederror', seed=123)
random_search = RandomizedSearchCV(estimator=xgb_r, param_distributions=param_grid, n_iter=50, scoring='neg_mean_squared_error', cv=5, verbose=1, random_state=42)

# Fit the random search model
random_search.fit(X_train, Y_train_imputed)
print(f"Best parameters: {random_search.best_params_}")

# Predict using the best model
best_model = random_search.best_estimator_
pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(Y_test_imputed, pred))
print("Optimized RMSE: % f" % (rmse))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 0.001, 'max_depth': 6, 'min_child_weight': 10, 'n_estimators': 238, 'subsample': 1.0}
Optimized RMSE:  900341.084831


In [44]:
from sklearn.metrics import r2_score

# Predict the model on the test data
pred = random_search.predict(X_test)

# Compute the R² score
r2 = r2_score(Y_test_imputed, pred)
print("R²: %f" % r2)

R²: -0.001739
