In [1]:
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json
import pandas as pd
import numpy as np
import os
import sys
import pickle
import scipy.stats as stats 
import matplotlib.pyplot as plt

# Using the new data cleaning code
First, we have to go from the current directory (inside `old-model-code/experiments/`) back to the base directory.

In [2]:
os.chdir('../../')


In [3]:
os.getcwd()

'/Users/xehu/Desktop/Team Process Mapping/tpm-horse-race-modeling'

In [4]:
stageId_task =  "./data_cache/multi_task_stageId_task_cumulative.pkl"
stageId_cumulative =  "./data_cache/multi_task_stageId_cumulative.pkl"
stageId_noncumulative =  "./data_cache/multi_task_stageId_cumulative.pkl"
roundId_task =  "./data_cache/multi_task_roundId_task.pkl"
roundId_cumulative =  "./data_cache/multi_task_roundId_cumulative.pkl"

DATASETFILEPATH = stageId_noncumulative

In [5]:
with open(DATASETFILEPATH, "rb") as horseracedataset_file:
    HorseRaceData = pickle.load(horseracedataset_file)

### Key attributes of the HorseRaceDataSet
- HorseRaceData.dvs
- HorseRaceData.composition_features
- HorseRaceData.size_feature
- HorseRaceData.task_features
- HorseRaceData.task_complexity_features
- HorseRaceData.conversation_features

Here's what I wanted before:

`team_composition_features`, `task_features`, `conv_features`, `targets`

In [6]:
targets = HorseRaceData.dvs
team_composition_features = pd.concat([HorseRaceData.size_feature, HorseRaceData.composition_features], axis = 1)
task_features = pd.concat([HorseRaceData.task_features, HorseRaceData.task_complexity_features], axis = 1)
conv_features = HorseRaceData.conversation_features

# Old Cleaning code (for comparison)

In [7]:
def drop_invariant_columns(df):
    """
    Certain features are invariant throughout the training data (e.g., the entire column is 0 throughout).

    These feature obviously won't be very useful predictors, so we drop them.
    
    This function works by identifying columns that only have 1 unique value throughout the entire column,
    and then dropping them.

    @df: the dataframe containing the features (this should be X).
    """
    nunique = df.nunique()
    cols_to_drop = nunique[nunique == 1].index
    return(df.drop(cols_to_drop, axis=1))

In [8]:
def read_and_preprocess_data(path, min_num_chats):
    conv_data  = pd.read_csv(path)

    # Filter this down to teams that have at least min_num of chats
    # Can also comment this out to re-run results on *all* conversations!
    conv_data = conv_data[conv_data["sum_num_messages"] >= min_num_chats]

    # Save the important information

    # DV
    dvs = conv_data[["score","speed","efficiency","raw_duration_min","default_duration_min"]]

    # Team Composition
    composition_colnames = ['birth_year', 'CRT', 'income_max', 'income_min', 'IRCS_GS', 'IRCS_GV', 'IRCS_IB', 'IRCS_IR',
                'IRCS_IV', 'IRCS_RS', 'political_fiscal', 'political_social', 'RME', 'country', 'education_level',
                'gender', 'marital_status', 'political_party', 'race', 'playerCount']
    
    # Select columns that contain the specified keywords
    composition = conv_data[[col for col in conv_data.columns if any(keyword in col for keyword in composition_colnames)]]

    # Task
    task = conv_data[['task', 'complexity']].copy()

    task_map_path = './features/task-mapping/task_map.csv' # get task map
    task_map = pd.read_csv(task_map_path)

    task_name_mapping = {
        "Moral Reasoning": "Moral Reasoning (Disciplinary Action Case)",
        "Wolf Goat Cabbage": "Wolf, goat and cabbage transfer",
        "Guess the Correlation": "Guessing the correlation",
        "Writing Story": "Writing story",
        "Room Assignment": "Room assignment task",
        "Allocating Resources": "Allocating resources to programs",
        "Divergent Association": "Divergent Association Task",
        "Word Construction": "Word construction from a subset of letters",
        "Whac a Mole": "Whac-A-Mole"
    }
    task.loc[:, 'task'] = task['task'].replace(task_name_mapping)
    task = pd.merge(left=task, right=task_map, on = "task", how='left')
    
    # Create dummy columns for 'complexity'
    complexity_dummies = pd.get_dummies(task['complexity'])
    task = pd.concat([task, complexity_dummies], axis=1)   
    task.drop(['complexity', 'task'], axis=1, inplace=True)

    # Conversation
    conversation = conv_data.drop(columns=list(dvs.columns) + list(composition.columns) + ['task', 'complexity', 'roundId', 'gameId', 'message', 'speaker_nickname', 'conversation_num', 'timestamp'])
    conversation = drop_invariant_columns(conversation) # drop invariant conv features

    return composition, task, conversation, dvs

In [9]:
multitask_cumulative_by_stage = 'conv/multi_task_stageId_cumulative_conv.csv'
multitask_cumulative_by_stage_and_task = 'conv/multi_task_stageId_task_cumulative_conv.csv'

In [10]:
# PARAMETERS
desired_target = "score"
data_path = "./data_cache/raw_output/"
min_num_chats = 0


In [11]:
team_composition_features_old, task_features_old, conv_features_old, targets_old = read_and_preprocess_data(data_path + multitask_cumulative_by_stage_and_task, min_num_chats=min_num_chats)

# Number of points in dataset
len(conv_features)

1043

## Compare outputs across different data analysis schemas

In [12]:
targets

Unnamed: 0,score,speed,efficiency,raw_duration_min,default_duration_min,task
0,85.040573,0.000556,0.047245,3.018050,3.0,Divergent Association
1,90.113825,0.000556,0.050063,3.016283,3.0,Divergent Association
2,60.000000,0.000333,0.020000,5.017700,5.0,Sudoku
3,49.000000,0.000333,0.016333,5.019833,5.0,Sudoku
4,100.000000,11.074630,1107.462975,4.446283,5.0,Sudoku
...,...,...,...,...,...,...
1038,100.000000,0.000333,0.033333,5.017133,5.0,Wolf Goat Cabbage
1039,90.000000,0.000333,0.030000,5.041683,5.0,Moral Reasoning
1040,90.000000,0.000333,0.030000,5.033550,5.0,Moral Reasoning
1041,0.000000,0.000333,0.000000,5.021850,5.0,Wolf Goat Cabbage


In [13]:
targets_old

Unnamed: 0,score,speed,efficiency,raw_duration_min,default_duration_min
0,85.040573,0.000556,0.047245,3.018050,3
1,90.113825,0.000556,0.050063,3.016283,3
2,60.000000,0.000333,0.020000,5.017700,5
3,49.000000,0.000333,0.016333,5.019833,5
4,100.000000,11.074630,1107.462975,4.446283,5
...,...,...,...,...,...
1038,100.000000,0.000333,0.033333,5.017133,5
1039,90.000000,0.000333,0.030000,5.041683,5
1040,90.000000,0.000333,0.030000,5.033550,5
1041,0.000000,0.000333,0.000000,5.021850,5


## Multi-Task Modeling Playground (our old modeling code)

In [14]:
len(conv_features)

1043

# Set up X's and y's

In [15]:
X_train = pd.concat([team_composition_features, task_features, conv_features], axis = 1)
y_train = targets

In [16]:
def columns_with_na(df):
    """
    Check and return columns that contain NaN (NA) values in a DataFrame.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - List of column names with NaN values
    """
    # Check for NaN values in each column
    na_columns = df.columns[df.isna().any()].tolist()

    return na_columns

# Check columns that have NA
result = columns_with_na(X_train)
for colname in result:
    print(colname)

In [17]:
X_train

Unnamed: 0,playerCount,birth_year_mean,birth_year_std,CRT_mean,CRT_std,income_max_mean,income_max_std,income_min_mean,income_min_std,IRCS_GS_mean,...,Q5creativity_input_1,Q25_type6_mixed_motive,High,Low,Medium,PC1,PC2,PC3,PC4,PC5
0,3.0,0.225526,-0.252076,0.905383,-0.539541,1.116196,-1.209939,0.592396,-1.093847,-0.624234,...,1.138783,0,False,False,True,-0.961046,0.208971,0.360360,-0.242026,0.352777
1,3.0,0.225526,-0.252076,0.905383,-0.539541,1.116196,-1.209939,0.592396,-1.093847,-0.624234,...,1.138783,0,False,True,False,-0.828388,0.012341,0.374958,-0.192664,0.256179
2,3.0,0.225526,-0.252076,0.905383,-0.539541,1.116196,-1.209939,0.592396,-1.093847,-0.624234,...,-1.149402,0,False,False,True,-0.698972,-0.123808,0.476967,-0.239819,0.012060
3,3.0,0.225526,-0.252076,0.905383,-0.539541,1.116196,-1.209939,0.592396,-1.093847,-0.624234,...,-1.149402,0,True,False,False,-0.600780,-0.300674,0.498704,-0.330729,0.082295
4,3.0,0.225526,-0.252076,0.905383,-0.539541,1.116196,-1.209939,0.592396,-1.093847,-0.624234,...,-1.149402,0,False,True,False,-0.655757,-0.708767,-0.249975,-0.048535,0.295726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,6.0,0.255332,-0.211817,0.425772,-0.957431,-1.273803,-0.611657,-1.267872,-0.907071,-0.799831,...,0.642659,0,False,False,True,1.533249,0.365999,0.381123,0.532699,-1.255629
1039,6.0,0.255332,-0.211817,0.425772,-0.957431,-1.273803,-0.611657,-1.267872,-0.907071,-0.799831,...,-0.267563,0,False,False,True,0.172088,2.133682,-0.146383,-0.378335,-2.064942
1040,6.0,0.255332,-0.211817,0.425772,-0.957431,-1.273803,-0.611657,-1.267872,-0.907071,-0.799831,...,-0.267563,0,True,False,False,-0.020101,1.985832,-0.313073,-0.568680,-1.877038
1041,3.0,0.113116,-0.246717,-2.321092,-1.457852,-1.970087,-1.160550,-1.788747,-1.039401,-0.624234,...,0.642659,0,False,True,False,-1.437187,-0.087584,2.433398,1.665295,0.348474


## Try LASSO/Ridge Regression, one Set of Features at a Time

Here, we want to implement *leave-one-out cross-validation*, and use Q^2 as our metric.



Two updates to make here:

1. For nested LASSO/Ridge models, add the ability to initialize the model using the previous weights
2. Visualize importance using another library, like SHAP

In [18]:
# Note --- this uses k-fold cross-validation with k = 5 (the default)
# We are testing 10,000 different alphas, so I feel like this is an OK heuristic
def get_optimal_alpha(y_target, feature_columns_list, lasso):

    if(lasso == True):
        model = LassoCV(n_alphas = 10000)
        model.fit(X_train[feature_columns_list], y_train[y_target])
    else:
        model = RidgeCV(n_alphas = 10000)
        model.fit(X_train[feature_columns_list], y_train[y_target])
        
    return model.alpha_ # optimal alpha

In [19]:
def fit_regularized_linear_model(y_target, feature_columns_list, lasso=True, tune_alpha=False, prev_coefs = None, prev_alpha = None):

    if not tune_alpha:
        alpha = 1.0
    if (prev_alpha is not None):
        alpha = prev_alpha # use previous alpha
        print("Setting alpha to previous...")
        print(alpha)
    else:
        # Hyperparameter tune the alpha
        alpha = get_optimal_alpha(y_target, feature_columns_list, lasso=True)

    if lasso:
        model = Lasso(alpha=alpha)
    else:
        model = Ridge(alpha=alpha)

    if(prev_coefs is not None): # set weights to previous coefficients
        print("Setting coefficients ....")
        model.coef_ = prev_coefs

        print(model.coef_)

    # Calculation of Q^2 metric
    squared_model_prediction_errors = []
    squared_average_prediction_errors = []

    # Initialize a list to store coefficients
    coefficients_list = []

    # Leave one out -- iterate through the entire length of the dataset
    for i in range(len(y_train)):
        # Store the evaluation datapoint
        evaluation_X = X_train.iloc[[i]]
        evaluation_y = y_train.iloc[[i]][y_target]

        # Drop the ith datapoint (leave this one out)
        X_train_fold = X_train.drop(X_train.index[i])
        y_train_fold = y_train.drop(y_train.index[i])[y_target]

        # Fit the model
        model.fit(X_train_fold[feature_columns_list], y_train_fold)

        # Save the Prediction Error
        prediction = model.predict(evaluation_X[feature_columns_list])[0]
        squared_model_prediction_errors.append((evaluation_y - prediction) ** 2)

        # Save the Total Error for this fold
        squared_average_prediction_errors.append((evaluation_y - np.mean(y_train_fold)) ** 2)

        # Append the coefficients to the list
        coefficients_list.append(model.coef_)

    # Create a DataFrame with feature names as rows and iteration results as columns
    feature_coefficients = pd.DataFrame(coefficients_list, columns=feature_columns_list).T

    q_squared = 1 - (np.sum(squared_model_prediction_errors) / np.sum(squared_average_prediction_errors))
    print("Q^2: " + str(q_squared))

    return model, q_squared, feature_coefficients


In [20]:
def display_feature_coefficients(feature_coef_df):
    # Initialize a list to store DataFrames for each feature
    dfs = []

    # Iterate through the rows of the input DataFrame
    for feature_name, coefficients in feature_coef_df.iterrows():
        # Calculate the confidence interval without NaN values
        non_nan_coefficients = coefficients[~np.isnan(coefficients)]
        if len(non_nan_coefficients) == 0:
            # Handle the case where there are no valid coefficients
            continue

        mean_coef = non_nan_coefficients.mean()

        # Check if all coefficients in the row are the same
        if len(coefficients.unique()) == 1:
            # If all coefficients are the same, set the lower and upper CI to the mean
            confidence_interval = (mean_coef, mean_coef)
        else:
            std_error = non_nan_coefficients.sem()
            confidence_interval = stats.t.interval(0.95, len(non_nan_coefficients) - 1, loc=mean_coef, scale=std_error)

        # Create a DataFrame for the summary data
        temp_df = pd.DataFrame({
            "Feature": [feature_name],
            "Mean": [mean_coef],
            "Lower_CI": [confidence_interval[0]],
            "Upper_CI": [confidence_interval[1]]
        })

        # Append the temporary DataFrame to the list
        dfs.append(temp_df)

    # Concatenate all the DataFrames in the list into the final summary DataFrame
    summary_df = pd.concat(dfs, ignore_index=True)

    return summary_df

In [21]:
def sort_by_mean_abs(df):
    return df.reindex(df["Mean"].abs().sort_values(ascending=False).index)

In [22]:
# Go through the different types of features and fit models

# First, create a data structure that saves the result
result = {
    "model": [],
    "model_type": [],
    "features_included": [],
    "alpha": [],
    "q_squared": []
}

result_df = pd.DataFrame(result)

## Team composition features

In [23]:
len(team_composition_features.columns)

39

In [24]:
model_ridge_composition, mrc_q2, mrc_feature_coefficients = fit_regularized_linear_model(desired_target, team_composition_features.columns, lasso = False, tune_alpha = True)

result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_composition], "model_type": ["Ridge"], "features_included": ["Team Composition"], "alpha": [model_ridge_composition.alpha.round(4)], "q_squared": [mrc_q2]})], ignore_index=True)

Q^2: 0.012501432165575088


In [25]:
model_ridge_composition

In [26]:
sort_by_mean_abs(display_feature_coefficients(mrc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
13,IRCS_IB_mean,-5.002357,-5.004761,-4.999954
2,birth_year_std,-4.200602,-4.212892,-4.188312
4,CRT_std,3.826194,3.823701,3.828687
1,birth_year_mean,-3.740816,-3.75311,-3.728523
34,marital_status_std,-3.368221,-3.370849,-3.365594
32,gender_std,-3.336694,-3.339352,-3.334035
16,IRCS_IR_std,-3.213685,-3.216096,-3.211274
3,CRT_mean,3.170357,3.1674,3.173314
36,political_party_std,3.038671,3.036142,3.041201
33,marital_status_mean,2.856308,2.85363,2.858986


In [27]:
model_lasso_composition, mlc_q2, mlc_feature_coefficients = fit_regularized_linear_model(desired_target, team_composition_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_composition], "model_type": ["Lasso"], "features_included": ["Team Composition"], "alpha": [model_lasso_composition.alpha.round(4)], "q_squared": [mlc_q2]})], ignore_index=True)

Q^2: 0.0227099607115111


In [28]:
model_lasso_composition

## Task Features

In [29]:
len(task_features.columns)

27

In [30]:
model_ridge_task, mrt_q2, mrt_feature_coefficients = fit_regularized_linear_model(desired_target, task_features.columns, lasso = False, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_task], "model_type": ["Ridge"], "features_included": ["Task Complexity"], "alpha": [model_ridge_task.alpha.round(4)], "q_squared": [mrt_q2]})], ignore_index=True)

  model = cd_fast.enet_coordinate_descent(


Q^2: 0.2332122459455187


In [31]:
model_ridge_task

In [32]:
sort_by_mean_abs(display_feature_coefficients(mrt_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,9.169015,9.167739,9.170291
6,Q9divisible_unitary,-6.395587,-6.39636,-6.394813
3,Q6type_5_cc,-5.588585,-5.589805,-5.587365
4,Q7type_7_battle,5.466044,5.464889,5.467199
0,Q1concept_behav,-5.199246,-5.200182,-5.198309
7,Q10maximizing,-4.10996,-4.110667,-4.109253
25,Low,3.904917,3.902777,3.907057
19,Q24eureka_question,-3.891719,-3.893511,-3.889928
13,Q17within_sys_sol,-3.730406,-3.733422,-3.727389
9,Q13outcome_multip,3.218008,3.217208,3.218807


In [33]:
model_lasso_task, mlt_q2, mlt_feature_coefficients = fit_regularized_linear_model(desired_target, task_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_task], "model_type": ["Lasso"], "features_included": ["Task Complexity"], "alpha": [model_lasso_task.alpha.round(4)], "q_squared": [mlt_q2]})], ignore_index=True)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Q^2: 0.23320762030686215


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [34]:
model_lasso_task

In [35]:
sort_by_mean_abs(display_feature_coefficients(mlt_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
3,Q6type_5_cc,-11.084822,-11.087299,-11.082344
4,Q7type_7_battle,6.659162,6.657913,6.660412
22,Q5creativity_input_1,6.24186,6.240645,6.243076
6,Q9divisible_unitary,-5.998854,-6.000456,-5.997252
7,Q10maximizing,-5.316903,-5.31869,-5.315116
0,Q1concept_behav,-5.232213,-5.233914,-5.230511
13,Q17within_sys_sol,-5.176786,-5.180565,-5.173008
25,Low,4.669486,4.665916,4.673055
2,Q4type_2_generate,-4.310938,-4.312791,-4.309084
10,Q14sol_scheme_mul,-4.095135,-4.09741,-4.09286


## Task + Composition Together

In [36]:
# add together weights from previous models
previous_best_weights_ridge = np.array(list(model_ridge_composition.coef_) + list(model_ridge_task.coef_))

In [37]:
task_comp_features = list(task_features.columns) + list(team_composition_features.columns)

model_ridge_taskcomp, mrtc_q2, mrtc_feature_coefficients = fit_regularized_linear_model(desired_target, task_comp_features, lasso = False, tune_alpha = False, prev_coefs = previous_best_weights_ridge, prev_alpha = model_ridge_task.alpha)

Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[ 1.97612628 -3.34679316 -3.85649507  3.35483886  3.93756645  2.25554276
  0.98508357  1.61590423 -1.07702291 -1.24797171  0.68788246 -2.09280782
  1.83099612 -4.96979007 -1.92655483 -0.95925258 -3.22560212  1.96899662
 -2.07995661 -0.22941902  0.50715318  0.64538928  0.91875627 -2.31534681
 -1.99613634  2.68175966  2.08122684  2.03364932  2.0375148   1.63392163
  1.6637759  -0.39460436 -3.43634514  2.85453314 -3.47876443 -0.87186841
  3.13456044  1.7707545  -0.38237041 -5.19716006  0.31982687 -3.03258671
 -5.61086167  5.46202741  2.48336368 -6.39990334 -4.11683669  1.78685897
  3.21752918 -3.14212969  0.42679335 -1.41004935 -3.68421275  1.22102857
  1.54881358  0.40457994 -0.61849422 -0.23795796 -3.88214184  2.08660398
 -1.20966386  9.18466154  0.         -0.8184691   3.91994458 -3.10147548]
Q^2: 0.24162660162952287


In [38]:
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_taskcomp], "model_type": ["Ridge"], "features_included": ["Team Composition + Task Complexity"], "alpha": [model_ridge_taskcomp.alpha.round(4)], "q_squared": [mrtc_q2]})], ignore_index=True)

In [39]:
sort_by_mean_abs(display_feature_coefficients(mrtc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,8.897636,8.896262,8.899010
6,Q9divisible_unitary,-6.124611,-6.125487,-6.123734
0,Q1concept_behav,-5.338552,-5.339574,-5.337530
3,Q6type_5_cc,-5.278921,-5.280230,-5.277612
4,Q7type_7_battle,5.168645,5.167447,5.169842
...,...,...,...,...
11,Q15dec_verifiability,0.405000,0.404717,0.405284
48,political_fiscal_mean,0.371210,0.368233,0.374186
58,gender_mean,0.366554,0.364392,0.368716
65,race_std,-0.065316,-0.068050,-0.062583


In [40]:
# add together weights from previous models
previous_best_weights_lasso = np.array(list(model_lasso_composition.coef_) + list(model_lasso_task.coef_))

In [41]:
model_lasso_taskcomp, mltc_q2, mltc_feature_coefficients = fit_regularized_linear_model(desired_target, task_comp_features, lasso = True, tune_alpha = False, prev_coefs = previous_best_weights_lasso, prev_alpha = model_lasso_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_taskcomp], "model_type": ["Lasso"], "features_included": ["Team Composition + Task Complexity"], "alpha": [model_lasso_taskcomp.alpha.round(4)], "q_squared": [mltc_q2]})], ignore_index=True)

Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[  0.11445286   0.          -0.           0.70652909   0.
   1.1563749    0.           0.65177512   0.          -0.
  -0.          -0.           0.          -1.72300211   0.
  -0.          -0.88771252  -0.          -0.          -0.
   0.48702438  -0.           0.          -0.40772295  -0.
   0.99706012  -0.           0.          -0.           0.
  -0.          -0.           0.          -0.          -0.8394945
  -0.           0.           0.          -0.          -5.22326036
   1.0332459   -4.3182539  -11.13159331   6.6450664    2.24755445
  -6.00156552  -5.31939807   0.           0.2450264   -4.12356457
  -0.          -0.          -5.12330933   0.7833474    0.
   0.           0.01992893  -1.34966795  -3.16094839   0.
   0.           6.24656112   0.          -0.           4.71557961
  -2.26320153]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Q^2: 0.2429042423138863


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [42]:
sort_by_mean_abs(display_feature_coefficients(mltc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
3,Q6type_5_cc,-10.843675,-10.846181,-10.841168
4,Q7type_7_battle,6.555551,6.554250,6.556851
22,Q5creativity_input_1,6.068531,6.067138,6.069924
0,Q1concept_behav,-5.813881,-5.815772,-5.811990
6,Q9divisible_unitary,-5.569199,-5.570803,-5.567594
...,...,...,...,...
23,Q25_type6_mixed_motive,0.000000,0.000000,0.000000
12,Q16shared_knowledge,0.000000,0.000000,0.000000
11,Q15dec_verifiability,0.000000,0.000000,0.000000
24,High,0.000000,0.000000,0.000000


## Conversation Alone

In [43]:
model_lasso_comms, mlcom_q2, mlcom_feature_coefficients = fit_regularized_linear_model(desired_target, conv_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_comms], "model_type": ["Lasso"], "features_included": ["Communication"], "alpha": [model_lasso_comms.alpha.round(4)], "q_squared": [mlcom_q2]})], ignore_index=True)

Q^2: 0.00792983844332984


In [44]:
model_lasso_comms

In [45]:
sort_by_mean_abs(display_feature_coefficients(mlcom_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
1,PC2,2.303763,2.301907,2.305619
3,PC4,-1.880953,-1.88316,-1.878746
2,PC3,-1.822003,-1.824059,-1.819947
4,PC5,1.625281,1.623705,1.626857
0,PC1,1.100221,1.098668,1.101773


In [46]:
model_lasso_comms

In [47]:
model_ridge_comms, mrcom_q2, mrcom_feature_coefficients = fit_regularized_linear_model(desired_target, conv_features.columns, lasso = False, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_comms], "model_type": ["Ridge"], "features_included": ["Communication"], "alpha": [model_ridge_comms.alpha.round(4)], "q_squared": [mrcom_q2]})], ignore_index=True)

Q^2: 0.007931882827066583


In [48]:
sort_by_mean_abs(display_feature_coefficients(mrcom_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
1,PC2,2.306064,2.304208,2.307919
3,PC4,-1.883255,-1.885462,-1.881049
2,PC3,-1.824305,-1.826361,-1.822249
4,PC5,1.627584,1.626008,1.62916
0,PC1,1.102524,1.100972,1.104077


## Conversation Features + Task Features

In [49]:
task_lasso_weights = np.array(list(model_lasso_task.coef_) + list(np.zeros(len((conv_features.columns)))))

In [50]:
convtask_features = list(task_features.columns) + list(conv_features.columns)
model_lasso_tconv, mltconv_q2, mltconv_feature_coefficients = fit_regularized_linear_model(desired_target, convtask_features, lasso = True, tune_alpha = False, prev_coefs=task_lasso_weights, prev_alpha = model_lasso_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_tconv], "model_type": ["Lasso"], "features_included": ["Task Complexity + Communication"], "alpha": [model_lasso_tconv.alpha.round(4)], "q_squared": [mltconv_q2]})], ignore_index=True)

Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[ -5.22326036   1.0332459   -4.3182539  -11.13159331   6.6450664
   2.24755445  -6.00156552  -5.31939807   0.           0.2450264
  -4.12356457  -0.          -0.          -5.12330933   0.7833474
   0.           0.           0.01992893  -1.34966795  -3.16094839
   0.           0.           6.24656112   0.          -0.
   4.71557961  -2.26320153   0.           0.           0.
   0.           0.        ]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Q^2: 0.23799445805320363


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [51]:
sort_by_mean_abs(display_feature_coefficients(mltconv_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
3,Q6type_5_cc,-10.869246,-10.87172,-10.866772
4,Q7type_7_battle,6.595982,6.594855,6.59711
6,Q9divisible_unitary,-6.463215,-6.464888,-6.461543
13,Q17within_sys_sol,-5.867041,-5.870931,-5.863151
22,Q5creativity_input_1,5.740514,5.739216,5.741812
0,Q1concept_behav,-5.456787,-5.458578,-5.454996
7,Q10maximizing,-4.834118,-4.835919,-4.832316
25,Low,4.752197,4.748618,4.755776
2,Q4type_2_generate,-3.415591,-3.417642,-3.41354
10,Q14sol_scheme_mul,-3.397501,-3.399806,-3.395195


In [52]:
task_ridge_weights = np.array(list(model_ridge_task.coef_) + list(np.zeros(len((conv_features.columns)))))

In [53]:
model_ridge_tconv, mrtconv_q2, mrtconv_feature_coefficients = fit_regularized_linear_model(desired_target, convtask_features, lasso = False, tune_alpha = False, prev_coefs=task_ridge_weights, prev_alpha = model_ridge_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_tconv], "model_type": ["Ridge"], "features_included": ["Task Complexity + Communication"], "alpha": [model_ridge_tconv.alpha.round(4)], "q_squared": [mrtconv_q2]})], ignore_index=True)

Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[-5.19716006  0.31982687 -3.03258671 -5.61086167  5.46202741  2.48336368
 -6.39990334 -4.11683669  1.78685897  3.21752918 -3.14212969  0.42679335
 -1.41004935 -3.68421275  1.22102857  1.54881358  0.40457994 -0.61849422
 -0.23795796 -3.88214184  2.08660398 -1.20966386  9.18466154  0.
 -0.8184691   3.91994458 -3.10147548  0.          0.          0.
  0.          0.        ]


Q^2: 0.2379944798676331


In [54]:
sort_by_mean_abs(display_feature_coefficients(mrtconv_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,8.766695,8.76537,8.768019
6,Q9divisible_unitary,-6.492722,-6.493504,-6.491939
4,Q7type_7_battle,5.402442,5.401266,5.403619
3,Q6type_5_cc,-5.381199,-5.382415,-5.379984
0,Q1concept_behav,-5.023426,-5.024418,-5.022434
13,Q17within_sys_sol,-4.269189,-4.272259,-4.266119
7,Q10maximizing,-3.924486,-3.925199,-3.923772
25,Low,3.900329,3.89818,3.902479
19,Q24eureka_question,-3.676867,-3.67872,-3.675014
26,Medium,-3.025605,-3.027684,-3.023527


## Model with All Features

In [55]:
task_composition_lasso_weights = np.array(list(model_lasso_taskcomp.coef_) + list(np.zeros(len((conv_features.columns)))))

In [56]:
all_features = list(task_features.columns) + list(team_composition_features.columns) + list(conv_features.columns)
model_lasso_all, mlall_q2, mlall_feature_coefficients = fit_regularized_linear_model(desired_target, all_features, lasso = True, tune_alpha = False, prev_coefs=task_composition_lasso_weights, prev_alpha = model_lasso_taskcomp.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_all], "model_type": ["Lasso"], "features_included": ["All Features"], "alpha": [model_lasso_all.alpha.round(4)], "q_squared": [mlall_q2]})], ignore_index=True)


Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[ -5.79223652   1.11461219  -4.59923991 -10.90929629   6.51072887
   1.41051409  -5.55810321  -4.92419312   0.           0.34544965
  -3.3273581   -0.          -0.          -5.19774077   0.43631747
   0.           0.           0.01885911  -1.90478531  -2.58442317
   0.           0.           6.09852116   0.           0.
   4.66647242  -1.88026486   1.8416021    0.08273441  -0.24791813
   3.50283305   2.96999932   2.91540107   0.85457075   1.10479441
  -2.06313094  -1.44681386   1.39559587  -1.44648177   0.9444969
  -4.58102919  -2.73265003  -0.69558918  -2.00219006   1.21837352
  -1.06538417  -0.60093066  -0.50125809   0.2169949    1.17381739
  -0.74488659  -2.10591977   1.69334165   0.57091067   1.21584884
   1.10131097   0.57618686   1.42925186   0.4910315   -2.97158285
   1.8434985   -3.5457053   -0.63299814   2.0850506    1.84789392
  -0.12853093   0.           0.           0.           0.
   0.        ]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Q^2: 0.24685039437110123


  model = cd_fast.enet_coordinate_descent(


In [57]:
sort_by_mean_abs(display_feature_coefficients(mlall_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
3,Q6type_5_cc,-10.629257,-10.631769,-10.626745
4,Q7type_7_battle,6.500138,6.498818,6.501458
6,Q9divisible_unitary,-5.968285,-5.969948,-5.966621
0,Q1concept_behav,-5.843287,-5.845244,-5.841330
22,Q5creativity_input_1,5.684223,5.682755,5.685691
...,...,...,...,...
8,Q11optimizing,0.000000,0.000000,0.000000
15,Q19time_solvability,0.000000,0.000000,0.000000
21,Q21intellective_judg_1,0.000000,0.000000,0.000000
23,Q25_type6_mixed_motive,0.000000,0.000000,0.000000


In [58]:
task_composition_ridge_weights = np.array(list(model_ridge_taskcomp.coef_) + list(np.zeros(len((conv_features.columns)))))

model_ridge_all, mrall_q2, mrall_feature_coefficients = fit_regularized_linear_model(desired_target, all_features, lasso = False, tune_alpha = True, prev_coefs=task_composition_ridge_weights, prev_alpha = model_ridge_taskcomp.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_all], "model_type": ["Ridge"], "features_included": ["All Features"], "alpha": [model_ridge_all.alpha.round(4)], "q_squared": [mrall_q2]})], ignore_index=True)

Setting alpha to previous...
0.007655275360661545
Setting coefficients ....
[-5.32738249  0.64540796 -3.28928629 -5.31419371  5.14526982  1.74926225
 -6.13190434 -3.90915581  1.83024734  3.21367354 -2.94314292  0.39873251
 -1.31855952 -3.64475766  1.02542803  1.66658167  0.57992196 -1.02628494
 -0.68962002 -3.67574071  1.85692479 -1.05167451  8.92753095  0.
 -0.93239825  3.75571064 -2.8233124   1.8616657   0.14134082 -0.20219578
  3.57274876  3.04570855  2.95483067  0.89774633  1.1168793  -2.11296278
 -1.4676879   1.45558114 -1.45992471  0.97340611 -4.66378127 -2.80469607
 -0.71101358 -2.02419582  1.28077852 -1.07931711 -0.6280697  -0.56010635
  0.25827777  1.19075048 -0.7629107  -2.1570234   1.74701926  0.63394683
  1.35500582  1.26240377  0.59813826  1.48989781  0.51097808 -3.04346719
  1.92421767 -3.6112379  -0.67038561  2.15788934  1.88459964 -0.15488598
  0.          0.          0.          0.          0.        ]
Q^2: 0.2461978315003066


In [59]:
sort_by_mean_abs(display_feature_coefficients(mrall_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,8.587952,8.586548,8.589356
6,Q9divisible_unitary,-6.254851,-6.255745,-6.253957
3,Q6type_5_cc,-5.156402,-5.157691,-5.155114
0,Q1concept_behav,-5.129995,-5.131063,-5.128927
4,Q7type_7_battle,5.089784,5.088549,5.091019
...,...,...,...,...
56,education_level_mean,0.440751,0.438208,0.443294
48,political_fiscal_mean,-0.079240,-0.082215,-0.076264
65,race_std,0.028944,0.026146,0.031742
70,PC5,0.008621,0.007006,0.010237


# Dataframe that summarizes all these experiments!

In [60]:
result_df.sort_values(by = "q_squared", ascending = False)

Unnamed: 0,model,model_type,features_included,alpha,q_squared
10,Lasso(alpha=0.007655275360661545),Lasso,All Features,0.0077,0.24685
11,Ridge(alpha=0.007655275360661545),Ridge,All Features,0.0077,0.246198
5,Lasso(alpha=0.007655275360661545),Lasso,Team Composition + Task Complexity,0.0077,0.242904
4,Ridge(alpha=0.007655275360661545),Ridge,Team Composition + Task Complexity,0.0077,0.241627
9,Ridge(alpha=0.007655275360661545),Ridge,Task Complexity + Communication,0.0077,0.237994
8,Lasso(alpha=0.007655275360661545),Lasso,Task Complexity + Communication,0.0077,0.237994
2,Ridge(alpha=0.007655275360661545),Ridge,Task Complexity,0.0077,0.233212
3,Lasso(alpha=0.007655275360661545),Lasso,Task Complexity,0.0077,0.233208
1,Lasso(alpha=1.3398158440138745),Lasso,Team Composition,1.3398,0.02271
0,Ridge(alpha=1.3398158440138745),Ridge,Team Composition,1.3398,0.012501


In [61]:
fit_regularized_linear_model(desired_target, ["Low", "Medium", "High"], lasso = False, tune_alpha = True)

Q^2: 0.012587259807185003


(Ridge(alpha=0.004536679362893395),
 0.012587259807185003,
             0         1         2         3         4         5         6     \
 Low     4.944166  4.916853  4.920654  4.906687  4.897582  4.946314  4.939433   
 Medium -3.954937 -3.924357 -3.907913 -3.958569 -3.914722 -3.939088 -3.945472   
 High   -0.989229 -0.992496 -1.012741 -0.948118 -0.982860 -1.007226 -0.993962   
 
             7         8         9     ...      1033      1034      1035  \
 Low     4.907659  4.937347  4.897582  ...  4.958212  4.956249  4.897582   
 Medium -3.957597 -3.941299 -3.914722  ... -3.983030 -3.909007 -3.914722   
 High   -0.950062 -0.996048 -0.982860  ... -0.975182 -1.047242 -0.982860   
 
             1036      1037      1038      1039      1040      1041      1042  
 Low     4.956249  5.092511  4.958212  4.948823  4.946531  5.092511  4.936813  
 Medium -3.909007 -4.012186 -3.983030 -3.964251 -3.918725 -4.012186 -3.928443  
 High   -1.047242 -1.080325 -0.975182 -0.984572 -1.027806 -1.080325 -

In [62]:
fit_regularized_linear_model(desired_target, ["playerCount"], lasso = True, tune_alpha = True)

Q^2: 0.00016444459245679166


(Lasso(alpha=0.13105415224069775),
 0.00016444459245679166,
                  0         1         2        3         4         5     \
 playerCount  0.848285  0.852245  0.828737  0.82015  0.859963  0.840447   
 
                 6         7         8         9     ...    1033    1034  \
 playerCount  0.84435  0.820931  0.842615  0.859963  ...  0.8288  0.8288   
 
                1035    1036      1037    1038      1039      1040      1041  \
 playerCount  0.8288  0.8288  0.883089  0.8288  0.834229  0.834229  0.781899   
 
                 1042  
 playerCount  0.84435  
 
 [1 rows x 1043 columns])

In [63]:
# model to look at composition without playercount
fit_regularized_linear_model(desired_target, list(set(team_composition_features.columns)-set(["playerCount"])), lasso = True, tune_alpha = True)


Q^2: 0.02696386519862548


(Lasso(alpha=1.1963002369349076),
 0.02696386519862548,
                            0         1         2         3         4     \
 birth_year_std        -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 gender_mean           -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 IRCS_IB_mean          -1.904136 -1.895378 -1.947324 -1.966282 -1.878330   
 birth_year_mean        0.000000  0.000000  0.000000  0.000000  0.000000   
 IRCS_GV_mean          -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 IRCS_IR_mean          -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 race_mean              0.000000  0.000000  0.000000  0.000000  0.000000   
 IRCS_GV_std            0.000000  0.000000  0.000000  0.000000  0.000000   
 IRCS_GS_std           -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 income_max_mean        1.241380  1.233378  1.280445  1.297556  1.218000   
 country_std           -0.000000 -0.000000 -0.000000 -0.000000 -0.000000   
 income_min_mean        0.623133

# Feature Importance

In [64]:
sort_by_mean_abs(display_feature_coefficients(mlall_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
3,Q6type_5_cc,-10.629257,-10.631769,-10.626745
4,Q7type_7_battle,6.500138,6.498818,6.501458
6,Q9divisible_unitary,-5.968285,-5.969948,-5.966621
0,Q1concept_behav,-5.843287,-5.845244,-5.841330
22,Q5creativity_input_1,5.684223,5.682755,5.685691
...,...,...,...,...
8,Q11optimizing,0.000000,0.000000,0.000000
15,Q19time_solvability,0.000000,0.000000,0.000000
21,Q21intellective_judg_1,0.000000,0.000000,0.000000
23,Q25_type6_mixed_motive,0.000000,0.000000,0.000000


In [65]:
def plot_top_n_features(data, n, filepath):
    # Calculate the absolute mean value and sort the DataFrame in descending order
    data['Absolute_Mean'] = data['Mean'].abs()
    top_n_features = data.sort_values(by='Absolute_Mean', ascending=False).head(n)

    # Define color mapping for the features
    color_map = {}
    name_map = {}
    for feature in task_features.columns:
        color_map[feature] = 'yellowgreen'
        name_map[feature] = "Task Feature"
    for feature in conv_features.columns:
        color_map[feature] = 'powderblue'
        name_map[feature] = "Conversation Feature"
    for feature in team_composition_features.columns:
        color_map[feature] = 'lightpink'
        name_map[feature] = "Team Composition Feature"

    # Create a horizontal bar graph
    plt.figure(figsize=(10, 6))

    handles = []

    for feature in top_n_features['Feature']:
        color = color_map.get(feature, 'k')  # Default to black if not in any list
        bars = plt.barh(feature, top_n_features[top_n_features['Feature'] == feature]['Mean'], color=color)
        handles.append(bars[0])

    # Customize the plot
    plt.xlabel('Mean Coefficient (Across LOO Cross Validation)', fontsize = 14)
    plt.title(f'Top {n} features for {desired_target} (min chats = {min_num_chats})', fontsize=20)
    plt.gca().invert_yaxis()  # Invert the y-axis to display the highest value at the top

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Create a legend outside the plot area with unique labels
    unique_features = []
    unique_labels = []
    for feature in top_n_features['Feature']:
        if name_map.get(feature, feature) not in unique_labels:
            unique_labels.append(name_map.get(feature, feature))
            unique_features.append(feature)

    legend_handles = [plt.Line2D([0], [0], color=color_map.get(feature, 'k'), lw=4, label=name_map.get(feature, feature)) for feature in unique_features]
    plt.legend(handles=legend_handles, loc='center left', fontsize = 14, bbox_to_anchor=(1, 0.5))

    # Add labels to the bars with increased text size and Mean rounded to 2 decimals, consistently inside the bar
    label_offset = 0.4  # Adjust this value for proper spacing
    for bar, value, feature in zip(handles, top_n_features['Mean'], top_n_features['Feature']):
        label_x = (max(value, 0) if value >= 0 else min(value, 0))
        bbox = bar.get_bbox()
        label_y = bbox.bounds[1] + label_offset
        if value >= 0:
            plt.text(label_x, label_y, f'{value:.2f}', va='center', fontsize=12)
        else:
            plt.text(label_x, label_y, f'{value:.2f}', ha='right', va='center', fontsize=12)

    # Show the plot
    plt.savefig(filepath + ".svg")
    plt.savefig(filepath + ".png")
    plt.show()

Questions:
- More deeply understand difference between LASSO and Ridge
- Better understand `alpha` hyperparameter
- Why doesn't more features mean a better R^2? (Wouldn't the model 'throw out' features that don't work?)