# input data

In [None]:
#Import statements
import random
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV #built in class to provide tuning
from sklearn.model_selection import RepeatedKFold
from numpy import arange
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
# Seed Python's built-in `random` module. The estimators and CV splitters in the
# cells below additionally pass their own random_state=1.
random.seed(1) #set a seed for reproducible results

In [None]:
#Load dataset
# Mount Google Drive at /content/drive so the later cells can read and write the
# CSV files under /content/drive/MyDrive (this import only exists on Google Colab).
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

In [None]:
def combine_embedding_with_table(embedding_name, modelling_variable,
                                 data_dir="/content/drive/MyDrive/codeAndData/data"):

    """
    Combine an embedding table with the basetable and drop the columns that are not modelled.

    The basetable is read from ``{data_dir}/ModelData.csv`` and the embedding from
    ``{data_dir}/{embedding_name}.csv``. Both tables are concatenated column-wise
    (rows are aligned on their index), after which a fixed list of columns and the
    target columns that are not being modelled are removed.

    Args:
    - embedding_name (str): file stem of the embedding CSV inside ``data_dir`` (e.g. 'openai').
    - modelling_variable (str): the target to keep ('SWB', 'Positive_affect', 'Negative_affect').
      For any other value no target column is dropped.
    - data_dir (str): directory holding ``ModelData.csv`` and the embedding CSVs.
      Defaults to the Google Drive folder used by this notebook.

    Returns:
    - DataFrame: the combined and reduced table. If the embedding file does not
      exist, a message is printed and the basetable is returned exactly as it was
      read (no embedding columns added, no columns dropped).

    Raises:
    - FileNotFoundError: if ``ModelData.csv`` itself is missing.
    - KeyError: if one of the columns to drop is absent from the combined table.

    """

    # Columns removed for every modelling variable.
    always_dropped = ['created_at', 'full_text',
                      'Attentive', 'Alert', 'Determined', 'Inspired', 'Active',
                      'Hostile', 'Ashamed', 'Upset', 'Afraid', 'Nervous',
                      'OriginalText', 'clean_text',
                      'neg', 'neu', 'pos', 'compound', 'TextBlob_score',
                      'days_since_COVID19']

    # Target columns that must go so that only the modelled target remains.
    other_targets = {
        'SWB': ['Positive_affect', 'Negative_affect'],
        'Positive_affect': ['SWB', 'Negative_affect'],
        'Negative_affect': ['SWB', 'Positive_affect'],
    }

    basetable = pd.read_csv(f"{data_dir}/ModelData.csv")

    # Construct file path of embedding
    filename = f"{data_dir}/{embedding_name}.csv"
    try:
        emb_table = pd.read_csv(filename)
    except FileNotFoundError:
        # Best-effort fallback kept from the original code: report the missing
        # file (with its full path) and hand back the untouched basetable.
        print(f"File '{filename}' not found.")
        return basetable

    # Concatenate the basetable and the embedding
    basetable = pd.concat([basetable, emb_table], axis=1)

    # Report everything that is removed, not just the target columns.
    columns_to_delete = always_dropped + other_targets.get(modelling_variable, [])
    basetable = basetable.drop(columns=columns_to_delete)

    print(f"Combined {embedding_name} with basetable.")
    print(f"Dropped columns: {columns_to_delete}")
    print("=" * 40)
    print(f"This code is used to build a model of {modelling_variable} using {embedding_name}")

    return basetable


# Feature Selection

In [None]:
# Two inputs are chosen here: the embedding name and the dependent variable to research.
# Embedding options: ('tfidf','glove200', '1024elmo', 'berttwitter','openai')
embedding_name = 'openai'
# Pass the variable rather than a second literal, so the data that is loaded and
# the result file names built from `embedding_name` below can never disagree.
basetable = combine_embedding_with_table(embedding_name, 'SWB')


In [None]:
# Display the table returned by combine_embedding_with_table.
basetable

In [None]:
# Summary statistics of the SWB column, which is used as the target `y` below.
basetable['SWB'].describe()

# Models

In [None]:
# Target vector and feature matrix (every column except the target).
y = basetable['SWB']
X = basetable.drop(columns=['SWB'])

#transform T/F to 0 1
for flag_column in ("is_quote_status", "is_reply", "possibly_sensitive"):
    X[flag_column] = X[flag_column].astype(int)

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

### Elastic-Net

In [None]:
# Per-fold results, filled by the loop below (one entry per outer split).
tuned_parameters=[]
rmse_scores=[]
r2_scores=[]

# define model
model =  ElasticNet(random_state=1)

# define grid
# NOTE(review): scikit-learn advises against alpha=0 for ElasticNet - confirm the
# 0.0 entry is intended.
grid = dict()
grid['alpha'] = [1e-3,1e-2, 1e-1, 0.0, 1.0, 10.0]
grid['l1_ratio'] = arange(0, 1.1, 0.1)

# Columns that are standardised inside every fold; all remaining columns are
# passed through unchanged. Defined once because it does not depend on the fold.
scale_columns=['retweet_count','favorite_count',
               'hour','url_count','mentions_count','questionmark_count',
               'exclamationmark_count','points_count','uppercase_ratio',
               'hashtag_count']

# Outer evaluation: 2 folds repeated 5 times = 10 train/test splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)

for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

    # Fit the scaler on the training fold only and apply it to both folds.
    # The scaled columns come first, followed by the untouched columns; the row
    # index is reset so both parts line up in the concatenation.
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train = pd.concat(
        [pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns),
         X_train.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)
    X_test = pd.concat(
        [pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns),
         X_test.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)

    # Inner 2-fold grid search for the hyper-parameters.
    grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
    grid_search.fit(X_train, y_train)

    print('Config: %s' % grid_search.best_params_)
    tuned_parameters.append(grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the best
    # configuration on the whole training fold, so no second fit is needed.
    final_model = grid_search.best_estimator_
    y_pred = final_model.predict(X_test)

    # Predictions are limited to the interval [-1, 1] before scoring.
    y_pred = np.clip(y_pred, -1, 1)
    model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)
    print('r2',model_r2)
    print('RMSE',model_rmse)

    rmse_scores.append(model_rmse)
    r2_scores.append(model_r2)

elastic_results_df = pd.DataFrame({ "Elastic_R2": r2_scores,"Elastic_RMSE": rmse_scores,"Elastic_parameter":tuned_parameters})

In [None]:
# Mean R2 and RMSE over all repeats and folds, rounded to 4 decimals
elastic_r2, elastic_rmse = elastic_results_df["Elastic_R2"], elastic_results_df["Elastic_RMSE"]
mean_rmse = round(elastic_rmse.mean(), 4)
mean_r2 = round(elastic_r2.mean(), 4)
print("mean R2 Score:", mean_r2)
print("mean RMSE:", mean_rmse)

# Per-split results
print("Results DataFrame:", elastic_results_df, sep="\n")

In [None]:
# Show the embedding name that goes into the result file name in the next cell.
embedding_name

In [None]:
# Write the per-split Elastic-Net results to Drive (UTF-8 with BOM, no index column).
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBelastic{embedding_name}.csv'
elastic_results_df.to_csv(path, index=False, encoding='utf-8-sig')

### Support Vector Machines (RBF)

In [None]:
# Per-fold results, filled by the loop below (one entry per outer split).
tuned_parameters=[]
rmse_scores=[]
r2_scores=[]


model= svm.SVR()

# Grid over the regularisation strength C and the RBF kernel width gamma.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Columns that are standardised inside every fold; all remaining columns are
# passed through unchanged. Defined once because it does not depend on the fold.
scale_columns=['retweet_count','favorite_count',
               'hour','url_count','mentions_count','questionmark_count',
               'exclamationmark_count','points_count','uppercase_ratio',
               'hashtag_count']

# Outer evaluation: 2 folds repeated 5 times = 10 train/test splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)
for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(train_index,test_index)

    # Fit the scaler on the training fold only and apply it to both folds.
    # The scaled columns come first, followed by the untouched columns; the row
    # index is reset so both parts line up in the concatenation.
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train = pd.concat(
        [pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns),
         X_train.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)
    X_test = pd.concat(
        [pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns),
         X_test.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)

    # Inner 2-fold grid search for the hyper-parameters.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
    grid_search.fit(X_train, y_train)

    print('Config: %s' % grid_search.best_params_)
    tuned_parameters.append(grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the best
    # configuration on the whole training fold, so no second fit is needed.
    final_model = grid_search.best_estimator_
    y_pred = final_model.predict(X_test)

    # Predictions are limited to the interval [-1, 1] before scoring.
    y_pred = np.clip(y_pred, -1, 1)
    model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)

    print('r2',model_r2)

    print('RMSE',model_rmse)

    rmse_scores.append(model_rmse)
    r2_scores.append(model_r2)

svm_results_df = pd.DataFrame({ "SVM_R2": r2_scores,"SVM_RMSE": rmse_scores,"SVM_parameter":tuned_parameters})




In [None]:
# Mean R2 and RMSE over all repeats and folds, rounded to 4 decimals
svm_r2, svm_rmse = svm_results_df["SVM_R2"], svm_results_df["SVM_RMSE"]
mean_rmse = round(svm_rmse.mean(), 4)
mean_r2 = round(svm_r2.mean(), 4)
print("mean R2 Score:", mean_r2)
print("mean RMSE:", mean_rmse)

# Per-split results
print("Results DataFrame:", svm_results_df, sep="\n")

In [None]:
# Show the embedding name that goes into the result file name in the next cell.
embedding_name

In [None]:
# Write the per-split SVR results to Drive (UTF-8 with BOM, no index column).
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBSVM{embedding_name}.csv'
svm_results_df.to_csv(path, index=False, encoding='utf-8-sig')

### Random Forest

In [None]:
# Random forest with fixed settings (500 trees, sqrt feature sampling); the
# evaluation loop in the next cell fits it without any grid search.
model = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1)

# Result containers filled by the evaluation loop.
tuned_parameters, rmse_scores, r2_scores = [], [], []

In [None]:

# Start from empty score lists so that re-running this cell does not append a
# second set of fold results to the ones collected by an earlier run.
rmse_scores = []
r2_scores = []

# Outer evaluation: 2 folds repeated 5 times = 10 train/test splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)
for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

    # No hyper-parameter tuning and no feature scaling for this model:
    # `model` is fitted as configured in the previous cell.
    final_model = model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    # Predictions are limited to the interval [-1, 1] before scoring.
    y_pred = np.clip(y_pred, -1, 1)
    model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)
    print('r2',model_r2)
    print('RMSE',model_rmse)

    rmse_scores.append(model_rmse)
    r2_scores.append(model_r2)

In [None]:
# Collect the per-split random-forest scores gathered by the loop above into one table.
rf_results_df = pd.DataFrame({"RF_R2": r2_scores,"RF_RMSE": rmse_scores})

In [None]:
# Mean R2 and RMSE over all repeats and folds, rounded to 4 decimals
rf_r2, rf_rmse = rf_results_df["RF_R2"], rf_results_df["RF_RMSE"]
mean_rmse = round(rf_rmse.mean(), 4)
mean_r2 = round(rf_r2.mean(), 4)
print("mean R2 Score:", mean_r2)
print("mean RMSE:", mean_rmse)

# Per-split results
print("Results DataFrame:", rf_results_df, sep="\n")

In [None]:
# Show the embedding name that goes into the result file name in the next cell.
embedding_name

In [None]:
# Write the per-split random-forest results to Drive (UTF-8 with BOM, no index column).
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBRF{embedding_name}.csv'
rf_results_df.to_csv(path, index=False, encoding='utf-8-sig')

### NN

In [None]:
model =  MLPRegressor(random_state=1)

# Per-fold results, filled by the loop below (one entry per outer split).
tuned_parameters=[]
rmse_scores=[]
r2_scores=[]

# defining parameter range
param_grid = {'hidden_layer_sizes': [2,4,6,8,10,12,14,16,18,20],
              'batch_size':[32],
             'alpha': [10**(-4), 10**(-3.5), 10**(-3), 10**(-2.5), 10**(-2), 10**(-1.5), 10**(-1), 10**(-0.5), 0]}

# Columns that are standardised inside every fold; all remaining columns are
# passed through unchanged. Defined once because it does not depend on the fold.
scale_columns=['retweet_count','favorite_count',
               'hour','url_count','mentions_count','questionmark_count',
               'exclamationmark_count','points_count','uppercase_ratio',
               'hashtag_count']

# Outer evaluation: 2 folds repeated 5 times = 10 train/test splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)

for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(train_index,test_index)

    # Fit the scaler on the training fold only and apply it to both folds.
    # The scaled columns come first, followed by the untouched columns; the row
    # index is reset so both parts line up in the concatenation.
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train = pd.concat(
        [pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns),
         X_train.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)
    X_test = pd.concat(
        [pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns),
         X_test.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)

    # Inner 2-fold grid search for the hyper-parameters.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
    grid_search.fit(X_train, y_train)

    print('Config: %s' % grid_search.best_params_)
    tuned_parameters.append(grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the best
    # configuration on the whole training fold, so no second fit is needed.
    final_model = grid_search.best_estimator_
    y_pred = final_model.predict(X_test)

    # Predictions are limited to the interval [-1, 1] before scoring.
    y_pred = np.clip(y_pred, -1, 1)
    model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)
    print('r2',model_r2)
    print('RMSE',model_rmse)

    rmse_scores.append(model_rmse)
    r2_scores.append(model_r2)

# Create a DataFrame from the individual scores
nn_results_df = pd.DataFrame({"NN_R2": r2_scores,"NN_RMSE": rmse_scores,"NN_parameter":tuned_parameters})

In [None]:
# Mean R2 and RMSE over all repeats and folds, rounded to 4 decimals
nn_r2, nn_rmse = nn_results_df["NN_R2"], nn_results_df["NN_RMSE"]
mean_rmse = round(nn_rmse.mean(), 4)
mean_r2 = round(nn_r2.mean(), 4)
print("mean R2 Score:", mean_r2)
print("mean RMSE:", mean_rmse)

# Per-split results
print("Results DataFrame:", nn_results_df, sep="\n")

In [None]:
# Show the embedding name that goes into the result file name in the next cell.
embedding_name

In [None]:
# Write the per-split MLP results to Drive (UTF-8 with BOM, no index column).
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBNN{embedding_name}.csv'
nn_results_df.to_csv(path, index=False, encoding='utf-8-sig')

### XGBOOST

In [None]:
!pip install xgboost==1.7.6

In [None]:
import xgboost as xgb

In [None]:
# Show the embedding name that goes into the XGBoost result file names below.
embedding_name

In [None]:
# Result containers filled by the evaluation loop further down.
tuned_parameters, rmse_scores, r2_scores = [], [], []

# Candidate values for the grid search.
max_depth = [2,3,5,7,10]
eta = [0.025, 0.05, 0.1, 0.2, 0.3]
gamma = [0, 0.1, 0.2, 0.3, 0.4, 1.0, 1.5, 2.0]
param_grid = dict(max_depth=max_depth, eta=eta, gamma=gamma)

# define model (tree_method="gpu_hist" selects XGBoost's GPU histogram algorithm)
model = xgb.XGBRegressor(random_state=1, tree_method="gpu_hist")

In [None]:
# Prints the row indices of the outer splits from the fifth one (i == 4) onwards.
# Nothing is trained here; the split variables are left set to the last split.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)
for i, (train_index, test_index) in enumerate(rkf.split(X)):
    if i < 4:
        continue
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

In [None]:
# Outer evaluation: 2 folds repeated 5 times = 10 train/test splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)

# Start from empty result lists so that re-running this cell does not append a
# second set of fold results to the ones collected by an earlier run.
tuned_parameters = []
rmse_scores = []
r2_scores = []

# Intermediate results file; it is rewritten after every outer split so the
# scores collected so far survive an interrupted run.
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBXGBoost4{embedding_name}.csv'

for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

    # Inner 2-fold grid search; `model` and `param_grid` come from the setup cell.
    # The features are used unscaled for this model.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
    grid_search.fit(X_train, y_train)
    print('Config: %s' % grid_search.best_params_)

    tuned_parameters.append(grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the best
    # configuration on the whole training fold, so no second fit is needed.
    final_model = grid_search.best_estimator_
    y_pred = final_model.predict(X_test)
    # Predictions are limited to the interval [-1, 1] before scoring.
    y_pred = np.clip(y_pred, -1, 1)

    model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_r2 = r2_score(y_test, y_pred)
    print('r2', model_r2)
    print('RMSE', model_rmse)

    # Append individual scores to the lists
    rmse_scores.append(model_rmse)
    r2_scores.append(model_r2)

    xgb_results_df = pd.DataFrame({"XGB_R2": r2_scores, "XGB_RMSE": rmse_scores ,"XGB_parameter":tuned_parameters})
    xgb_results_df.to_csv(path, index=False, encoding='utf-8-sig')




In [None]:
# Mean R2 and RMSE over all repeats and folds, rounded to 4 decimals
xgb_r2, xgb_rmse = xgb_results_df["XGB_R2"], xgb_results_df["XGB_RMSE"]
mean_rmse = round(xgb_rmse.mean(), 4)
mean_r2 = round(xgb_r2.mean(), 4)
print("mean R2 Score:", mean_r2)
print("mean RMSE:", mean_rmse)

# Per-split results
print("Results DataFrame:", xgb_results_df, sep="\n")

In [None]:
# Show the embedding name that goes into the result file name in the next cell.
embedding_name

In [None]:
# Write the per-split XGBoost results to Drive (UTF-8 with BOM, no index column).
path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBXGBoost{embedding_name}.csv'
xgb_results_df.to_csv(path, index=False, encoding='utf-8-sig')

# Variable Importance

In [None]:
# Load the tuned MLP parameters written by the NN section. The file name is built
# from `embedding_name` (same pattern as the save cell) so the parameters always
# belong to the embedding that `X` was built from.
parameter_df = pd.read_csv(f"/content/drive/MyDrive/codeAndData/data/Test_results/SWBNN{embedding_name}.csv")

In [None]:
# Display the loaded per-split NN results and parameters.
parameter_df

In [None]:
!pip install xgboost==1.7.6
!pip install shap==0.44.1

In [None]:
import xgboost as xgb
import shap
import ast

In [None]:
# NOTE(review): this cell only splits and scales the data; it repeats the first
# part of the next cell and trains nothing. Its only lasting effect is the set of
# variables left over from the final split.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)
treeshap_tuned_parameters, treeshap_rmse_scores, treeshap_r2_scores = [], [], []

# Columns that are standardised; every other column is passed through unchanged.
scale_columns=['retweet_count','favorite_count',
               'hour','url_count','mentions_count','questionmark_count',
               'exclamationmark_count','points_count','uppercase_ratio',
               'hashtag_count']

for i, (train_index, test_index) in enumerate(rkf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

    # Scaler is fitted on the training fold only; scaled columns are placed first
    # and the row index is reset so both parts line up.
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train = pd.concat(
        [pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns),
         X_train.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)
    X_test = pd.concat(
        [pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns),
         X_test.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)

In [None]:
# For every outer split: refit the tuned MLP, train an XGBoost surrogate on the
# MLP's (clipped) training predictions, and save the surrogate's Tree SHAP values
# for the test fold to SWBImportance{i}.csv.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)
treeshap_tuned_parameters=[]
treeshap_rmse_scores=[]
treeshap_r2_scores=[]

# Columns that are standardised inside every fold; all remaining columns are
# passed through unchanged. Defined once because it does not depend on the fold.
scale_columns=['retweet_count','favorite_count',
               'hour','url_count','mentions_count','questionmark_count',
               'exclamationmark_count','points_count','uppercase_ratio',
               'hashtag_count']

# Surrogate model and its grid are the same for every split; GridSearchCV works
# on clones of the estimator, so creating them once is sufficient.
# Only max_depth is searched; eta and gamma are not part of the grid.
tree_shap = xgb.XGBRegressor(tree_method="gpu_hist", random_state=1)
max_depth = [12,14,16,18]
param_grid = {'max_depth': max_depth}

for i, (train_index, test_index) in enumerate(rkf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(train_index,test_index)

    # Fit the scaler on the training fold only and apply it to both folds.
    # The scaled columns come first, followed by the untouched columns; the row
    # index is reset so both parts line up in the concatenation.
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train = pd.concat(
        [pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns),
         X_train.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)
    X_test = pd.concat(
        [pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns),
         X_test.drop(columns=scale_columns).reset_index(drop=True)],
        axis=1)

    # Rebuild the MLP with the parameters stored for split i. The stored value is
    # the string form of a dict, hence ast.literal_eval.
    best_model = MLPRegressor(**ast.literal_eval(parameter_df['NN_parameter'][i]),random_state=1)
    final_model = best_model.fit(X_train, y_train)
    y_pred_train = np.clip(final_model.predict(X_train), -1, 1)
    y_pred_test = np.clip(final_model.predict(X_test), -1, 1)

    print('R2:',r2_score(y_test, y_pred_test))

    #tree shap
    # The surrogate is trained to reproduce the MLP's training predictions, not
    # the true target.
    grid_search = GridSearchCV(estimator=tree_shap, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1)
    grid_search.fit(X_train, y_pred_train)

    print('Config: %s' % grid_search.best_params_)
    treeshap_tuned_parameters.append(grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the best
    # surrogate on the whole training fold, so no second fit is needed.
    final_treeshap = grid_search.best_estimator_

    # Use Tree SHAP to explain predictions
    explainer = shap.TreeExplainer(final_treeshap)
    shap_values = explainer.shap_values(X_test)

    y_tree_pred_test = np.clip(final_treeshap.predict(X_test), -1, 1)

    # How closely the surrogate follows the MLP on the test fold.
    surr_rmse = np.sqrt(mean_squared_error(y_pred_test, y_tree_pred_test))
    surr_r2 = r2_score(y_pred_test, y_tree_pred_test)

    # Append individual scores to the lists
    treeshap_rmse_scores.append(surr_rmse)
    treeshap_r2_scores.append(surr_r2)
    print(surr_r2)

    # One file of SHAP values per split; columns are written as 0..n-1.
    path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBImportance{i}.csv'
    pd.DataFrame(shap_values).to_csv(path, index=False, encoding='utf-8-sig')

# Surrogate fidelity and chosen parameters for all splits (built once, after the loop).
xgb_tree_results_df = pd.DataFrame({"surr_R2": treeshap_r2_scores, "surr_RMSE": treeshap_rmse_scores ,"surr_parameter":treeshap_tuned_parameters})

#path = f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBImportanceParameterTable.csv'
#xgb_tree_results_df.to_csv(path, index=False, encoding='utf-8-sig')


In [None]:
##plot shap

# Read the ten per-split SHAP files written by the loop above and stack them
# row-wise into a single frame with a fresh 0..n-1 index.
shap_frames = [
    pd.read_csv(f'/content/drive/MyDrive/codeAndData/data/Test_results/SWBImportance{i}.csv')
    for i in range(10)
]
all_shap_values = pd.concat(shap_frames, ignore_index=True)

In [None]:
# Label the SHAP columns with the column names of `X_train`, i.e. the frame left
# over from the last split of the SHAP loop (scaled columns first, then the rest).
# NOTE(review): this relies on that loop having run in the current kernel session.
all_shap_values.columns = X_train.columns


In [None]:
# Display the stacked SHAP values.
all_shap_values

In [None]:
# Write the stacked SHAP values to Drive (UTF-8 with BOM, row index included).
path = '/content/drive/MyDrive/codeAndData/data/Test_results/all_SWB_shap_values.csv'
all_shap_values.to_csv(path, index=True, encoding='utf-8-sig')

In [None]:
# Mean absolute SHAP value per column.
mean_values = all_shap_values.abs().mean()
# Average of the entries from position 13 onwards
# (presumably the embedding dimensions - confirm against the column order).
embedding_shap = mean_values.iloc[13:].mean()

In [None]:
# Take the first 14 entries by position. The copy makes sure the assignment below
# cannot write through to `mean_values`.
plot_data = mean_values.iloc[:14].copy()
# Overwrite the 14th entry with the averaged value. `.iloc` is used because a bare
# integer key on a label-indexed Series is treated as a position only by older
# pandas versions (deprecated in recent releases).
plot_data.iloc[13] = embedding_shap
# Rename whatever label sits in that slot (it is '0' when the 14th column is named '0').
plot_data = plot_data.rename({plot_data.index[13]: 'textual_representation'})
plot_data

In [None]:
# Rank the entries of plot_data (1 = largest value) and store the ranks as a
# one-column table, keeping the feature names as the index in the CSV.
ranked_data = pd.DataFrame(plot_data.rank(ascending=False).astype(int))
path = '/content/drive/MyDrive/codeAndData/data/Test_results/SWBImportanceRank.csv'
ranked_data.to_csv(path, index=True, encoding='utf-8-sig')

In [None]:
# `plt` is used below but matplotlib.pyplot is not imported anywhere earlier in
# the notebook, so import it here to keep the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# Sorting the data by feature importance in ascending order
# (barh draws the first entry at the bottom, so the largest value ends up on top)
plot_data = plot_data.sort_values(ascending=True)

# Highlight the bar labelled "textual_representation"; all other bars are blue
colors = ['lightcoral' if str(index)=="textual_representation" else 'skyblue' for index in plot_data.index]

# Plotting the horizontal bar plot
plt.figure(figsize=(10, 6))
plot_data.plot(kind='barh', color=colors, edgecolor='black')
plt.xlabel('Mean absolute SHAP value', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()