In [None]:
#Import statements
import random
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV #built in class to provide tuning
from sklearn.model_selection import RepeatedKFold

In [None]:
!pip install shap==0.44.1

In [None]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read.
# pandas is already imported in the first cell, so it is not re-imported here.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

def combine_embedding_with_table(embedding_name, modelling_variable,
                                 data_dir="/content/drive/MyDrive/codeAndData/data"):

    """
    Combine an embedding table with the base table and drop the columns not used for modelling.

    The base table is read from ``{data_dir}/ModelData.csv`` and the embedding from
    ``{data_dir}/{embedding_name}.csv``. The two tables are joined column-wise on their
    default row index, so both files must list the samples in the same order.

    Args:
    - embedding_name (str): file stem of the embedding CSV inside ``data_dir``
      (e.g. 'tfidf', 'glove200', '1024elmo', 'berttwitter', 'openai').
    - modelling_variable (str): the target to keep ('SWB', 'Positive_affect' or
      'Negative_affect'); the other two targets are dropped. Any other value
      leaves all three target columns in place.
    - data_dir (str): directory holding ``ModelData.csv`` and the embedding CSVs.
      Defaults to the project's Google Drive data folder.

    Returns:
    - DataFrame: the combined, column-filtered table. If the embedding file does not
      exist, a message is printed and the unmodified base table is returned instead.

    Raises:
    - FileNotFoundError: if ``ModelData.csv`` itself is missing.
    - KeyError: if any of the columns to drop is absent from the combined table.
    """

    basetable = pd.read_csv(f"{data_dir}/ModelData.csv")

    # Construct file path of embedding
    filename = f"{data_dir}/{embedding_name}.csv"

    # Only the embedding read is guarded: it is the one step that falls back to
    # returning the raw base table when the file is missing.
    try:
        emb_table = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return basetable

    # Concatenate the basetable and the embedding side by side
    basetable = pd.concat([basetable, emb_table], axis=1)

    # Columns that are never used as model inputs
    base_columns_to_delete = ['created_at', 'full_text',
                              'Attentive', 'Alert', 'Determined', 'Inspired', 'Active',
                              'Hostile', 'Ashamed', 'Upset', 'Afraid', 'Nervous',
                              'OriginalText', 'clean_text',
                              'neg', 'neu', 'pos', 'compound', 'TextBlob_score',
                              'days_since_COVID19']

    # Keep only the requested target; drop the other two so they cannot act as features
    target_columns = ['SWB', 'Positive_affect', 'Negative_affect']
    if modelling_variable in target_columns:
        target_columns_to_delete = [col for col in target_columns if col != modelling_variable]
    else:
        target_columns_to_delete = []

    columns_to_delete = base_columns_to_delete + target_columns_to_delete
    basetable = basetable.drop(columns=columns_to_delete)

    print(f"Combined {embedding_name} with basetable.")
    print(f"Dropped columns: {columns_to_delete}")
    print("=" * 40)
    print(f"This code is used to build a model of {modelling_variable} using {embedding_name}")

    return basetable


In [None]:
# Choose the embedding to combine with the base table.
# Embedding names used in this project: 'tfidf', 'glove200', '1024elmo', 'berttwitter', 'openai'
embedding_name = 'openai'
# Pass the variable (not a second hard-coded literal) so changing it above takes effect.
# The target stays 'SWB' because the next cell splits X / y on the 'SWB' column.
basetable = combine_embedding_with_table(embedding_name, 'SWB')

In [None]:
# Separate the SWB target from the predictors
y = basetable['SWB']
X = basetable.drop(['SWB'], axis=1)

# Convert the True/False flag columns to 0/1 integers
for flag_column in ("is_quote_status", "is_reply", "possibly_sensitive"):
    X[flag_column] = X[flag_column].astype(int)

In [None]:
# Preview only the first rows: display.max_columns is None, so every column is rendered
X.head()

In [None]:
# Empty frame holding the two feature columns that are collected from the test folds below
tracked_feature_columns = ['url_count', 'points_count']
features_df = pd.DataFrame(columns=tracked_feature_columns)

In [None]:
# Display the accumulator frame (it has no rows yet when the cells are run in order)
features_df

In [None]:


# Repeated 2-fold cross-validation, 5 repeats, fixed seed -> 10 train/test splits
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=1)

# Columns that are standardised within each fold (loop-invariant, so defined once)
scale_columns = ['retweet_count', 'favorite_count',
                 'hour', 'url_count', 'mentions_count', 'questionmark_count',
                 'exclamationmark_count', 'points_count', 'uppercase_ratio',
                 'hashtag_count']

# Collect the per-fold test features in a list and concatenate once at the end.
# Appending to features_df inside the loop is quadratic, starts from an empty
# object-dtype frame (deprecated concat behaviour in recent pandas), and
# duplicates rows when the cell is re-run.
fold_feature_frames = []

for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training fold only, then apply it to both folds
    scaler = StandardScaler().fit(X_train[scale_columns])
    X_train_transformed = pd.DataFrame(scaler.transform(X_train[scale_columns]), columns=scale_columns)
    X_test_transformed = pd.DataFrame(scaler.transform(X_test[scale_columns]), columns=scale_columns)

    # The transformed frames carry a fresh 0..n-1 index, so the untouched columns
    # must be re-indexed the same way before the column-wise concat
    X_train_unscale = X_train.drop(columns=scale_columns).reset_index(drop=True)
    X_test_unscale = X_test.drop(columns=scale_columns).reset_index(drop=True)

    # Scaled columns first, then the unscaled ones
    X_train = pd.concat([X_train_transformed, X_train_unscale], axis=1)
    X_test = pd.concat([X_test_transformed, X_test_unscale], axis=1)

    # Keep the scaled url_count and points_count values of this test fold
    fold_feature_frames.append(X_test[['url_count', 'points_count']])

# Stack the test folds in split order
features_df = pd.concat(fold_feature_frames, axis=0)

In [None]:
# Replace the repeated per-fold row labels with one continuous 0..n-1 index
features_df = features_df.reset_index(drop=True)


In [None]:
# Inspect the stacked test-fold values of url_count and points_count
features_df

In [None]:

# One labelled column per outcome; every column holds the same feature values,
# because the feature itself does not depend on which outcome is modelled.
url_columns = ['Total SWB url_count', 'Positive affect url_count', 'Negative affect url_count']
points_columns = ['Total SWB points_count', 'Positive affect points_count', 'Negative affect points_count']

url_features_df = pd.DataFrame({label: features_df['url_count'] for label in url_columns})
points_features_df = pd.DataFrame({label: features_df['points_count'] for label in points_columns})

In [None]:
# Inspect the three identical points_count columns built above
points_features_df

In [None]:
# Load the saved SHAP value tables, one CSV per outcome (SWB, PA, NA)
shap_results_dir = "/content/drive/MyDrive/codeAndData/data/Test_results"
all_SWB_shap_values = pd.read_csv(f"{shap_results_dir}/all_SWB_shap_values.csv")
all_PA_shap_values = pd.read_csv(f"{shap_results_dir}/all_PA_shap_values.csv")
all_NA_shap_values = pd.read_csv(f"{shap_results_dir}/all_NA_shap_values.csv")

In [None]:
# Preview only the first rows: display.max_columns is None, so every column is rendered
all_SWB_shap_values.head()

In [None]:
# NOTE(review): this cell repeats the previous one exactly — it can be deleted.
all_SWB_shap_values


In [None]:
# Put the url_count SHAP column of each outcome side by side, one labelled column per outcome.
# The labels are shown as feature names in the summary plot; they now use single spaces,
# matching the column names of url_features_df.
url_count_SWB = all_SWB_shap_values["url_count"]
url_count_PA = all_PA_shap_values["url_count"]
url_count_NA = all_NA_shap_values["url_count"]
url_counts_shap_df = pd.concat(
    [url_count_SWB, url_count_PA, url_count_NA],
    axis=1,
    keys=['Total SWB url_count', 'Positive affect url_count', 'Negative affect url_count'],
)



In [None]:
# Inspect the url_count SHAP values of the three outcomes
url_counts_shap_df

In [None]:
# Put the points_count SHAP column of each outcome side by side, one labelled column per outcome.
# The labels are shown as feature names in the summary plot; they now use single spaces,
# matching the column names of points_features_df.
points_count_SWB = all_SWB_shap_values["points_count"]
points_count_PA = all_PA_shap_values["points_count"]
points_count_NA = all_NA_shap_values["points_count"]
points_count_shap_df = pd.concat(
    [points_count_SWB, points_count_PA, points_count_NA],
    axis=1,
    keys=['Total SWB points_count', 'Positive affect points_count', 'Negative affect points_count'],
)


In [None]:
# Inspect the points_count SHAP values of the three outcomes
points_count_shap_df

In [None]:

# SHAP summary plot for url_count: one row per outcome, using the matching feature values
url_shap_labels = list(url_counts_shap_df.columns)
shap.summary_plot(
    url_counts_shap_df.values,
    features=url_features_df,
    feature_names=url_shap_labels,
)

In [None]:
# SHAP summary plot for points_count: one row per outcome, using the matching feature values
points_shap_labels = list(points_count_shap_df.columns)
shap.summary_plot(
    points_count_shap_df.values,
    features=points_features_df,
    feature_names=points_shap_labels,
)

In [None]:
# NOTE(review): this cell repeats the previous points_count plot exactly — it can be deleted.
shap.summary_plot(points_count_shap_df.values, features=points_features_df, feature_names=list(points_count_shap_df.columns))