# input data

In [None]:
#Import statements
import random
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV #built in class to provide tuning
from sklearn.model_selection import RepeatedKFold
from numpy import arange
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
random.seed(1) #set a seed for reproducable result

In [None]:
def combine_embedding_with_table(embedding_name,modelling_variable):

    """
    Load the base table, append an embedding table column-wise and drop unused columns.

    Reads "./data/ModelData.csv" and "./data/<embedding_name>.csv", concatenates
    them side by side (axis=1) and removes the columns that are not used for
    modelling, including the target columns other than `modelling_variable`.

    If the embedding file does not exist, a message is printed and the base
    table is returned without embedding columns, but with the same column
    clean-up applied, so the result has a consistent layout either way.

    Args:
    - embedding_name (str): file stem of the embedding CSV under ./data/
      (e.g. 'glove', 'elmo', 'berttwitter', 'sentencebert', 'universal').
    - modelling_variable (str): the target to keep ('SWB', 'Positive_affect',
      'Negative_affect'). For any other value no target column is dropped.

    Returns:
    - DataFrame: the cleaned base table, with the embedding columns appended
      when the embedding file was found.

    Raises:
    - FileNotFoundError: if the base table "./data/ModelData.csv" is missing.
    - KeyError: if one of the columns to drop is not present.

    """

    # Columns that are always removed.
    always_dropped = ['created_at', 'full_text',
                      'Attentive','Alert','Determined','Inspired','Active',
                      'Hostile','Ashamed','Upset','Afraid','Nervous',
                      'OriginalText','clean_text','neu']
    targets = ['SWB', 'Positive_affect', 'Negative_affect']

    basetable = pd.read_csv("./data/ModelData.csv")

    # Only the embedding read is allowed to fail softly; any other error
    # (missing base table, missing columns) should surface to the caller.
    filename = f"./data/{embedding_name}.csv"
    try:
        emb_table = pd.read_csv(filename)
    except FileNotFoundError:
        emb_table = None
        print(f"File '{filename}' not found.")

    if emb_table is not None:
        # NOTE(review): concat aligns on the row index, so this assumes both
        # files hold the same rows in the same order - confirm.
        basetable = pd.concat([basetable, emb_table], axis=1)

    # Keep only the requested target; an unknown target leaves all of them in.
    if modelling_variable in targets:
        other_targets = [t for t in targets if t != modelling_variable]
    else:
        other_targets = []

    columns_to_delete = always_dropped + other_targets
    basetable = basetable.drop(columns=columns_to_delete)

    if emb_table is not None:
        print(f"Combined {embedding_name} with basetable.")
    print(f"Dropped columns: {columns_to_delete}")
    print("=" * 40)
    if emb_table is not None:
        print(f"This code is used to build a model of {modelling_variable} using {embedding_name}")
    else:
        print(f"Returning the basetable for {modelling_variable} without embedding columns.")

    return basetable


# NA

In [None]:
# Two inputs: the embedding name (file stem of the embedding CSV) and the
# dependent variable the model is built for.
basetable=combine_embedding_with_table('openai','Negative_affect')

In [None]:
# (rows, columns) of the combined table, as a quick sanity check.
basetable.shape


In [None]:
# Display the combined table for a visual check.
basetable

In [None]:
# Rebuild the calendar date of every row from its day offset relative to 2020-01-30.
covid_start = pd.to_datetime('2020-01-30', utc=True)
days_offset = pd.to_timedelta(basetable['days_since_COVID19'], unit='D')
# Keep only the date part (plain datetime.date objects, no time or timezone).
basetable['created_at'] = (covid_start + days_offset).dt.date


In [None]:
# Order rows chronologically; the later train/test split is positional,
# so this makes it a split in time.
basetable = basetable.sort_values(by='created_at')

In [None]:
# Display the table again to check the chronological ordering.
basetable

In [None]:
# Features: every column except the target, the two date columns and the
# 'neg' / 'pos' / 'compound' / 'TextBlob_score' columns.
non_feature_columns = ['Negative_affect', 'created_at', 'neg', 'pos',
                       'compound', 'TextBlob_score', 'days_since_COVID19']
X = basetable.drop(columns=non_feature_columns)
y = basetable['Negative_affect']

# Cast the flag columns to integers so every feature is numeric.
for flag_column in ("is_quote_status", "is_reply", "possibly_sensitive"):
    X[flag_column] = X[flag_column].astype(int)

In [None]:
# Inspect the target series.
y

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Positional hold-out: the first 70% of the rows train the model and the
# remaining 30% are used for testing.
split_ratio = 0.7
split_index = int(len(X) * split_ratio)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [None]:
# Inspect the training features.
X_train

In [None]:
#below the code is used without

In [None]:


model= svm.SVR()

# Hyper-parameter grid explored for the RBF-kernel SVR.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}


# Only these features are standardised; every other column is passed through unchanged.
scale_columns=['retweet_count','favorite_count',
                   'hour','url_count','mentions_count','questionmark_count',
                   'exclamationmark_count','points_count','uppercase_ratio',
                   'hashtag_count']

# Split each set into the columns to scale and the columns to leave as they are.
X_train_scale = X_train[scale_columns]
X_test_scale = X_test[scale_columns]

X_train_unscale = X_train.drop(columns=scale_columns)
X_test_unscale = X_test.drop(columns=scale_columns)

# Fit the scaler on the training rows only, then apply it to both sets.
scaler = StandardScaler().fit(X_train_scale)
X_train_transformed = pd.DataFrame(scaler.transform(X_train_scale), columns=scale_columns)
X_test_transformed = pd.DataFrame(scaler.transform(X_test_scale), columns=scale_columns)


# The scaled frames have a fresh 0..n-1 index; reset the unscaled ones too so
# that the column-wise concat below lines the rows up one to one.
X_train_unscale.reset_index(drop=True, inplace=True)
X_train_transformed.reset_index(drop=True, inplace=True)
X_test_unscale.reset_index(drop=True, inplace=True)
X_test_transformed.reset_index(drop=True, inplace=True)
# Concatenate the DataFrames together (scaled columns first)
X_train = pd.concat([X_train_transformed,X_train_unscale,], axis=1)
X_test = pd.concat([X_test_transformed,X_test_unscale], axis=1)


grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print('Config: %s' % grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full training set, so no second fit is needed.
final_model = best_model
y_pred = final_model.predict(X_test)
y_train_pred=final_model.predict(X_train)

# Clip both sets of predictions to the [0, 1] range.
# (The training predictions were previously compared with `==` instead of
# being assigned, so they were never clipped.)
y_pred = np.clip(y_pred, 0, 1)
y_train_pred = np.clip(y_train_pred, 0, 1)

model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
model_r2 = r2_score(y_test, y_pred)
print('r2', model_r2)
print('RMSE', model_rmse)




In [None]:
# Stack training predictions followed by test predictions, i.e. in the same
# row order as the positional train/test split.
predict_NA= np.concatenate((y_train_pred, y_pred), axis=0)

In [None]:
# predict_NA is a plain array, so it is assigned by position; this relies on
# basetable still being in the order it had when X and y were derived from it.
basetable['predict_NA']=predict_NA

In [None]:
# Convert the date objects back to pandas datetimes for the resampling and
# date comparison that follow.
basetable['created_at'] = pd.to_datetime(basetable['created_at'])

In [None]:
# Monthly means of 'Negative_affect', 'predict_NA' and 'neg'.
monthly = basetable.resample('M', on='created_at')
mean_NA = monthly['Negative_affect'].mean()
mean_predict_NA = monthly['predict_NA'].mean()
mean_vader_neg = monthly['neg'].mean()


In [None]:
# Baseline window: every row dated on or before 2020-02-29.
end_date = pd.to_datetime('2020-02-29')
in_baseline = basetable['created_at'] <= end_date
first_months_data = basetable[in_baseline]

# Baseline mean of each tracked column.
first_months_mean_NA = first_months_data['Negative_affect'].mean()
first_months_mean_predict_NA = first_months_data['predict_NA'].mean()
first_months_mean_vader_neg = first_months_data['neg'].mean()


In [None]:
# Express each monthly series relative to the absolute value of its baseline mean.
# NOTE: every series is overwritten with its scaled version, so running this
# cell a second time divides by the baseline again.
mean_NA=mean_NA/np.abs(first_months_mean_NA)
mean_predict_NA=mean_predict_NA/np.abs(first_months_mean_predict_NA)
mean_vader_neg=mean_vader_neg/np.abs(first_months_mean_vader_neg)

In [None]:
# Drop the first monthly bucket of each series.
# (presumably the partial first month of the data - confirm)
# .copy() makes each result an independent Series before it is written to.
mean_NA = mean_NA.iloc[1:].copy()
mean_predict_NA = mean_predict_NA.iloc[1:].copy()
mean_vader_neg = mean_vader_neg.iloc[1:].copy()

# Pin the new first month of every series to 1 so that all series start at
# the same reference level. The series are indexed by date, so the first
# element is addressed by position with .iloc rather than with the integer
# key [0], whose positional fallback is deprecated in pandas.
mean_NA.iloc[0] = 1
mean_predict_NA.iloc[0] = 1
mean_vader_neg.iloc[0] = 1

In [None]:
import matplotlib.pyplot as plt

# One figure with a single set of axes shared by all three series.
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the three monthly series on the same axes.
tracked_series = [
    (mean_NA, 'Negative affect'),
    (mean_predict_NA, 'Predicted negative affect'),
    (mean_vader_neg, 'VADER negative'),
]
for series, series_label in tracked_series:
    series.plot(ax=ax, label=series_label, linewidth=2.5, legend=True)

# Dashed vertical marker at January 2021.
ax.axvline(pd.to_datetime('2021-01-01'), color='black', linestyle='--')

# Captions on either side of the marker, positioned in figure coordinates.
for caption, x_fraction in (('Training period', 0.3), ('Deployment period', 0.7)):
    ax.annotate(caption, xy=(x_fraction, 0.38), fontsize=16, color='blue',
                xycoords='figure fraction')

ax.grid(True, linestyle='--', alpha=0.7)

# Axis labelling: no x label, larger tick labels, generic y label.
ax.set_xlabel('')
ax.tick_params(labelsize=16)
ax.set_ylabel('Values', fontsize=16)

# Rotate x-axis labels for readability.
plt.xticks(rotation=45)

fig.tight_layout()
ax.legend(fontsize='large')

fig.savefig('TrackNAScale.png')

plt.show()