# Embedding
  In this notebook, we use several methods to obtain text embeddings.

  Tip: because the models are run on a GPU in Google Colab (release 2023/08/18; see the release notes at https://colab.research.google.com/notebooks/relnotes.ipynb#scrollTo=IPBcIWWGZRTt for details), it is important to install the right versions of some core packages:



  flair==0.11

  sentence-transformers==2.2.2






  


## Install necessary packages

In [None]:
!pip install flair==0.11

## Input data

In [None]:
#input data
from google.colab import drive
import pandas as pd
pd.set_option('display.max_columns', None)
drive.mount('/content/drive')


In [None]:
data = pd.read_csv('/content/drive/MyDrive/codeAndData/data/ModelData.csv')


In [None]:
!pip install contractions==0.1.73
!pip install emoji==2.8.0
!pip install nltk==3.8.1

In [None]:
#only necessary  preprocess for  advanced embedding methods
#for glove, we use extensive preprocessing as used in sentiment analysis

import re
import emoji
import contractions
import string

#remove usernames and links from text
def remove_usernames_links(tweet):
    """Strip the retweet marker, @mentions and links from a tweet.

    Args:
        tweet: raw tweet text (str).

    Returns:
        The tweet without a leading "RT" marker, without any token starting
        with "@" and without any token starting with "http"; every run of
        whitespace is collapsed to a single space. Leading/trailing spaces
        are NOT stripped here.
    """
    # \b keeps words that merely start with "RT" (e.g. "RTs") intact
    tweet = re.sub(r'^RT\b', '', tweet)
    # raw strings: '\s' inside a normal string literal is an invalid escape sequence
    tweet = re.sub(r'@\S+', '', tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)  # remove multiple spaces
    return tweet
data['embedding_text'] = data['OriginalText'].apply(remove_usernames_links)


# drop the '#' sign of hashtags but keep the hashtag word itself
def remove_hash(tweet):
    """Return `tweet` without any '#' character and with whitespace runs collapsed.

    Only the '#' symbol is deleted, so "#word" becomes "word". Leading and
    trailing whitespace is reduced to a single space, not stripped.
    """
    without_hash = tweet.replace('#', '')
    return re.sub(r'\s+', ' ', without_hash)
data['embedding_text'] = data['embedding_text'].apply(remove_hash)



def text_cleaning(text):
    """Replace every emoji in `text` by its textual name.

    Args:
        text: tweet text (str).

    Returns:
        The text with each emoji replaced by its name as produced by
        `emoji.demojize`, with runs of whitespace collapsed to one space.
    """
    # Pad each emoji name with spaces: with empty delimiters the name is glued
    # to the neighbouring word/emoji, producing a token that a whitespace
    # tokenizer cannot match. The extra spaces are collapsed right below.
    tweet = emoji.demojize(text, delimiters=(" ", " "))
    # remove multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

# Apply the emoji clean-up to every tweet, then lowercase the text and
# trim leading/trailing whitespace.
data['embedding_text'] = (
    data['embedding_text']
    .apply(text_cleaning)
    .str.lower()
    .str.strip()
)

In [None]:
data

In [None]:
import sys
print(sys.version)


## ELMO by tensorflow hub

Large dimension 1024

In [None]:
#extracting the right text #
Elmo_text = data['embedding_text'].tolist()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Load the ELMo module
#elmo = hub.load("https://tfhub.dev/google/elmo/3")


In [None]:
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


# Fetch the ELMo (v3) module from TensorFlow Hub and grab its "default" signature
elmo = hub.load("https://tfhub.dev/google/elmo/3")
elmo_signature = elmo.signatures["default"]

# One pooled embedding tensor per input text, in the order of Elmo_text
sentence_embeddings = []

for sentence in tqdm(Elmo_text):
    # The signature is called with a batch holding a single raw string;
    # its "elmo" output is assumed to hold one vector per token — TODO confirm
    token_vectors = elmo_signature(tf.constant([sentence]))["elmo"]

    # Average over axis 1 to obtain one vector for the sentence
    sentence_embeddings.append(tf.reduce_mean(token_vectors, axis=1))



In [None]:
# Stack the per-sentence ELMo tensors into one float32 NumPy array
elmo1024_embeddings = np.asarray(sentence_embeddings, dtype=np.float32)

# Flatten everything after the first axis so that each sentence becomes one DataFrame row
n_sentences = elmo1024_embeddings.shape[0]
df_elmo = pd.DataFrame(elmo1024_embeddings.reshape(n_sentences, -1))


In [None]:
df_elmo

In [None]:
df_elmo

In [None]:
path = '/content/drive/MyDrive/codeAndData/data/1024elmo.csv'
# Let pandas open the file itself: a text handle passed to to_csv has to be
# opened with newline='', which the previous open() call did not do.
df_elmo.to_csv(path, index=False, encoding='utf-8-sig')

## twitter bert 768





In [None]:
import pandas as pd

import torch

from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [None]:
#extracting the right text #
twitterbert_text = data['embedding_text'].tolist()


In [None]:
# In this part, we use a Hugging Face model together with flair to obtain the embeddings:
# easy and fast

from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
# init embedding
# check more info about model https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
embedding = TransformerDocumentEmbeddings('cardiffnlp/twitter-roberta-base-sentiment')


In [None]:
# Embed every tweet with the flair document embedding and stack the results
# into one tensor `s` with one row per tweet.
import torch
from tqdm import tqdm

# Collect the rows in a list and concatenate once at the end: calling
# torch.cat inside the loop re-copies all previous rows on every iteration.
tweet_vectors = []

# Iterate through the list of tweets
for tweet in tqdm(twitterbert_text):
    sentence = Sentence(tweet)
    embedding.embed(sentence)
    # Detach and move to the CPU so the stored rows do not hold on to GPU
    # memory; reshape to a single row of shape (1, embedding_length)
    sentence_embedding = sentence.embedding.detach().cpu().reshape(1, -1)
    tweet_vectors.append(sentence_embedding)

# Same fallback as the initial value used before: an empty tensor when there are no tweets
s = torch.cat(tweet_vectors, 0) if tweet_vectors else torch.zeros(0)

In [None]:
# Bring the stacked embeddings back to the CPU as a NumPy array and wrap
# them in a DataFrame (one row per tweet)
twitter_matrix = s.cpu().detach().numpy()
bert_twitter_embeddings = pd.DataFrame(twitter_matrix)

In [None]:
bert_twitter_embeddings

In [None]:
path = '/content/drive/MyDrive/codeAndData/data/berttwitter.csv'
# Let pandas open the file itself: a text handle passed to to_csv has to be
# opened with newline='', which the previous open() call did not do.
bert_twitter_embeddings.to_csv(path, index=False, encoding='utf-8-sig')

## openai Embedding


In [None]:
!pip install tiktoken==0.5.1
!pip install openai==0.28.1

In [None]:
openai_txt = data['embedding_text'].tolist()

In [None]:
len(openai_txt)

In [None]:
import getpass
import os

import openai
import tiktoken

# Never store the key in the notebook: read it from the OPENAI_API_KEY
# environment variable and, if it is not set, ask for it interactively
# (getpass does not echo the value into the cell output).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")


In [None]:
# Calculate the OpenAI embeddings batch by batch
import time
embedding_encoding = "cl100k_base"

EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
BATCH_SIZE = 200  # you can submit up to 2048 embedding inputs per request

embeddings = []
n_texts = len(openai_txt)
for batch_start in range(0, n_texts, BATCH_SIZE):
    # Clamp the end so the logged range is correct for the last, shorter batch
    batch_end = min(batch_start + BATCH_SIZE, n_texts)
    batch = openai_txt[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    # Put the returned items back into input order via their "index" field
    # instead of only asserting that they already are in that order
    ordered = sorted(response["data"], key=lambda item: item["index"])
    if len(ordered) != len(batch):
        raise RuntimeError(
            f"Expected {len(batch)} embeddings, received {len(ordered)}"
        )
    embeddings.extend(item["embedding"] for item in ordered)
    # Pause between requests (presumably to stay below the API rate limit);
    # nothing to wait for after the final batch
    if batch_end < n_texts:
        time.sleep(5)

df = pd.DataFrame({"text": openai_txt, "embedding": embeddings})

In [None]:
df

In [None]:
openai_embeddings = pd.DataFrame(list(df.embedding.values))


In [None]:
openai_embeddings

In [None]:
path = '/content/drive/MyDrive/codeAndData/data/openai.csv'
# Let pandas open the file itself: a text handle passed to to_csv has to be
# opened with newline='', which the previous open() call did not do.
openai_embeddings.to_csv(path, index=False, encoding='utf-8-sig')

## Glove

In [None]:
# Texts to embed with GloVe, as a plain Python list of tweets.
# NOTE(review): the preprocessing cell says GloVe should use the extensively
# preprocessed text, but this reads the lightly cleaned 'embedding_text'
# column — confirm which column is intended.
glove_text = data['embedding_text'].tolist()

In [None]:
import torch
import torchtext
#need many (30+) minutes to download the model
glove = torchtext.vocab.GloVe(name="twitter.27B", dim=200)

In [None]:
glove.dim

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def glove_sentence_embeddings(sentences, glove, device):
    """Average the GloVe vectors of the known words of each sentence.

    Args:
        sentences: iterable of strings; each one is tokenized with `str.split()`.
        glove: pretrained vectors object exposing `dim`, `stoi` (word -> row
            index) and `vectors` (tensor with one row per word).
        device: torch device on which the averaging is carried out.

    Returns:
        A list with one NumPy array of length `glove.dim` per sentence. A
        sentence without any in-vocabulary word yields an all-zero vector.
    """
    embeddings = []

    for sentence in sentences:
        # Row indices of the words that exist in the GloVe vocabulary
        token_ids = [glove.stoi[word] for word in sentence.split() if word in glove.stoi]

        if token_ids:
            # Gather all word vectors at once and move them in a single
            # transfer instead of copying every word vector separately
            word_vectors = glove.vectors[token_ids].to(device)
            sentence_embed = word_vectors.mean(dim=0)
        else:
            sentence_embed = torch.zeros(glove.dim, device=device)

        embeddings.append(sentence_embed.cpu().detach().numpy())

    return embeddings


In [None]:
glove200_embeddings = glove_sentence_embeddings(glove_text, glove, device)

In [None]:
glove200_embeddings=pd.DataFrame(glove200_embeddings)

In [None]:
glove200_embeddings

In [None]:
path = '/content/drive/MyDrive/codeAndData/data/glove200.csv'
# Let pandas open the file itself: a text handle passed to to_csv has to be
# opened with newline='', which the previous open() call did not do.
glove200_embeddings.to_csv(path, index=False, encoding='utf-8-sig')

## TFIDF



In [None]:
# TF-IDF works on the extensively cleaned tweets, as a plain Python list.
# NOTE(review): 'clean_text' is not created in this notebook; presumably it
# is already a column of ModelData.csv — confirm.
tfidf_text = data['clean_text'].tolist()

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_text)


In [None]:
tfidf_matrix.shape

In [None]:
# Maximum profile likelihood: automatic selection of the elbow of a scree plot
def get_elbow(d, plot=True):
  '''
  Pick the number of dimensions that maximizes the profile log-likelihood.

  For every split position the values are divided into a leading and a
  trailing group; each group is modelled as Gaussian with its own mean and a
  variance shared by both groups. The split with the largest log-likelihood wins.

  d     vector of explained variances (array or list); it is expected to be
        sorted in decreasing order and is not re-sorted here
  plot  when True (default) the profile log-likelihood curve is plotted

  returns q, the (1-based) dimension with the largest log-likelihood

  raises ValueError if d has fewer than 3 values (the pooled variance is
  undefined in that case)
  '''
  from scipy.stats import norm

  d = np.asarray(d, dtype=float)  # also accept plain Python lists
  p = len(d)
  if p < 3:
    raise ValueError("get_elbow needs at least 3 values, got %d" % p)
  lq = np.zeros(p)  # profile log-likelihood of every candidate dimension

  for q in range(p):
    head, tail = d[:q+1], d[q+1:]
    mu1 = np.mean(head)
    ss = np.sum((head - mu1)**2)
    if tail.size:
      mu2 = np.mean(tail)
      ss += np.sum((tail - mu2)**2)
    # Two means are estimated while the tail is non-empty, only one for the
    # last candidate (q is 0-based, so the tail is empty when q == p - 1)
    sigma2 = ss / (p - 1 - int(q < p - 1))
    scale = np.sqrt(sigma2)
    lq[q] = np.sum(norm.logpdf(head, loc=mu1, scale=scale))
    if tail.size:
      lq[q] += np.sum(norm.logpdf(tail, loc=mu2, scale=scale))

  q = np.argmax(lq)

  if plot:
    # imported lazily so the function can be used without a plotting backend
    import matplotlib.pyplot as plt

    #plot the loglikelihood
    plt.plot(range(1, p+1), lq)
    plt.xlabel('Number of dimension')
    plt.ylabel('Log_liklihood')
    plt.title('Profile Log-Likelihood')
    plt.show()

  return q+1

In [None]:
# We first fit with a deliberately large number of components and decide the optimal number afterwards
k = 2000
# Fix random_state: the default solver of TruncatedSVD is randomized, so
# without a seed the components differ from run to run
lsa = TruncatedSVD(n_components=k, random_state=42)
tfidf_embeddings = lsa.fit_transform(tfidf_matrix)

In [None]:
explained_variance = lsa.explained_variance_ratio_

In [None]:
get_elbow(explained_variance)

In [None]:
# Refit the LSA with the reduced number of components.
# NOTE(review): 173 is presumably the value returned by get_elbow for the
# 2000-component fit — confirm after re-running.
k = 173
# Fix random_state: the default solver of TruncatedSVD is randomized, so
# without a seed the components differ from run to run
lsa = TruncatedSVD(n_components=k, random_state=42)
tfidf_embeddings = lsa.fit_transform(tfidf_matrix)

In [None]:
tfidf_embeddings=pd.DataFrame(tfidf_embeddings)

In [None]:
tfidf_embeddings

In [None]:
path = '/content/drive/MyDrive/codeAndData/data/tfidf.csv'
# Let pandas open the file itself: a text handle passed to to_csv has to be
# opened with newline='', which the previous open() call did not do.
tfidf_embeddings.to_csv(path, index=False, encoding='utf-8-sig')