In [12]:
import pandas as pd
import numpy as np
import emoji
import warnings
import nltk
from nltk.corpus import stopwords    
import string

In [2]:
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Load the comment dataset. Later cells read its 'language', 'text only'
# and 'emoji list' columns.
df = pd.read_csv("Data/cleaned_by_language.csv")

In [4]:
# Split the dataset into one frame per language code.
en_df = df.loc[df['language'].eq('en')]
es_df = df.loc[df['language'].eq('es')]

In [5]:
# Pull the text column of each language subset out as a plain Python list.
en_comment = en_df['text only'].tolist()
es_comment = es_df['text only'].tolist()

In [6]:
# Clean the English comments: drop tokens starting with '@', strip ASCII
# punctuation, then remove English stop words. The result has exactly one
# entry per item of en_comment, in the same order.
nltk.download('punkt')
nltk.download('stopwords')
en_stop_words = set(stopwords.words('english'))
en_filtered_comments = []

# Build the punctuation-stripping table once instead of on every iteration.
en_punctuation_table = str.maketrans('', '', string.punctuation)

for comment in en_comment:
    # Non-string cells (presumably NaN where the CSV field was empty -- TODO
    # confirm) are treated as empty text so the output stays aligned with
    # en_comment instead of crashing on .split().
    if not isinstance(comment, str):
        comment = ''

    # remove tokens that start with @
    comment = " ".join(token for token in comment.split() if not token.startswith('@'))

    # remove punctuation
    comment = comment.translate(en_punctuation_table)

    # remove stopwords
    words = nltk.word_tokenize(comment)
    filtered_comment = ' '.join(word for word in words if word.lower() not in en_stop_words)
    # The stop-word-filtered text is what gets stored; it was previously
    # overwritten with the unfiltered comment right before the append.
    en_filtered_comments.append(filtered_comment)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuhsinhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuhsinhuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Clean the Spanish comments: drop tokens starting with '@', strip
# punctuation (including the inverted marks), then remove Spanish stop words.
# The result has exactly one entry per item of es_comment, in the same order.
es_stop_words = set(stopwords.words('spanish'))
es_filtered_comments = []

# string.punctuation is ASCII-only, so add the Spanish inverted marks.
# Built once instead of on every iteration.
es_punctuation_table = str.maketrans('', '', string.punctuation + '¡¿')

for comment in es_comment:
    # Non-string cells (presumably NaN where the CSV field was empty -- TODO
    # confirm) are treated as empty text so the output stays aligned with
    # es_comment instead of crashing on .split().
    if not isinstance(comment, str):
        comment = ''

    # remove tokens that start with @
    comment = " ".join(token for token in comment.split() if not token.startswith('@'))

    # remove punctuation
    comment = comment.translate(es_punctuation_table)

    # remove stopwords (tokenize with the Spanish model rather than the
    # English default)
    words = nltk.word_tokenize(comment, language='spanish')
    filtered_comment = ' '.join(word for word in words if word.lower() not in es_stop_words)
    # The stop-word-filtered text is what gets stored; it was previously
    # overwritten with the unfiltered comment right before the append.
    es_filtered_comments.append(filtered_comment)

## Text Embeddings using LASER - this pretrained model supports cross-lingual tasks and produces sentence-level embeddings

In [8]:
pip install laserembeddings

Note: you may need to restart the kernel to use updated packages.


In [15]:
# run this in terminal first: python -m laserembeddings download-models
from laserembeddings import Laser

laser = Laser()

# Encode each language's cleaned comments with the same model, passing the
# matching language code; English is embedded first, then Spanish.
en_text_array, es_text_array = (
    laser.embed_sentences(sentences, lang=lang_code)
    for sentences, lang_code in (
        (en_filtered_comments, 'en'),
        (es_filtered_comments, 'es'),
    )
)

## Emoji Embeddings using Emoji2Vec

In [30]:
from gensim.models import KeyedVectors

en_emoji = en_df['emoji list'].to_list()

# Load pretrained emoji embeddings
emoji_model = KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)

# One entry per English comment, in the same order as en_emoji.
en_emoji_embedding = []

for emoji_list in en_emoji:
    # NOTE(review): the column comes from a CSV, so each cell is presumably a
    # string and is iterated character by character -- confirm the format.
    # Cells that are not iterable text/sequences (e.g. NaN) count as "no emoji".
    if not isinstance(emoji_list, (str, list, tuple)):
        emoji_list = ''

    # Keep only items the pretrained model knows. The comprehension variable
    # stays local, so the imported `emoji` module is no longer overwritten by
    # a loop variable of the same name.
    emoji_list_embedding = [
        emoji_model[emoji_char] for emoji_char in emoji_list if emoji_char in emoji_model
    ]
    if len(emoji_list_embedding) != 0:
        # Flatten all vectors of this comment into one long 1-D vector.
        emoji_list_embedding = np.concatenate(emoji_list_embedding, axis=0)
    en_emoji_embedding.append(emoji_list_embedding)


# Right-pad every vector with zeros to the longest one in the English set.
# The width is computed from English data only, so it can differ from the
# width of the Spanish emoji array.
en_max_size = max(len(arr) for arr in en_emoji_embedding)
en_padded_arrays = [np.pad(arr, (0, en_max_size - len(arr)), 'constant') for arr in en_emoji_embedding]
en_emoji_array = np.vstack(en_padded_arrays)

In [31]:
es_emoji = es_df['emoji list'].to_list()

# One entry per Spanish comment, in the same order as es_emoji.
es_emoji_embedding = []

for emoji_list in es_emoji:
    # NOTE(review): the column comes from a CSV, so each cell is presumably a
    # string and is iterated character by character -- confirm the format.
    # Cells that are not iterable text/sequences (e.g. NaN) count as "no emoji".
    if not isinstance(emoji_list, (str, list, tuple)):
        emoji_list = ''

    # Keep only items the pretrained model knows. The comprehension variable
    # stays local, so the imported `emoji` module is no longer overwritten by
    # a loop variable of the same name.
    emoji_list_embedding = [
        emoji_model[emoji_char] for emoji_char in emoji_list if emoji_char in emoji_model
    ]
    if len(emoji_list_embedding) != 0:
        # Flatten all vectors of this comment into one long 1-D vector.
        emoji_list_embedding = np.concatenate(emoji_list_embedding, axis=0)

    es_emoji_embedding.append(emoji_list_embedding)

# Right-pad every vector with zeros to the longest one in the Spanish set.
# The width is computed from Spanish data only, so it can differ from the
# width of the English emoji array.
es_max_size = max(len(arr) for arr in es_emoji_embedding)
es_padded_arrays = [np.pad(arr, (0, es_max_size - len(arr)), 'constant') for arr in es_emoji_embedding]
es_emoji_array = np.vstack(es_padded_arrays)

## Concatenate the text embeddings and emoji embeddings (if there is more than one emoji, we concatenate all of them)

In [38]:
# Place the text features and the emoji features side by side, text columns
# first, keeping one row per comment.
en_embeddings = np.hstack((en_text_array, en_emoji_array))
es_embeddings = np.hstack((es_text_array, es_emoji_array))

### The zero-padded arrays may affect the clustering result. What else can we do to normalize embeddings of varying size?

In [39]:
# Number of feature columns in each English embedding row.
en_embeddings.shape[1]

4624

In [40]:
# Number of feature columns in each Spanish embedding row.
es_embeddings.shape[1]

9424