In [1]:
import pandas as pd
import numpy as np
import emoji
import warnings
import nltk
from nltk.corpus import stopwords    
import string

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("Data/cleaned_by_language.csv")

In [4]:
en_df = df[df['language'] == 'en']
es_df = df[df['language'] == 'es']

In [5]:
en_comment = en_df['text only'].to_list()
es_comment = es_df['text only'].to_list()

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
en_stop_words = set(stopwords.words('english'))
en_filtered_comments = []

for comment in en_comment:
    # remove word starts with @
    comment = " ".join(filter(lambda x:x[0]!='@', comment.split()))
    
    # remove punctuations
    comment = comment.translate(str.maketrans('', '', string.punctuation))

    # remove stopwords
    words = nltk.word_tokenize(comment)
    filtered_comment = [word for word in words if word.lower() not in en_stop_words]
    filtered_comment = ' '.join(filtered_comment)
    filtered_comment = comment
    en_filtered_comments.append(filtered_comment)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aorawancraprayoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aorawancraprayoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
es_stop_words = set(stopwords.words('spanish'))
es_filtered_comments = []

for comment in es_comment:
    # remove word starts with @
    comment = " ".join(filter(lambda x:x[0]!='@', comment.split()))
    
    # remove punctuations
    comment = comment.translate(str.maketrans('', '', string.punctuation))

    # remove stopwords
    words = nltk.word_tokenize(comment)
    filtered_comment = [word for word in words if word.lower() not in es_stop_words]
    filtered_comment = ' '.join(filtered_comment)
    filtered_comment = comment
    es_filtered_comments.append(filtered_comment)

## Text Embeddings using LASER - this pretrained model supports cross-lingual tasks and embeds in setence-level

In [28]:
pip install laserembeddings

Defaulting to user installation because normal site-packages is not writeable
Collecting laserembeddings
  Using cached laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Installing collected packages: laserembeddings
Successfully installed laserembeddings-1.1.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import sys

In [9]:
!echo $PATH

/usr/bin:/Users/aorawancraprayoon/Library/Python/3.9/bin:/Users/aorawancraprayoon/anaconda3/bin:/Users/aorawancraprayoon/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/opt/X11/bin:/Library/TeX/texbin:/Applications/Postgres.app/Contents/Versions/latest/bin:/Users/aorawancraprayoon/anaconda3/bin:/Users/aorawancraprayoon/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/opt/X11/bin:/Li

In [10]:
print(sys.path)

['/Users/aorawancraprayoon/Desktop/EmojiResearch-WorldCup2022', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python39.zip', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/lib-dynload', '', '/Users/aorawancraprayoon/Library/Python/3.9/lib/python/site-packages', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages', '/Library/Python/3.9/site-packages']


In [21]:
!/usr/bin/python3 -m laserembeddings download-models

Downloading models into /Users/aorawancraprayoon/Library/Python/3.9/lib/python/site-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [22]:
# run this in terminal first: python -m laserembeddings download-models
from laserembeddings import Laser

laser = Laser()
en_text_array = laser.embed_sentences(en_filtered_comments, lang='en')
es_text_array = laser.embed_sentences(es_filtered_comments, lang='es')

In [24]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.2-cp39-cp39-macosx_11_0_arm64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 28.4 MB/s eta 0:00:01
[?25hCollecting scipy>=1.7.0
  Downloading scipy-1.11.3-cp39-cp39-macosx_12_0_arm64.whl (29.7 MB)
[K     |████████████████████████████████| 29.7 MB 23.6 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 15.4 MB/s eta 0:00:01
Installing collected packages: smart-open, scipy, gensim
Successfully installed gensim-4.3.2 scipy-1.11.3 smart-open-6.4.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Emoji Embeddings using Emoji2Vec

In [25]:
from gensim.models import KeyedVectors

en_emoji = en_df['emoji list'].to_list()

# Load pretrained emoji embeddings
emoji_model = KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)

# Initialize a list to store emoji embeddings
en_emoji_embedding = []

for emoji_list in en_emoji:
    emoji_list_embedding = []  # Initialize a list for embeddings of each emoji list
    for emoji in emoji_list:
        try:
            emoji_list_embedding.append(emoji_model[emoji])
        except KeyError:
            pass
    if len(emoji_list_embedding) != 0:
        emoji_list_embedding = np.concatenate(emoji_list_embedding, axis=0)
    en_emoji_embedding.append(emoji_list_embedding)


en_max_size = max(len(arr) for arr in en_emoji_embedding)
en_padded_arrays = [np.pad(arr, (0, en_max_size - len(arr)), 'constant') for arr in en_emoji_embedding]
en_emoji_array = np.vstack(en_padded_arrays)

In [71]:
en_emoji_array.shape

(280, 3600)

In [72]:
en_text_array.shape

(280, 1024)

In [26]:
es_emoji = es_df['emoji list'].to_list()

es_emoji_embedding = []

for emoji_list in es_emoji:
    emoji_list_embedding = []  # Initialize a list for embeddings of each emoji list
    for emoji in emoji_list:
        try:
            emoji_list_embedding.append(emoji_model[emoji])
        except KeyError:
            pass
    if len(emoji_list_embedding) != 0:
        emoji_list_embedding = np.concatenate(emoji_list_embedding, axis=0)

    es_emoji_embedding.append(emoji_list_embedding)

es_max_size = max(len(arr) for arr in es_emoji_embedding)
es_padded_arrays = [np.pad(arr, (0, es_max_size - len(arr)), 'constant') for arr in es_emoji_embedding]
es_emoji_array = np.vstack(es_padded_arrays)

## Concatenate the text embeddings and emoji embeddings (if there are more than one emoji, we concatenate all of them)

In [27]:
import numpy as np

# Concatenate along columns (horizontally)
en_embeddings = np.concatenate((en_text_array, en_emoji_array), axis=1)
es_embeddings = np.concatenate((es_text_array, es_emoji_array), axis=1)

### Padded_arrays may affect clustering result. What else can we do to normalize data with varying size of embeddings?

In [28]:
len(en_embeddings[0])

4624

In [29]:
len(es_embeddings[0])

9424

## Try clustering concatenated word + emoji embeddings

In [49]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [50]:
def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [55]:
clustering, cluster_labels = mbkmeans_clusters(
	X=en_embeddings,
    k=7,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": en_df['text'],
    "tokens": [" ".join(text) for text in en_filtered_comments],
    "cluster": cluster_labels
})

For n_clusters = 7
Silhouette coefficient: 0.07
Inertia:363.6427202724333
Silhouette values:
    Cluster 1: Size:14 | Avg:0.33 | Min:0.20 | Max: 0.42
    Cluster 4: Size:61 | Avg:0.30 | Min:0.10 | Max: 0.40
    Cluster 5: Size:40 | Avg:0.26 | Min:0.08 | Max: 0.36
    Cluster 3: Size:16 | Avg:0.23 | Min:-0.00 | Max: 0.37
    Cluster 6: Size:11 | Avg:0.11 | Min:-0.10 | Max: 0.20
    Cluster 2: Size:31 | Avg:-0.09 | Min:-0.24 | Max: -0.00
    Cluster 0: Size:107 | Avg:-0.15 | Min:-0.26 | Max: -0.02


In [75]:
df_clusters[df_clusters['cluster']==2]['text'].tolist()

['You are the best lady😻🤩🙏👏♥️⚽🤠fifa.',
 'This nikka sick wit it🔥😂',
 "@shintia97s love over hate !! I'm sorry that you've got so much anger inside for another human. But I forgive you and send you love and support. 🫂🫂🤗🤗😘😘🏳️\u200d🌈🏳️\u200d🌈",
 '@afdhalulrizki42 😂 Stupid Football yes 😂',
 '@the_queen_of_adventure Haya bina ila lmondial 😂🇨🇲😍',
 'Wow nice moves robo😍😍😍😍😍😍😍😍 challenge accepted 🔥🔥🔥🔥',
 'African should stop joining FIFA games is such a shame aww😢😢😢😢',
 'I really wanna join them but I’m a Chinese😭😭😭',
 'Dude… YOUR team was so rude and had poor sportsmanship. This new generation think they are entitled😡 Any other country would have been honored to be in the world cup to play on the same field with these GREAT players from 🇭🇷 \n\nI hope this humbles them. Canada player were acting like little brats. Croatia shut them up and put them on time out. Literally OUT😂💪🏻🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🏆🏆🏆🏆⚽️⚽️⚽️🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷🇭🇷',
 'Missing Italy in the world cup😍😍😍🇮🇹🇮🇹🇮🇹',
 '@anchan_0722_ 😂😂😂',
 '@izayahl

In [87]:
for seed in range(5):
    kmeans = KMeans(
        n_clusters=7,
        max_iter=100,
        n_init=1,
        random_state=seed,
    ).fit(es_embeddings)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
print()

Number of elements assigned to each cluster: [ 37 113  10  20  36  17  47]
Number of elements assigned to each cluster: [ 24  82 120  23  22   5   4]
Number of elements assigned to each cluster: [  9 145  20   1  35  13  57]
Number of elements assigned to each cluster: [160   6  18  86   5   3   2]
Number of elements assigned to each cluster: [123  36  83   2  11  12  13]

