In [None]:
import os
from multiprocessing import Pool

import h5py
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from src.scripts.read_data import ReadData

In [None]:
from nltk.tokenize import RegexpTokenizer

# Module-wide tokenizer whose tokens are the matches of the regex \w+.
TOKENIZER = RegexpTokenizer(pattern=r'\w+')

## Loading Data

In [None]:
# Inputs: directory handed to ReadData, and directory holding the sentiment parquet.
world_data_path = "/data/raw/daily_world_en_csv"
dataframes_path = "/data/processed/dataframes"

# Output: directory that receives the HDF5 embedding file.
output_dir = "/data/processed/embeddings/world_anti_embeddings"

In [None]:
# ReadData is project code; presumably it combines the CSVs under
# world_data_path and keeps the listed columns — confirm against src.scripts.read_data.
read_data_world = ReadData(
    world_data_path,
    ['id', 'text'],
    filter_tweets=True,
)
read_data_world.read_csvs_and_combine_data()
world_data = read_data_world.data

# Sentiment scores for the same tweets; 'created_at' is not used below.
sentiments_file = f"{dataframes_path}/world_data_sentiments_raw.parquet"
world_data_sentiments = pd.read_parquet(sentiments_file).drop(columns=['created_at'])

100%|██████████| 146/146 [01:48<00:00,  1.35it/s]


In [None]:
# Keep only tweets whose highest-scoring class is "Anti" with a score of at
# least MIN_CONFIDENCE.
#
# The score columns are selected by NAME rather than by position: the index
# returned by argmax is only meaningful if it lines up with SENTIMENT_LABELS,
# and a positional slice would silently mislabel rows if the parquet column
# order ever changed or an extra column were added.
SENTIMENT_LABELS = ["Rest", "Pro", "Anti"]
MIN_CONFIDENCE = 0.99
mapping = dict(enumerate(SENTIMENT_LABELS))  # {0: "Rest", 1: "Pro", 2: "Anti"}

# Build the (n_rows, 3) score matrix once and reuse it for both reductions.
scores = world_data_sentiments[SENTIMENT_LABELS].to_numpy()
max_columns = scores.argmax(axis=1)
max_values = scores.max(axis=1)

world_data_sentiments = world_data_sentiments.drop(columns=SENTIMENT_LABELS)
# Vectorised index -> label lookup (replaces a per-row Python loop).
world_data_sentiments['label'] = np.asarray(SENTIMENT_LABELS)[max_columns]

is_confident_anti = (max_values >= MIN_CONFIDENCE) & (world_data_sentiments['label'] == "Anti")
world_data_sentiments = world_data_sentiments[is_confident_anti].reset_index(drop=True)

In [None]:
# Attach the tweet text to every retained sentiment row, matching on 'id'.
# NOTE(review): DataFrame.join is a left join by default, so ids absent from
# world_data would end up with NaN text — confirm that cannot happen here.
tweets_by_id = world_data.set_index('id')
world_data = world_data_sentiments.join(tweets_by_id, on='id').drop(columns=['label'])

# Release the intermediates; only the joined frame is used from here on.
del world_data_sentiments, tweets_by_id

In [None]:
# Persist the joined frame (without the DataFrame index) as parquet.
anti_tweets_path = "/data/raw/world_anti_tweets_and_ids.parquet"
world_data.to_parquet(anti_tweets_path, index=False)

## Getting Embeddings of World Anti Tweets

In [None]:
# Sentence encoder built from the covid-twitter-bert-v2 checkpoint.
# NOTE(review): device='xla' presumably targets a TPU/XLA runtime — confirm
# that the execution environment provides one.
model_name = "digitalepidemiologylab/covid-twitter-bert-v2"
model = SentenceTransformer(model_name, device='xla')

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/digitalepidemiologylab_covid-twitter-bert-v2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Open the HDF5 file that stores the tweet embeddings, creating it (with a
# pre-sized "embeddings" dataset) on first use. The dataset attribute
# 'length' records how many rows have been written so far.
EMBEDDING_DIM = 1024   # width of one embedding row in the dataset
H5_CHUNK_ROWS = 64     # rows per HDF5 storage chunk

# h5py does not create missing parent directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

h5file_path = f"{output_dir}/world_anti_embeddings.hdf5"
if not os.path.exists(h5file_path):
    embedding_h5file = h5py.File(h5file_path, "w")
    # Size the dataset from the data itself (one row per tweet) instead of a
    # hard-coded row count that goes stale whenever the filtering changes.
    dset = embedding_h5file.create_dataset(
        "embeddings",
        (len(world_data), EMBEDDING_DIM),
        chunks=(H5_CHUNK_ROWS, EMBEDDING_DIM),
    )  # , compression="gzip"
    dset.attrs['length'] = 0
else:
    # Reopen for in-place updates so that an earlier run can be continued.
    embedding_h5file = h5py.File(h5file_path, "r+")
    dset = embedding_h5file['embeddings']

In [None]:
# Encode the tweets in fixed-size chunks and stream the vectors into the
# HDF5 dataset so that row i of "embeddings" belongs to row i of world_data.
#
# Resuming is driven solely by dset.attrs['length'] (rows already written).
# The previous check for embedding_{idx}.npy files referred to artefacts this
# notebook never writes: when such files were present every batch was skipped
# and nothing reached the HDF5 file, and any skipped batch shifted the rows
# that followed it out of alignment with world_data.
ENCODE_CHUNK_SIZE = 4096   # tweets passed to model.encode per call
ENCODE_BATCH_SIZE = 1024   # batch_size forwarded to model.encode

texts = world_data['text'].values.tolist()
resume_from = int(dset.attrs['length'])

try:
    for start in tqdm(range(resume_from, len(texts), ENCODE_CHUNK_SIZE)):
        batch = texts[start:start + ENCODE_CHUNK_SIZE]
        word_vectors = model.encode(batch, batch_size=ENCODE_BATCH_SIZE)

        # Write at the batch's own offset, then advance the resume marker.
        stop = start + word_vectors.shape[0]
        dset[start:stop] = word_vectors
        dset.attrs['length'] = stop

        # Flush after each batch so an interrupted run loses at most one chunk.
        embedding_h5file.flush()
finally:
    # Always release the file handle, including on error or interrupt.
    embedding_h5file.close()

100%|██████████| 2383/2383 [00:00<00:00, 3567.17it/s]
