In [5]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import simpletransformers
from multiprocessing import Pool
from simpletransformers.classification import ClassificationModel

from src.scripts.tools import Tools

In [None]:
def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    Works on anything that supports ``len()`` and slicing.
    """
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))

In [2]:
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer built from the pattern \w+ (runs of word characters).
# read_files() counts the tokens it returns to drop short tweets.
TOKENIZER = RegexpTokenizer(r'\w+')

In [6]:
# Directory whose entries read_files() loads with pd.read_csv.
data_path = "/content/drive/MyDrive/twitter_vaccination/data/raw/daily_world_en_csv"
# Every entry name in that directory, in sorted order.
list_of_data = sorted(os.listdir(data_path))

In [None]:
def read_files(file_name, min_tokens=10):
    """Load one CSV from ``data_path`` and keep only sufficiently long tweets.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``data_path``.
    min_tokens : int, optional
        Minimum number of ``TOKENIZER`` tokens a text must produce to be
        kept. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``'text'``) with the rows whose text is not null
        and yields at least ``min_tokens`` tokens. May be empty.

    Raises
    ------
    KeyError
        If the file has no ``'text'`` column.
    """
    df = pd.read_csv(f"{data_path}/{file_name}")
    texts = df.loc[df['text'].notnull(), ['text']]
    # Series.apply on an empty Series does not return a boolean Series, and a
    # non-boolean key makes ``df[key]`` select columns instead of rows, which
    # then breaks the ``'text'`` lookup. Forcing bool keeps a file without any
    # usable rows from crashing the whole pool.map call.
    long_enough = texts['text'].apply(
        lambda tweet: len(TOKENIZER.tokenize(tweet)) >= min_tokens
    ).astype(bool)
    return texts.loc[long_enough, ['text']]

In [None]:
# Parse all listed files in parallel with 8 worker processes. pool.map returns
# the results in the same order as list_of_data.
# NOTE(review): despite the name, each element is the text-only DataFrame
# returned by read_files (no date column is kept).
with Pool(processes=8) as pool:
    tweets_with_dates = pool.map(read_files, list_of_data)

In [None]:
# Tools comes from src.scripts.tools; concatenate_data is not visible here.
# NOTE(review): presumably it merges the per-file frames into one DataFrame
# (its result is indexed by 'text') — confirm what the positional 5 controls.
tools = Tools()
text_data_concatenated = tools.concatenate_data(tweets_with_dates, 5, concat_type='pd')['text'].values

100%|██████████| 146/146 [01:39<00:00,  1.46it/s]


# Predicting Sentiments

In [7]:
# `load_model` is the directory holding the pickled classifier, not a function.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file —
# only load best_model.db from a trusted source.
# NOTE(review): presumably a simpletransformers ClassificationModel; the cell
# below relies on model.predict returning (predictions, raw_outputs) — confirm.
load_model = "/models/sentiment_models"
with open(f"{load_model}/best_model.db", 'rb') as f:
    model = pickle.load(f)

In [None]:
result_saving_path = "/data/processed/sentiment_analysis/all_world"
BATCH_SIZE = 1200  # number of texts passed to model.predict per output file

# Create the output folder up front so a missing directory cannot fail the run
# only after the first (expensive) prediction has been computed.
os.makedirs(result_saving_path, exist_ok=True)

for idx, batch in enumerate(tqdm(list(chunks(text_data_concatenated, BATCH_SIZE)))):
    # File names encode the nominal index range of the batch; the upper bound
    # of the last file may exceed the real number of texts.
    naming = f"{result_saving_path}/between-{idx*BATCH_SIZE}-{(idx+1)*BATCH_SIZE}.csv"
    # Resume support: a batch whose result file already exists is not recomputed.
    if os.path.exists(naming):
        continue
    predictions, raw_outputs = model.predict(batch.reshape(-1,).tolist())
    # Turn the raw model outputs into per-class probabilities, stored as float16.
    probs = tf.nn.softmax(raw_outputs, axis=1).numpy().astype(np.float16)
    # Write to a temporary name and rename afterwards: an interrupted write must
    # not leave a truncated CSV that the exists-check above would skip forever.
    partial = f"{naming}.tmp"
    pd.DataFrame(probs, columns=['Rest', "Pro", 'Anti']).to_csv(partial, index=False)
    os.replace(partial, naming)