## Method 1
### **`Using BERT-type models`**

In [1]:
import numpy as np
import pandas as pd 
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import hf_hub_download, list_repo_files
from colorama import Style, Fore, Back
import torch
import os
import time

import logging
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


### GET Model

In [2]:
repo_id = "tabularisai/multilingual-sentiment-analysis"
local_dir = "/mnt/d/Desktop/HuggingFaceModels/Multi_SA"

### DO NOT RUN AGAIN

In [None]:
filenames = list_repo_files(repo_id)
for filename in filenames:
    print(hf_hub_download(repo_id = repo_id, local_dir = local_dir, filename=f"{filename}"))

In [4]:
tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForSequenceClassification.from_pretrained(local_dir)

def predict_sentiment(text):
    input_text = tokenizer(text, return_tensors = "pt", \
                           padding = "max_length", truncation = True, max_length = 128)
    logger.debug(f'{Style.BRIGHT}{Fore.BLUE}{input_text}{Style.RESET_ALL}')
    with torch.no_grad():
        output_ = model(**input_text)
        logger.debug(f'{Style.BRIGHT}{Fore.GREEN}{output_}{Style.RESET_ALL}')
    prob = torch.nn.functional.softmax(output_.logits, dim = -1)
    sentiment_map = {0: "VERY BAD", 1: "BAD", 2: "NEUTRAL", 3: "GOOD", 4: "VERY GOOD"}
    return [sentiment_map[p] for p in torch.argmax(prob, dim = -1).tolist()]

### RUN 1

In [7]:
start_time = time.time()
sentences = []
with open("5G_logs_v1.txt", "r") as file:
    for line in file:
        sentences.append(line)
        
result = predict_sentiment(sentences)
for sentence, verdict in zip(sentences, result):
    logger.info(f"{Style.BRIGHT}Verdict: {verdict}{Style.RESET_ALL}")
    logger.info(f"{Style.BRIGHT}{Fore.BLUE}{sentence}{Style.RESET_ALL}")
    logger.info(f'\n')
end_time = time.time()
logger.info(f'{Style.BRIGHT}{Fore.GREEN}Avg Time: {(end_time - start_time)/len(sentence)}{Style.RESET_ALL}')
logger.info(f'\n')

RuntimeError: [enforce fail at alloc_cpu.cpp:117] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 19660800000 bytes. Error code 12 (Cannot allocate memory)

### RUN 2

In [None]:
start_time = time.time()
sentences = []
with open("5G_logs_v2.txt", "r") as file:
    for line in file:
        sentences.append(line)
        
result = predict_sentiment(sentences)
for sentence, verdict in zip(sentences, result):
    logger.info(f"{Style.BRIGHT}Verdict: {verdict}{Style.RESET_ALL}")
    logger.info(f"{Style.BRIGHT}{Fore.BLUE}{sentence}{Style.RESET_ALL}")
    logger.info(f'\n')
end_time = time.time()
logger.info(f'{Style.BRIGHT}{Fore.GREEN}Avg Time: {(end_time - start_time)/len(sentence)}{Style.RESET_ALL}')
logger.info(f'\n')

## Method 2: caching results

### **`Building cache`**

In [132]:
import glob
import json
from collections import defaultdict

filenames = glob.glob("*.txt")
versions = []
prefix = "5G_logs_v"
for filename in filenames:
    versions.append(int(filename.split(".")[0][len(prefix):]))
versions.sort()
latest = versions[-1]
versions.pop(-1)

2

In [133]:
cache, sentence_list = defaultdict(list), []
for filename in filenames:
    fv = int(filename.split(".")[0][len(prefix):])
    if fv != latest:
        with open(f"{filename}", "r") as file:
            for sentence in file:
                sentence_list.append(sentence)
results = predict_sentiment(sentence_list)
for sentence, sentiment in zip(sentence_list, results):
    insert_ = True
    if len(cache[hash(sentence)]) != 0:
        for sentenceX, sentiment in cache[hash(sentence)]:
            if sentence == sentenceX:
                insert_ = False
                logging.debug(f"{Style.BRIGHT}{Fore.RED}Duplicate{Style.RESET_ALL}")
                break
    if insert_: cache[hash(sentence)].append((sentence, sentiment))

with open("cache", "w") as cfile:
    json.dump(cache, cfile, indent = 5)

### RUN 1

In [134]:
start_time = time.time()
cache = {}
with open("cache", "r") as cfile:
    cache = json.load(cfile)

for filename in filenames:
    fv = int(filename.split(".")[0][len(prefix):])
    if fv == latest:
        sentence_list, sentiment_list = [], []
        with open(f"{filename}", "r") as file:
            for sentence in file:
                sentence_list.append(sentence)
        sentiment_list = [None for _ in range(len(sentence))]
        new_list, new_list_ids = [], []
        for idx, sentence in enumerate(sentence_list):
            hash_key, is_new = str(hash(sentence)), True
            if hash_key in cache:
                for sentenceX, sentiment in cache[hash_key]:
                    if sentence == sentenceX: 
                        sentiment_list[idx] = sentiment 
                        is_new = False
                        break
            if is_new: 
                new_list.append(sentence)
                new_list_ids.append(idx)
        new_results = predict_sentiment(new_list)
        for ptr, ids in enumerate(new_list_ids):
            sentiment_list[ids] = new_results[ptr]
        break 
        
for sentence, verdict in zip(sentence_list, sentiment_list):
    logger.info(f"{Style.BRIGHT}Verdict: {verdict}{Style.RESET_ALL}")
    logger.info(f"{Style.BRIGHT}{Fore.BLUE}{sentence}{Style.RESET_ALL}")
    logger.info(f'\n')

end_time = time.time()
logger.info(f'{Style.BRIGHT}{Fore.GREEN}Avg Time: {(end_time - start_time)/len(sentence)}{Style.RESET_ALL}')
logger.info(f'\n')

INFO:root:[1mVerdict: NEUTRAL[0m
INFO:root:[1m[34m[DEBUG] Everything is working properly.
[0m
INFO:root:

INFO:root:[1mVerdict: NEUTRAL[0m
INFO:root:[1m[34m[INFO] Is there a reason for this happening.
[0m
INFO:root:

INFO:root:[1mVerdict: VERY BAD[0m
[0m
INFO:root:

INFO:root:[1mVerdict: VERY BAD[0m
INFO:root:[1m[34m[ERROR] Very dangerous.[0m
INFO:root:

INFO:root:[1m[32mAvg Time: 0.012967161510301672[0m
INFO:root:



### RUN 2

In [135]:
start_time = time.time()
cache = {}
with open("cache", "r") as cfile:
    cache = json.load(cfile)

for filename in filenames:
    fv = int(filename.split(".")[0][len(prefix):])
    if fv == latest:
        sentence_list, sentiment_list = [], []
        with open(f"{filename}", "r") as file:
            for sentence in file:
                sentence_list.append(sentence)
        sentiment_list = [None for _ in range(len(sentence))]
        new_list, new_list_ids = [], []
        for idx, sentence in enumerate(sentence_list):
            hash_key, is_new = str(hash(sentence)), True
            if hash_key in cache:
                for sentenceX, sentiment in cache[hash_key]:
                    if sentence == sentenceX: 
                        sentiment_list[idx] = sentiment 
                        is_new = False
                        break
            if is_new: 
                new_list.append(sentence)
                new_list_ids.append(idx)
        new_results = predict_sentiment(new_list)
        for ptr, ids in enumerate(new_list_ids):
            sentiment_list[ids] = new_results[ptr]
        break 
        
for sentence, verdict in zip(sentence_list, sentiment_list):
    logger.info(f"{Style.BRIGHT}Verdict: {verdict}{Style.RESET_ALL}")
    logger.info(f"{Style.BRIGHT}{Fore.BLUE}{sentence}{Style.RESET_ALL}")
    logger.info(f'\n')

end_time = time.time()
logger.info(f'{Style.BRIGHT}{Fore.GREEN}Avg Time: {(end_time - start_time)/len(sentence)}{Style.RESET_ALL}')
logger.info(f'\n')

INFO:root:[1mVerdict: NEUTRAL[0m
INFO:root:[1m[34m[DEBUG] Everything is working properly.
[0m
INFO:root:

INFO:root:[1mVerdict: NEUTRAL[0m
INFO:root:[1m[34m[INFO] Is there a reason for this happening.
[0m
INFO:root:

INFO:root:[1mVerdict: VERY BAD[0m
[0m
INFO:root:

INFO:root:[1mVerdict: VERY BAD[0m
INFO:root:[1m[34m[ERROR] Very dangerous.[0m
INFO:root:

INFO:root:[1m[32mAvg Time: 0.01152280102605405[0m
INFO:root:



### Expanding cache

In [136]:
for sentence, verdict in zip(new_list, new_results):
    hash_key = str(hash(sentence))
    if hash_key not in cache:
        cache[hash_key] = []
    cache[hash_key].append((sentence, verdict))

with open("cache", "w") as cfile:
    json.dump(cache, cfile, indent = 5)
    new_list, new_results = [], []