# Import Dependencies

In [153]:
import os, torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm

# Set Configurations

In [154]:
# ---------------------------------------------------------------------------
# Folder layout
# ---------------------------------------------------------------------------
# Folder that holds the annotated dataset read by this notebook
final_dataset_folder_name = os.path.join('..', '4) Sentiment Annotation')

# Root folder containing one sub-folder per model type / hyperparameter set
model_results_and_actual_data_folder_name = os.path.join(
    "..",
    "5) BERT Based Models",
    "Model Results and Actual Data",
)

# Path of the "Predicted Sentiment Shares" CSV inside the results root
predicted_sentiment_shares_file = os.path.join(
    model_results_and_actual_data_folder_name,
    "Predicted Sentiment Shares.csv",
)

# Model type -> hyperparameter sub-folders that are searched for test metrics
model_evaluation_results_folders = {
    "BERT": [
        "Hyperparameter-1",
        "Hyperparameter-2",
        "Hyperparameter-3",
        "Hyperparameter-4",
    ],
    "DistilBERT": [
        "Hyperparameter-1",
        "Hyperparameter-2",
        "Hyperparameter-3",
        "Hyperparameter-4",
    ],
}

# ---------------------------------------------------------------------------
# Sentiment code -> human readable label
# ---------------------------------------------------------------------------
sentiment_map = dict(zip((-1, 0, 1), ('Negative', 'Neutral', 'Positive')))

# ---------------------------------------------------------------------------
# Dataset (CSV) column names
# ---------------------------------------------------------------------------
sentence_column_name = "Sentences"

# ---------------------------------------------------------------------------
# Compute device: CUDA when torch reports it as available, CPU otherwise
# ---------------------------------------------------------------------------
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define Functions

In [155]:
# Define Helper to Find the Single Entry With a Given Suffix
def _find_single_entry(folder_path, suffix):
    """Return the name of the only entry in `folder_path` that ends with `suffix`.

    Raises:
        Exception: if no entry, or more than one entry, has the suffix.
    """
    matches = [f for f in os.listdir(folder_path) if f.endswith(suffix)]

    if len(matches) == 0:
        raise Exception(f'Missing File with "{suffix}" suffix in "{folder_path}"')
    if len(matches) > 1:
        raise Exception(f'File with "{suffix}" suffix in "{folder_path}" is duplicated')

    return matches[0]


# Define Function to Retrieve Saved Model Folder
def get_best_accuracy_model():
    """Locate the saved model with the highest test accuracy.

    For every model type / hyperparameter folder listed in
    `model_evaluation_results_folders` (under
    `model_results_and_actual_data_folder_name`), the single
    "* - Test Metric.csv" file is read and the `accuracy` value of its first
    row is compared. On ties the first folder encountered is kept.

    Returns:
        tuple: (best_model_name, best_model_hyperparameter, best_model_path)
            - best_model_name: key of `model_evaluation_results_folders`
            - best_model_hyperparameter: name of the "* - Model" folder
              without the " - Model" suffix
            - best_model_path: path of that "* - Model" folder

    Raises:
        Exception: if a metric file or model folder is missing or duplicated,
            if the `accuracy` metric is absent or empty, or if no folders are
            configured.
    """
    test_metric_suffix = " - Test Metric.csv"
    model_folder_suffix = " - Model"

    best_model_folder_path = None
    best_model_accuracy = None
    best_model_name = None

    for model_folder, hyperparameter_folders in model_evaluation_results_folders.items():
        for hyperparameter_folder in hyperparameter_folders:
            folder_path = os.path.join(
                model_results_and_actual_data_folder_name,
                model_folder,
                hyperparameter_folder
            )

            # Locate and load the test metric file of this hyperparameter set
            file_path = os.path.join(folder_path, _find_single_entry(folder_path, test_metric_suffix))
            test_metrics_df = pd.read_csv(file_path)

            # A missing column, an empty file and an empty cell (NaN) must all be
            # rejected: a NaN accuracy would otherwise poison every comparison
            # (`x > nan` is always False) and silently select the wrong model.
            accuracy = None
            if not test_metrics_df.empty:
                accuracy = test_metrics_df.iloc[0].to_dict().get('accuracy')
            if accuracy is None or pd.isna(accuracy):
                raise Exception(f'Missing Test Metric "accuracy" in "{file_path}"')

            if best_model_accuracy is None or accuracy > best_model_accuracy:
                best_model_accuracy = accuracy
                best_model_folder_path = folder_path
                best_model_name = model_folder

    # Without this guard os.listdir(None) would silently scan the working directory
    if best_model_folder_path is None:
        raise Exception('No model evaluation results folders are configured')

    # Get the saved model folder of the best hyperparameter set
    folder = _find_single_entry(best_model_folder_path, model_folder_suffix)
    best_model_path = os.path.join(best_model_folder_path, folder)

    # The hyperparameter description is the folder name without its suffix
    best_model_hyperparameter = folder[:-len(model_folder_suffix)]

    return best_model_name, best_model_hyperparameter, best_model_path

# Define Single Sentiment Prediction
def predict_sentiment(model, text, device):
    """Predict the sentiment code of a single text.

    Uses the notebook-level `tokenizer`, which must be defined before this
    function is called.

    Args:
        model: classifier that accepts the tokenizer output as keyword
            arguments and returns an object exposing `.logits`.
        text: the text to classify.
        device: torch device to run on; the model and the tokenized inputs are
            both moved to it.

    Returns:
        int: index of the largest logit minus 1, i.e. model classes
            [0, 1, 2] are shifted to [-1, 0, 1].
    """
    # Model and inputs must live on the same device
    model.to(device)
    model.eval()

    with torch.no_grad():
        # Tokenize single sentence
        encoding = tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )

        # Move every tokenizer output to the device and feed *these* tensors to
        # the model (previously the moved copies were created but never used)
        inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

        # Get model predictions
        outputs = model(**inputs)

        # Shift labels for model [0, 1, 2] to [-1, 0, 1]
        predicted = torch.argmax(outputs.logits, dim=1)
        return int(predicted[0].item()) - 1

# Define Multiple Sentiment Prediction
def predict_sentiments(model, data, device):
    """Predict the sentiment code of every sentence in a DataFrame.

    Uses the notebook-level `tokenizer` and `sentence_column_name`, which must
    be defined before this function is called.

    Args:
        model: classifier that accepts the tokenizer output as keyword
            arguments and returns an object exposing `.logits`.
        data: DataFrame containing the `sentence_column_name` column. It is
            not modified.
        device: torch device to run on; the model and the tokenized inputs are
            both moved to it.

    Returns:
        DataFrame: copy of `data` with an added 'Predicted Sentiment' column
            holding integer codes (index of the largest logit minus 1, i.e.
            model classes [0, 1, 2] shifted to [-1, 0, 1]).
    """
    # Model and inputs must live on the same device
    model.to(device)
    model.eval()

    predictions = []

    with torch.no_grad():
        for sentence in tqdm(data[sentence_column_name], total=len(data), desc="Predicting"):
            # Tokenize single sentence
            encoding = tokenizer(
                sentence,
                truncation=True,
                padding=True,
                max_length=128,
                return_tensors='pt'
            )

            # Move every tokenizer output to the device and feed *these* tensors
            # to the model (previously the moved copies were never used)
            inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

            # Get model predictions
            outputs = model(**inputs)

            # Shift labels for model [0, 1, 2] to [-1, 0, 1]
            predicted = torch.argmax(outputs.logits, dim=1)
            predictions.append(int(predicted[0].item()) - 1)

    # Create a copy of data to avoid modifying the original. Assigning the whole
    # list at once is positional, so duplicate index labels cannot mix up rows.
    results_df = data.copy()
    results_df['Predicted Sentiment'] = predictions

    return results_df

In [156]:
# Pick the saved model with the highest test accuracy
(
    best_model_name,
    best_model_hyperparameter,
    best_model_path,
) = get_best_accuracy_model()

# Flag used below to choose between the BERT and DistilBERT classes
is_bert = (best_model_name == "BERT")

# Report the selection
print(
    'Model Used',
    f'  Type: {best_model_name}',
    f'  Hyperparameter: {best_model_hyperparameter}',
    sep='\n',
)

Model Used
  Type: BERT
  Hyperparameter: LR 2e-05, E 2, BS 16


In [157]:
# Choose the tokenizer class matching the selected model type
if is_bert:
    model_tokenizer = BertTokenizer
else:
    model_tokenizer = DistilBertTokenizer

# Load the tokenizer from the saved best-model folder
tokenizer = model_tokenizer.from_pretrained(best_model_path)

# Display the loaded tokenizer
tokenizer

BertTokenizer(name_or_path='..\5) BERT Based Models\Model Results and Actual Data\BERT\Hyperparameter-1\LR 2e-05, E 2, BS 16 - Model', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [158]:
# Choose the classifier class matching the selected model type
if is_bert:
    model_classifier = BertForSequenceClassification
else:
    model_classifier = DistilBertForSequenceClassification

# Load the classifier from the saved best-model folder
# NOTE(review): the model is not moved to `device` in this cell.
model = model_classifier.from_pretrained(best_model_path)

# Display the loaded model architecture
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [159]:
# Load the annotated dataset, keeping only the sentence text and the final sentiment
df = pd.read_csv(
    os.path.join(final_dataset_folder_name, 'final_dataset.csv'),
    usecols=[sentence_column_name, 'Final_Sent']  # same column constant the prediction code reads
)
df = df.rename(columns={'Final_Sent': 'Actual Sentiment'})

# NOTE(review): `show_df` is the same object as `df` (no copy is made), so the
# replacement below also rewrites `df['Actual Sentiment']` with the text labels.
# This is kept as-is because later cells may depend on it.
show_df = df
show_df['Actual Sentiment'] = df['Actual Sentiment'].replace(sentiment_map)

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# use Styler.hide(axis="index") when available and fall back on older versions.
preview_style = show_df.head().style
preview_style.hide(axis="index") if hasattr(preview_style, "hide") else preview_style.hide_index()

Sentences,Actual Sentiment
"This is not good, you're doing something wrong, but I mean people just see this shit and if they don't want his ass, what the fuck you gonna do for you, what can Trump do for you?",Negative
"Like straight the **** up, you see what he wanna do.",Negative
We don't gotta talk about what he wanna do cause we already know.,Neutral
"But hey, Republican speaking now against the Republican candidate, man, i mean what did Trump do for your ass though: hey, man, I feel like ain't no president ever agenda is to help black people.",Negative
I think Trump go weird because I don't know.,Negative


In [160]:
# Predict Sentiments
predicted_df = predict_sentiments(model, df, device)

# NOTE(review): `show_df` is the same object as `predicted_df` (no copy is made),
# so the replacements below also rewrite the sentiment columns of `predicted_df`
# with the text labels. This is kept as-is because later cells may depend on it.
show_df = predicted_df
show_df['Actual Sentiment'] = predicted_df['Actual Sentiment'].replace(sentiment_map)
show_df['Predicted Sentiment'] = predicted_df['Predicted Sentiment'].replace(sentiment_map)

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# use Styler.hide(axis="index") when available and fall back on older versions.
preview_style = show_df.head().style
preview_style.hide(axis="index") if hasattr(preview_style, "hide") else preview_style.hide_index()

Predicting: 100%|██████████████████████████████████████████████████████████████████| 5745/5745 [04:37<00:00, 20.74it/s]


Sentences,Actual Sentiment,Predicted Sentiment
"This is not good, you're doing something wrong, but I mean people just see this shit and if they don't want his ass, what the fuck you gonna do for you, what can Trump do for you?",Negative,Negative
"Like straight the **** up, you see what he wanna do.",Negative,Negative
We don't gotta talk about what he wanna do cause we already know.,Neutral,Neutral
"But hey, Republican speaking now against the Republican candidate, man, i mean what did Trump do for your ass though: hey, man, I feel like ain't no president ever agenda is to help black people.",Negative,Negative
I think Trump go weird because I don't know.,Negative,Negative


In [162]:
# Free-form example: newlines are turned into spaces and the ends are trimmed
example_text = """

  Trump is a disgrace in USA
    
""".replace("\n", " ").strip()

# Predict the sentiment code and translate it to its label
example_text_sentiment = sentiment_map[
    predict_sentiment(model, example_text, device)
]

# Show the text next to its predicted label
print(
    f'Text: {example_text}',
    f'Predicted Sentiment: {example_text_sentiment}',
    sep='\n',
)

Text: Trump is a disgrace in USA
Predicted Sentiment: Negative
