# Word embeddings generated by a BERT model.

In [None]:
import os
import time

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, AutoTokenizer

# Initialize the BERT model and its tokenizer from the SAME checkpoint.
# The token ids produced by the tokenizer index directly into the model's
# embedding matrix, so a cased tokenizer paired with an uncased model (as was
# done previously) silently looks up the wrong vectors.
MODEL_NAME = 'bert-base-uncased'
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which the rest of the script reads.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the table of documents to embed; the loop below reads the
# 'ccode_iso', 'year' and 'text' columns of each row.
input_csv = '../../../data/processed/light.csv'
df = pd.read_csv(input_csv)

# Directory that receives one CSV of embeddings per input row; it is created
# on demand and an already existing directory is not an error.
output_folder = "../../../output/bert_embeddings"
os.makedirs(output_folder, exist_ok=True)

# Wall-clock reference point for the total-runtime report at the end.
start_time = time.time()

# BERT accepts at most 512 positions per sequence; two of them are reserved
# for the [CLS] and [SEP] special tokens added around the text.
MAX_SEQ_LENGTH = 512
MAX_CONTENT_TOKENS = MAX_SEQ_LENGTH - 2


def _center_truncate(tokens, max_tokens):
    """Return the middle `max_tokens` items of `tokens`.

    Sequences that already fit are returned unchanged (as a new list). Longer
    sequences lose an equal number of tokens from both ends; when the excess
    is odd, the extra token is dropped from the end.
    """
    excess = len(tokens) - max_tokens
    if excess <= 0:
        # Nothing to cut. Slicing with a zero/negative excess would wrongly
        # produce an empty or partial list.
        return list(tokens)
    start = excess // 2
    return tokens[start:start + max_tokens]


# Loop through each row in the DataFrame and write one embeddings CSV per row
for index, row in df.iterrows():
    # Extract relevant information from the row
    ccode_iso = row['ccode_iso']
    year = row['year']
    text = row['text']

    # pandas represents an empty CSV cell as NaN (a float), which the
    # tokenizer cannot process; skip such rows instead of aborting the run.
    if not isinstance(text, str):
        print(f'Skipping row {index} ({ccode_iso}, {year}): no text available')
        continue

    # Tokenize the text and keep only the central window that fits the model
    tokenized_text = tokenizer.tokenize(text)
    truncated_text = _center_truncate(tokenized_text, MAX_CONTENT_TOKENS)

    # Add the special tokens exactly as the tokenizer defines them; strings
    # with stray spaces such as "[CLS] " are not in the vocabulary and would
    # be mapped to the unknown-token id.
    marked_text = [tokenizer.cls_token] + truncated_text + [tokenizer.sep_token]
    indexed_tokens = tokenizer.convert_tokens_to_ids(marked_text)

    # Each forward pass holds a single sequence, so no padding is needed and
    # every position is attended to. This also keeps the number of output
    # rows equal to len(marked_text).
    tokens_tensors = torch.tensor([indexed_tokens])
    attention_masks = torch.ones_like(tokens_tensors)

    # Run the BERT model without tracking gradients (inference only)
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensors, attention_mask=attention_masks)

    # outputs[2] is the tuple of hidden states (available because the model
    # was loaded with output_hidden_states=True); element 0 is the output of
    # the embedding layer, i.e. before any transformer block.
    # NOTE(review): confirm that the embedding-layer output, rather than a
    # later contextual layer, is the representation wanted here.
    # squeeze(0) removes only the batch axis.
    hidden_states = outputs[2][0].squeeze(0).numpy()

    # One row per token: the token string first, then its vector components
    df_outputs = pd.DataFrame(hidden_states)
    df_outputs.insert(0, 'term', marked_text)

    # Save the DataFrame to a CSV file named after the country code and year
    output_file = os.path.join(output_folder, f'embedding_{ccode_iso}_{year}.csv')
    df_outputs.to_csv(output_file, index=False)

# Stop the clock once every row has been processed
end_time = time.time()

# Elapsed wall-clock time since start_time, reported in seconds
total_time = end_time - start_time
print(f'Total time taken: {total_time} seconds')