### Google Colab Drive Mounting and Directory Change:

In [None]:
import os

from google.colab import drive

# Mount Google Drive into the Colab file system.
drive.mount('/content/drive')

# Work from the Drive folder so the relative paths used later in this notebook
# (e.g. dataset/, models/) are resolved against it.
os.chdir('/content/drive/MyDrive/')

### Installation of Packages

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install evaluate


### Data Processing and Experiment Automation Functions:

In [None]:
import os.path as osp
import re
import datetime
from collections import Counter

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, \
    TrainingArguments
from datasets import Dataset
import evaluate

from masked_absa_concat_model import MaskedABSAConcatModel


def tokenize(example, tokenizer):
    """
        Mask every entity inside its text and tokenize the masked batch.
        Each occurrence of an 'Entity' string in the 'Subtext' at the same position is replaced
        by the literal '[MASK]' (exact, case-sensitive match; the entity is regex-escaped).
        @param example: batch with parallel 'Subtext' and 'Entity' sequences
        @param tokenizer: callable applied to the list of masked texts
        @return: whatever the tokenizer returns for the masked texts
    """
    masked_texts = [
        re.sub(re.escape(entity), '[MASK]', subtext)
        for subtext, entity in zip(example['Subtext'], example['Entity'])
    ]

    return tokenizer(masked_texts, return_token_type_ids=True, padding='max_length', truncation=True)


def map_label(example, labels_dict):
    """
        Replace the textual label of one row by its numeric id (in place).
        @param example: row whose 'label' entry is a key of labels_dict
        @param labels_dict: mapping from label name to numeric id
        @return: the same row object, with 'label' overwritten by the numeric id
    """
    numeric_label = labels_dict[example['label']]
    example['label'] = numeric_label
    return example


def lowercase(example):
    """
        Lowercase the 'Subtext' and 'Entity' fields of one row (in place).
        @param example: row with string 'Subtext' and 'Entity' entries
        @return: the same row object with both fields lowercased
    """
    for field in ('Subtext', 'Entity'):
        example[field] = example[field].lower()
    return example


def remove_special_characters(example):
    """
        Strip the symbols ®, °, ™ and £ from the 'Subtext' and 'Entity' fields of one row (in place).
        @param example: row with string 'Subtext' and 'Entity' entries
        @return: the same row object with the symbols removed from both fields
    """
    unwanted_symbols = re.compile(r'[®°™£]')
    for field in ('Subtext', 'Entity'):
        example[field] = unwanted_symbols.sub('', example[field])
    return example


def remove_whitespace_code(example):
    """
        Remove the literal '_x000D_' marker from the 'Subtext' field of one row (in place).
        The row must be returned: Dataset.map only applies changes that the mapped function
        hands back, so a function returning None leaves the dataset untouched.
        @param example: row with a string 'Subtext' entry
        @return: the same row object with every '_x000D_' removed from 'Subtext'
    """
    example['Subtext'] = re.sub(r'_x000D_', '', example['Subtext'])
    return example


def load_and_process_data(ds_type, label_name, tokenizer, cased=False) -> Dataset:
    """
        Load one split from dataset/ and run the preprocessing pipeline on it: label mapping,
        '_x000D_' cleanup, special character removal, entity filtering and tokenization.
        @param ds_type: dataset type (train, validation, test); 'train' is read through pandas from
                        dataset/train_dataset.csv, any other value from dataset/{ds_type}_dataset.csv
        @param label_name: original name of the label column (renamed to 'label')
        @param tokenizer: pretrained tokenizer
        @param cased: whether the model was trained on cased or uncased text; currently unused,
                      because the lowercasing step for uncased models is disabled
        @return: processed HuggingFace dataset
    """
    # 'ambivalent' shares id 1 with 'neutral': the ambivalent class was removed and merged into neutral.
    label_ids = {'negativ': 0, 'neutral': 1, 'positiv': 2, 'ambivalent': 1}

    def to_label_id(row):
        return map_label(row, label_ids)

    def entity_in_text(row):
        # Keep only rows whose entity literally occurs in the text, so it can be masked later.
        return row['Entity'] in row['Subtext']

    def tokenize_batch(batch):
        return tokenize(batch, tokenizer)

    if ds_type != 'train':
        dataset = Dataset.from_csv(f'dataset/{ds_type}_dataset.csv', sep=';', encoding='utf-8')
    else:
        frame = pd.read_csv("dataset/train_dataset.csv", sep=';', encoding='utf-8')
        # Drop the label column and re-append it, which moves it to the last position.
        labels = frame[label_name]
        features = frame.drop([label_name], axis=1)
        features[label_name] = labels
        dataset = Dataset.from_pandas(features)

    dataset = dataset.rename_column(original_column_name=label_name, new_column_name='label')
    for row_step in (to_label_id, remove_whitespace_code, remove_special_characters):
        dataset = dataset.map(row_step)
    dataset = dataset.filter(entity_in_text)

    return dataset.map(tokenize_batch, batched=True)




def experiments_automation() -> None:
    """
        Run training_experiment() once for every combination of model type and checkpoint
        configuration, one after the other.
    """
    experiment_name = 'target_sentiment_DE'
    model_types = ['concat']
    # Each entry: (checkpoint, cased, batch_size, accumulation_steps)
    checkpoint_configs = [('bert-base-german-cased', True, 16, 4)]

    # Model type is the outer loop, checkpoint configuration the inner one.
    runs = [(model_type, config) for model_type in model_types for config in checkpoint_configs]
    for model_type, (checkpoint, cased, batch_size, accumulation_steps) in runs:
        training_experiment(
            checkpoint=checkpoint,
            experiment_name=experiment_name,
            cased=cased,
            batch_size=batch_size,
            accumulation_steps=accumulation_steps,
            use_custom_model=True,
            model_type=model_type,
        )


### Data Loading, Processing, and Model Preparation:

In [None]:
from sklearn.metrics import f1_score
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

cased = True
num_labels = 3

tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
train = load_and_process_data('train', label_name='Sentiments', tokenizer=tokenizer, cased=cased)

# One weight per label id, in ascending id order: the count of the most frequent class
# divided by the count of that class.
label_counts = Counter(train['label'])
class_weights = [max(label_counts.values()) / label_counts[cls] for cls in sorted(label_counts)]

# The unpacking requires encode('[MASK]') to return exactly three ids, which are taken
# here as the CLS, MASK and SEP token ids in that order.
cls_id, mask_id, sep_id = tokenizer.encode('[MASK]')


### Best Model Loading and Initialization:

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the trained model
model_path = "models/bestmodel.pth"  # Adjust the path accordingly
# NOTE(review): model_path is not read in this cell; the weights come from the state-dict file below.

model = MaskedABSAConcatModel('bert-base-german-cased', num_labels=num_labels, class_weights=class_weights,
                              cls_id=cls_id, sep_id=sep_id, mask_id=mask_id)
# map_location='cpu' lets the state dict load on a CPU-only runtime even if it was saved
# from CUDA tensors; make_predictions() moves the model to the GPU when one is available.
model.load_state_dict(torch.load("models/bestmodel_state_dict.pth", map_location='cpu'))
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")




### Tokenization Function Update:

In [None]:
def tokenize(example, tokenizer):
    """
        Mask every entity inside its text, ignoring case, and tokenize the masked batch.
        Each occurrence of an 'Entity' string in the 'Subtext' at the same position is replaced
        by the literal '[MASK]'; the entity is regex-escaped and matched case-insensitively.
        @param example: batch with parallel 'Subtext' and 'Entity' sequences
        @param tokenizer: callable applied to the list of masked texts
        @return: whatever the tokenizer returns for the masked texts
    """
    masked_texts = []
    for subtext, entity in zip(example['Subtext'], example['Entity']):
        entity_pattern = re.compile(re.escape(entity), flags=re.IGNORECASE)
        masked_texts.append(entity_pattern.sub('[MASK]', subtext))

    return tokenizer(masked_texts, return_token_type_ids=True, padding='max_length', truncation=True)


### Function to Make Predictions:

In [None]:

def make_predictions(model, tokenized_dataset):
    """
        Run the model over a tokenized dataset in batches of 64 and collect one prediction per row.
        The model is moved to the GPU when one is available (CPU otherwise) and put in eval mode;
        the dataset is switched to the 'torch' format in place.
        A batch whose forward pass raises is reported and filled with -1 placeholders instead of
        being skipped, so the result always stays aligned with the dataset rows.
        @param model: model whose output exposes a .logits attribute
        @param tokenized_dataset: dataset providing 'input_ids', 'token_type_ids' and 'attention_mask'
        @return: list with one entry per dataset row, in dataset order: the argmax class id, or -1
                 for every row of a batch that failed
    """
    failed_prediction = -1
    model_input_keys = ('input_ids', 'token_type_ids', 'attention_mask')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    tokenized_dataset.set_format('torch')
    dataloader = DataLoader(tokenized_dataset, batch_size=64)
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Only the tensors the model consumes are moved to the device; other columns are ignored.
            inputs = {k: v.to(device) for k, v in batch.items() if k in model_input_keys}

            try:
                logits = model(**inputs).logits
                batch_predictions = logits.argmax(dim=1).cpu().numpy()
            except Exception as e:
                print(f"Error during prediction: {e}")
                # Keep best-effort behaviour, but emit one placeholder per row of the failed batch
                # so later predictions do not shift onto the wrong rows.
                rows_in_batch = len(next(iter(batch.values()), ()))
                batch_predictions = [failed_prediction] * rows_in_batch
            predictions.extend(batch_predictions)
    return predictions



### Data Preprocessing, Tokenization, and Prediction Pipeline:

In [None]:
import os
from datasets import Dataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

# Function to preprocess each element in the dataset
def preprocess_data(new_dataset):
    """
        Clean, filter and tokenize a dataset for prediction.
        Applies remove_whitespace_code and remove_special_characters to every row, keeps only rows
        whose 'Entity' occurs in their 'Subtext', and tokenizes the result in batches with the
        module-level tokenizer.
        @param new_dataset: dataset with 'Subtext' and 'Entity' columns
        @return: the processed, tokenized dataset
    """
    def entity_in_text(row):
        return row['Entity'] in row['Subtext']

    def tokenize_batch(batch):
        return tokenize(batch, tokenizer)

    cleaned = new_dataset.map(remove_whitespace_code).map(remove_special_characters)
    return cleaned.filter(entity_in_text).map(tokenize_batch, batched=True)

# Function to tokenize the dataset
def tokenize_dataset(dataset, tokenizer):
    """
        Tokenize a dataset in batches with tokenize().
        @param dataset: dataset with 'Subtext' and 'Entity' columns
        @param tokenizer: tokenizer forwarded to tokenize()
        @return: the result of the batched map call
    """
    def tokenize_batch(batch):
        return tokenize(batch, tokenizer)

    return dataset.map(tokenize_batch, batched=True)

# Function to make predictions using the given model and tokenized dataset
def make_predictions(model, tokenized_dataset):
    """
        Run the model over a tokenized dataset in batches of 64 and collect one prediction per row.
        The model is moved to the GPU when one is available (CPU otherwise) and put in eval mode;
        the dataset is switched to the 'torch' format in place.
        A batch whose forward pass raises is reported and filled with -1 placeholders instead of
        being skipped, so the result always stays aligned with the dataset rows.
        @param model: model whose output exposes a .logits attribute
        @param tokenized_dataset: dataset providing 'input_ids', 'token_type_ids' and 'attention_mask'
        @return: list with one entry per dataset row, in dataset order: the argmax class id, or -1
                 for every row of a batch that failed
    """
    failed_prediction = -1
    model_input_keys = ('input_ids', 'token_type_ids', 'attention_mask')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    tokenized_dataset.set_format('torch')
    dataloader = DataLoader(tokenized_dataset, batch_size=64)
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Only the tensors the model consumes are moved to the device; other columns are ignored.
            inputs = {k: v.to(device) for k, v in batch.items() if k in model_input_keys}
            try:
                logits = model(**inputs).logits
                batch_predictions = logits.argmax(dim=1).cpu().numpy()
            except Exception as e:
                print(f"Error during prediction: {e}")
                # Keep best-effort behaviour, but emit one placeholder per row of the failed batch
                # so later predictions do not shift onto the wrong rows.
                rows_in_batch = len(next(iter(batch.values()), ()))
                batch_predictions = [failed_prediction] * rows_in_batch
            predictions.extend(batch_predictions)
    return predictions

# Function to save predictions along with the original text to a new CSV file
def save_predictions_to_csv(original_dataset, predictions, output_csv):
    """
        Write the 'Entity', 'Subtext' and 'mediumName/Title' columns of a dataset next to its
        predictions into a CSV file (the row index is written as well).
        @param original_dataset: object whose to_pandas() returns a DataFrame with those three columns
        @param predictions: one prediction per row, placed side by side with the rows by index
        @param output_csv: path of the CSV file to write
    """
    predicted = pd.DataFrame({'predictions': predictions})
    source_frame = original_dataset.to_pandas()
    kept_columns = source_frame[['Entity', 'Subtext', 'mediumName/Title']]
    pd.concat([kept_columns, predicted], axis=1).to_csv(output_csv)

# Specify the folder containing the CSV files
folder_path = "news/news_2020"

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)

    # Read with pandas and cast every value to str so the regex-based cleanup and masking
    # steps always receive strings (missing cells become the string "nan").
    df = pd.read_csv(file_path).astype(str)
    new_dataset = Dataset.from_pandas(df)

    # Clean, filter and tokenize. preprocess_data() already ends with the tokenization pass,
    # so a separate tokenize_dataset() call would only repeat the same work.
    new_dataset = preprocess_data(new_dataset)

    # Make predictions
    predictions = make_predictions(model, new_dataset)

    # Save predictions along with the original text to a new CSV file
    output_csv = f"predictions_{csv_file}"
    save_predictions_to_csv(new_dataset, predictions, output_csv)
    print(f"Predictions saved to {output_csv}")
