In [None]:
# Dependencies for the entity-indexing / sentiment-linking notebook.
import autorootcwd  # NOTE(review): imported only for its side effect — presumably moves the working directory to the project root so the relative data/ paths resolve; confirm
import chromadb
import pandas as pd
import os
import json
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
from tqdm import tqdm

# Load environment variables before any cell reads them (HUGGING_FACE_KEY is read from os.environ further down).
load_dotenv()

In [2]:
# Load the NER results and keep only complete rows labelled as a person (PER)
# or an organisation (ORG).
entities = (
    pd.read_csv('data/processed_dataset/ambivalent/ner.csv')
    .loc[lambda frame: frame['label'].isin(['PER', 'ORG'])]
    .dropna()
)
entities

Unnamed: 0,article_id,start_pos,end_pos,name,label,confidence
0,a5164fe0-f670-4168-9ca8-3ce023d42fed,483,516,Upper Austrian Chamber of Doctors,ORG,0.864500
1,a5164fe0-f670-4168-9ca8-3ce023d42fed,605,608,OÖN,ORG,0.999800
2,a5164fe0-f670-4168-9ca8-3ce023d42fed,828,863,"Wolfgang Ziegler,Kurienobmann-Stell",PER,0.979083
3,a5164fe0-f670-4168-9ca8-3ce023d42fed,913,935,OÖ Medical Association,ORG,0.945547
4,a5164fe0-f670-4168-9ca8-3ce023d42fed,1170,1189,mayorChristian Graf,PER,0.859976
...,...,...,...,...,...,...
32964,8e6c9db9-8ca1-421d-9923-c3ff77243ce1,2105,2136,House of Digitization Hergovich,ORG,0.981821
32965,8e6c9db9-8ca1-421d-9923-c3ff77243ce1,2699,2708,Hergovich,PER,0.999710
32966,8e6c9db9-8ca1-421d-9923-c3ff77243ce1,2719,2723,SSPÖ,ORG,0.988402
32967,8e6c9db9-8ca1-421d-9923-c3ff77243ce1,2891,2894,ÖVP,ORG,0.999398


In [4]:
# Connect to the Chroma server expected at localhost:8000.
client = chromadb.HttpClient(
    host='localhost',
    port=8000,
)

In [5]:
# Read the Hugging Face API token from the environment; a missing variable
# raises KeyError here rather than failing later inside the embedding call.
hugging_face_key = os.environ['HUGGING_FACE_KEY']

# Embedding function backed by the all-MiniLM-L6-v2 sentence-transformer.
embed_function = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=hugging_face_key,
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once "entities_ambivalent" already exists on the server.
collection = client.get_or_create_collection(
    name="entities_ambivalent",
    embedding_function=embed_function,
    metadata={"hnsw:space": "cosine"},  # index with cosine distance
)

In [None]:
# Parallel lists in the shape expected by collection.add: one entry per entity.
documents, metadatas, ids = [], [], []

for _, entity in tqdm(entities.iterrows()):
    doc_id = str(entity["article_id"])
    span_start = entity["start_pos"]
    span_end = entity["end_pos"]

    # The entity surface form is the text that gets embedded.
    documents.append(entity["name"])
    metadatas.append(
        {
            "article_id": doc_id,
            "from": span_start,
            "to": span_end,
            "label": entity["label"],
        }
    )
    # Record id is built from the article id and the character span.
    ids.append(f"{doc_id}_{span_start}_{span_end}")

In [None]:
import time  # not used in this cell; kept in case other cells rely on the name

# Upload the entities in fixed-size batches instead of one large request.
batch_size = 1000
for i in range(0, len(ids), batch_size):
    batch_end = min(i + batch_size, len(ids))
    collection.add(
        documents=documents[i:batch_end],
        metadatas=metadatas[i:batch_end],
        ids=ids[i:batch_end]
    )
    # Report the running total; printing the start offset would under-report
    # by one batch (e.g. "Added 0 entities" after the first upload).
    print(f'Added {batch_end} entities')

In [57]:
# Fetch a handle to the existing "entities_negative" collection.
# NOTE(review): the following cell rebinds `collection` to "entities_ambivalent",
# so this handle is only in effect if that cell is skipped.
collection = client.get_collection(name="entities_negative")

In [20]:
# Link every (article, actor, sentiment) row to the stored entity mentions of
# that article whose text is close enough to the actor name.
# NOTE(review): no embedding_function is passed here, unlike at creation time —
# confirm the client embeds query_texts with the same model.
collection = client.get_collection('entities_ambivalent')
sentiments_df = pd.read_csv("data/processed_dataset/ambivalent/sentiments.csv")

# Largest query distance at which a stored entity still counts as a mention.
max_distance = 0.32
# Numeric polarity codes; any other sentiment value falls back to 0.
polarity_by_sentiment = {"positive": 1, "negative": -1, "ambivalent": 2}

output = []
errors = []         # row indices that failed
error_details = {}  # row index -> repr of the exception, for later inspection
for index, row in tqdm(sentiments_df.iterrows()):
    try:
        article_id = row["id"]
        actor = row["actor"]
        sentiment = row["sentiment"]
        # Only entities stored for this article are candidates.
        query_result = collection.query(
            query_texts=[actor],
            n_results=3,
            where={"article_id": article_id}
        )

        # One query text was sent, so the results sit at index 0.
        mentions = [
            {
                "from": metadata["from"],
                "to": metadata["to"],
                "mention": entity
            }
            for entity, metadata, distance in zip(
                query_result["documents"][0],
                query_result["metadatas"][0],
                query_result["distances"][0],
            )
            if distance < max_distance
        ]

        # Rows without any sufficiently close entity are dropped.
        if mentions:
            output.append({
                "id": article_id,
                "name": actor,
                "polarity": polarity_by_sentiment.get(sentiment, 0),
                "mentions": mentions
            })
    except Exception as exc:
        # Best-effort: remember the failing row and keep going. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        errors.append(index)
        error_details[index] = repr(exc)
        continue

232it [01:01,  3.47it/s]

In [9]:
import json

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII characters
# readable in the UTF-8 file instead of escaping them.
with open('data/processed_dataset/ambivalent/output_ambivalent.txt', 'w', encoding='utf-8') as sink:
    sink.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in output
    )

In [None]:
from source.output_formatter import OutputFormatter

# Formatter used to convert the linked output into downstream dataset formats.
output_formatter = OutputFormatter()

# header=None: the first row of articles.csv is read as data, not as column names.
articles_df = pd.read_csv(
    "data/processed_dataset/ambivalent/articles.csv",
    header=None,
)

In [16]:
from source.output_formatter import OutputFormatter

output_formatter = OutputFormatter()

# Both converters take file paths: the JSON-lines output written earlier and the articles CSV.
output_path = "data/processed_dataset/ambivalent/output_ambivalent.txt"
articles_path = "data/processed_dataset/ambivalent/articles.csv"
# articles_df.columns = ['article_id', 'text_german', 'text_english']

# d1: records from the ABSA-PyTorch adapter; d2: records from the NewsMTSC adapter.
d1 = output_formatter.adapt_output_format_for_absa_pytorch(
    output_path,
    articles_path,
)
d2 = output_formatter.adapt_output_format_for_newsmtsc(
    output_path,
    articles_path,
)

In [19]:
# Persist the ABSA-PyTorch adapter records as JSON Lines (one object per line).
with open("data/processed_dataset/ambivalent/ambivalent_absa_pytorch.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in d1)

In [5]:
def adapt_output_format(output, articles_df):
    """Convert linked actor records into single-target records with article text.

    Args:
        output: Iterable of dicts with the keys "id", "polarity" and
            "mentions"; every mention is a dict with "from", "to" and
            "mention".
        articles_df: DataFrame with at least the columns "article_id" and
            "text_english".

    Returns:
        A list of dicts with the keys "primary_gid", "sentence_normalized"
        and "targets". "targets" holds exactly one target, built from the
        mention with the smallest "from"; all other mentions are listed under
        its "further_mentions", ordered by "from". Items whose article id is
        not present in ``articles_df`` or that have no mentions are skipped.
    """
    adapted_output = []
    for item in output:
        article_id = item["id"]

        # Look the article up once; skip records we cannot attach a text to.
        texts = articles_df.loc[articles_df["article_id"] == article_id, "text_english"]
        if texts.empty or not item["mentions"]:
            continue
        sentence_normalized = texts.values[0]

        # The earliest mention is the target; the rest become further mentions.
        # (Previously the target was read from an undefined name `mention`.)
        sorted_mentions = sorted(item["mentions"], key=lambda m: m["from"])
        primary, *further = sorted_mentions

        target = {
            "Input.gid": f"{article_id}_{primary['from']}_{primary['to']}_{primary['mention']}",
            "from": primary["from"],
            "to": primary["to"],
            "mention": primary["mention"],
            "polarity": item["polarity"],
            "further_mentions": [
                {
                    "from": m["from"],
                    "to": m["to"],
                    "mention": m["mention"]
                }
                for m in further
            ]
        }

        # NOTE(review): primary_gid is built from the first mention in the
        # original (unsorted) order, so it can differ from the target's
        # "Input.gid" — confirm that this is intended.
        first_listed = item["mentions"][0]
        adapted_item = {
            "primary_gid": f"{article_id}_{first_listed['from']}_{first_listed['to']}_{first_listed['mention']}",
            "sentence_normalized": sentence_normalized,
            "targets": [target]
        }
        adapted_output.append(adapted_item)

    return adapted_output

In [8]:
import json

# Read the articles table (header=None: the first row is data) and name its three columns.
articles_df = pd.read_csv(
    "data/processed_dataset/negative/articles.csv",
    header=None,
).set_axis(['article_id', 'text_german', 'text_english'], axis=1)

# Reshape the linked records currently held in `output` and attach the English article text.
adapted_output = adapt_output_format(output=output, articles_df=articles_df)

# Persist the adapted records as JSON Lines (one object per line).
with open("data/processed_dataset/negative/negative.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in adapted_output)
