# Negraph
This notebook was used for the research paper.
- It produces GPT estimates of UNGC Principle violations directly from news text, based on the raw regulation documentation.
- In the paper, this result is used as the baseline.

In [1]:
import pandas as pd
import json
import re
import numpy as np
import os
from tqdm import tqdm
from openai import AzureOpenAI


In [2]:
# Read the human-annotated labels from ../labeled_data/ungc_label_200.csv.
df_label = pd.read_csv('../labeled_data/ungc_label_200.csv')
# Turn a bracketed, space-separated label such as "[1 3 10]" into "1;3;10".
# split() with no argument collapses runs of whitespace, so padded values like
# "[ 1  3 10]" no longer produce empty tokens (";1;;3;10").
# A non-string label (e.g. a missing value) still raises here rather than being
# silently converted.
df_label['human_label'] = [
    ";".join(human_label.replace("[", "").replace("]", "").split())
    for human_label in df_label['human_label']
]
df_label.shape

(200, 2)

In [None]:
# Load the article corpus, then keep only the documents whose uuid appears in
# the label set, restricted to the columns used further down.
# NOTE(review): the recorded output shows 204 rows for 200 labels, so some
# uuids presumably occur more than once in the parquet file - confirm.
df_doc = pd.read_parquet('research_paper_data.parquet')
is_labelled = df_doc['uuid'].isin(df_label['uuid'])
df_doc = df_doc.loc[is_labelled, ['uuid', 'published', 'title', 'text', 'json_entities']].copy()
df_doc.shape

(204, 5)

In [None]:
# Model/deployment name, plus a variant with "-" and "." replaced by "_".
MODEL_NAME = "gpt-4o-mini-research"
MODEL_NAME_STR = re.sub(r"[-.]", "_", MODEL_NAME)

# Keyword arguments shared by every chat-completion request.
# temperature=0, top_p=0 and a fixed seed are presumably chosen to make the
# replies as repeatable as possible - confirm.
CHAT_COMPLETION_PARAMS = dict(
    model=MODEL_NAME,
    temperature=0,
    top_p=0,
    seed=42,
)

# Connection settings are read from the environment; a variable that is not
# set is passed to the client as None.
client = AzureOpenAI(
    api_version=os.environ.get('AZURE_API_VERSION'),
    api_key=os.environ.get('AZURE_API_KEY'),
    azure_endpoint=os.environ.get('AZURE_ENDPOINT'),
)


In [5]:
# Prompt text that lists the ten UNGC Principles and asks for the numbers of
# the violated principles, wrapped in braces.
# NOTE(review): the text contains typos ("manegement", "relevent", "UNPC"), the
# "JSON" example ({ 1, 3, 7, 10 }) is not valid JSON, and "known patterns" has
# no referent inside the prompt. The string is deliberately left byte-identical
# here because any edit may change the model's replies.
query_regulation = """
# Task: Extract the UNGC Principles Violations from News Articles

You are an expert in ESG risk manegement. 
Do not infer or assume meaning that is not explicitly present in the text.

Identify if news articles indicate potential violations in each of the 10 UNGC Principles. 
The goal is to provide a list of the UNGC Principles violated based on news article.

## Important Instructions:
- Output only in the format specified below
- If no relevent information found , return an empty array

## Description of UNGC Principles:
Principle 1 (Human Rights): Businesses should support and respect the protection of internationally proclaimed human rights.
Principle 2 (Human Rights): Make sure that they are not complicit in human rights abuses.
Principle 3 (Labour): Businesses should uphold the freedom of association and the effective recognition of the right to collective bargaining.
Principle 4 (Labour): The elimination of all forms of forced and compulsory labour.
Principle 5 (Labour): The effective abolition of child labour.
Principle 6 (Labour): The elimination of discrimination in respect of employment and occupation.
Principle 7 (Environment): Businesses should support a precautionary approach to environmental challenges.
Principle 8 (Environment): Undertake initiatives to promote greater environmental responsibility.
Principle 9 (Environment): Encourage the development and diffusion of environmentally friendly technologies.
Principle 10 (Anti-Corruption): Businesses should work against corruption in all its forms, including extortion and bribery.

## Expected JSON Output Format
{
    1, 3, 7, 10
}

## Final Notes:
- Prioritize accuracy over quantity of extractions
- You must only match text if it exactly or strongly resembles one of the known patterns.
- Do not infer or assume meaning that is not explicitly present in the text.
- Do not output anything except the matched patterns in the specified format.

# Follow these 2 steps : 
1. Identify the potential violation in each of the 10 UNGC Principles.
2. Output only the numbering of the UNGC Principles in the specified format.
    Ex: If UNPC 1, 3, 7, and 10 are violated, output:
```json
{
    1, 3, 7, 10
}
```
"""

In [6]:
# Query the model once per document and store the raw reply text in the
# 'ungc_patterns' column.
#
# The column is created only when it does not exist yet. Resetting it on every
# run (as before) made the skip-guard below unreachable, so an interrupted or
# partially failed run could never be resumed and every re-run re-queried all
# documents. To force a full re-run, drop the column first.
if 'ungc_patterns' not in df_doc.columns:
    df_doc['ungc_patterns'] = ''

for index, row in tqdm(df_doc.iterrows(), total=len(df_doc)):
    previous_reply = row['ungc_patterns']
    # Skip rows that already hold a real reply. The type check comes first so
    # that non-string cells (None, lists, arrays) are never compared to a str;
    # empty, 'ERROR' and non-string cells are all (re)queried.
    if isinstance(previous_reply, str) and previous_reply not in ('', 'ERROR'):
        continue
    # System message: the task prompt. User message: title, body and entities
    # of the article; falsy fields are rendered as an empty string.
    message_text = [
        {"role": "system",
         "content": query_regulation},
        {"role": "user", "content": f"""
        # Article
        {row['title'] if row['title'] else ''}
        {row['text'] if row['text'] else ''}

        # Entities
        {row['json_entities'] if row['json_entities'] else ''}
        
        # Output
        """}
    ]
    try:
        completion = client.chat.completions.create(
            messages=message_text,
            **CHAT_COMPLETION_PARAMS
            )
        # The reply is stored as returned; if the content is not a string the
        # row is queried again on the next run of this cell.
        df_doc.at[index, 'ungc_patterns'] = completion.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure, mark the row and carry on so one
        # failing request does not abort the whole batch. Rows marked 'ERROR'
        # are retried on the next run of this cell.
        print(f"Error processing document: {e}")
        df_doc.at[index, 'ungc_patterns'] = 'ERROR'

100%|██████████| 204/204 [01:51<00:00,  1.83it/s]


In [7]:
# Convert each raw model reply into a list of UNGC principle numbers (ints).
# The ";"-joined label string is not built here.
def _parse_principles(raw):
    """Return the distinct UNGC principle numbers (1-10) found in ``raw``.

    Parameters
    ----------
    raw : str, list, tuple, numpy.ndarray or other
        A reply string, an already parsed sequence of numbers, or anything
        else (for example None).

    Returns
    -------
    list of int
        Principle numbers in first-seen order, without duplicates. Values
        outside 1-10 are dropped because the prompt defines only ten
        principles. Inputs that are neither text nor a sequence give [].
    """
    if isinstance(raw, str):
        numbers = [int(token) for token in re.findall(r'\d+', raw)]
    elif isinstance(raw, (list, tuple, np.ndarray)):
        # Already parsed: pass the values through so that running this cell a
        # second time does not wipe every prediction.
        numbers = [int(value) for value in raw]
    else:
        return []
    # dict.fromkeys removes duplicates while keeping the original order.
    # NOTE(review): digits inside any free text the model adds (e.g. "the 10
    # principles") are still picked up; only the numeric range is validated.
    return [number for number in dict.fromkeys(numbers) if 1 <= number <= 10]


df_doc['ungc_patterns'] = df_doc['ungc_patterns'].apply(_parse_principles)

In [15]:
# One label per document: the principle numbers joined with ";", or NaN when
# the document has no numbers at all.
list_labels = [
    ";".join(map(str, principles)) if len(principles) != 0 else np.nan
    for principles in df_doc['ungc_patterns']
]

In [16]:
# Attach the per-document label values as a new 'ungc_labels' column.
df_doc['ungc_labels'] = list_labels

In [17]:
# Display the frame's dimensions as a quick check before saving.
df_doc.shape

(204, 7)

In [18]:
# Persist the documents together with the prediction columns as parquet;
# the DataFrame index is not written to the file.
df_doc.to_parquet(
    path='predicted_research_data_ungc_patterns_oneshot.parquet',
    index=False,
)