In [2]:
# Download and install the `fr_core_news_lg` spaCy pipeline into the current environment.
# The installer output below advises restarting the kernel/runtime before loading the package.
!python -m spacy download fr_core_news_lg

Collecting fr-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl (571.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.8/571.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-lg
Successfully installed fr-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import ast
from collections import Counter

import pandas as pd
import spacy
from spacy.lang.fr import French
from spacy.pipeline import EntityRuler
from tqdm import tqdm

In [5]:
# Input locations -- replace both with the paths to your own CSV files.
training_file_path = 'drive/MyDrive/greenwashing/train_dataset/translated_claims_train.csv'
eval_file_path = 'drive/MyDrive/greenwashing/ner_evaluation/ner_evaluation_dataset.csv'

# Training claims, and the evaluation set that carries the 'text' and
# 'expected_companies' columns used for scoring.
training_df = pd.read_csv(training_file_path)
eval_df = pd.read_csv(eval_file_path)

In [6]:
# Preview the first rows of the training data (columns shown: text, translated_text).
training_df.head()

Unnamed: 0,text,translated_text
0,The project will make a significant contributi...,Le projet contribuera de manière significative...
1,"So in -- first of all, the segments, we are --...","Ainsi, dans -- tout d'abord, les segments, nou..."
2,Building a smarter and stronger energy grid th...,La mise en place d'un réseau énergétique plus ...
3,"Generally, first of all our Transmission depar...","D'une manière générale, tout d'abord, notre dé..."
4,"But for the most part, our challenge is showin...","Mais pour l'essentiel, notre défi consiste à l..."


In [7]:
# Preview the first rows of the evaluation data (columns shown: text, expected_companies).
eval_df.head()

Unnamed: 0,text,expected_companies
0,Coca‑Cola en France soutient et encourage la c...,"[""Coca‑Cola""]"
1,"Légers, durables, respectueux de la nature, il...",[]
2,SUEZ permet à ses clients de fournir l’accès à...,"[""SUEZ""]"
3,L’Oréal a adopté ses principes pour promouvoir...,"[""Oréal""]"
4,Nestlé a consenti d’importants efforts pour ré...,"[""Nestlé""]"


In [8]:
# Convert expected_companies from its CSV string form (e.g. '["SUEZ"]') to a real list.
# ast.literal_eval only accepts Python literals, so -- unlike eval() -- a crafted CSV
# cell cannot execute code. Values that are no longer strings (already parsed) are
# passed through unchanged, which keeps this cell safe to re-run.
eval_df['expected_companies'] = eval_df['expected_companies'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [14]:
# Load the French spaCy pipeline and add an entity ruler component to it
# (configured with overwrite_ents=True).
nlp = spacy.load("fr_core_news_lg")
entity_ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})

# One ORG string pattern per hard-coded company name.
patterns = [
    {"label": "ORG", "pattern": company}
    for company in ("Apple", "Google", "Microsoft", "Tesla", "SpaceX")
]
entity_ruler.add_patterns(patterns)

def extract_companies(text):
    """Return the text of every entity labelled "ORG" found in `text`.

    The text is processed with the module-level `nlp` pipeline. Entities are
    returned in the order they appear in `doc.ents`; duplicates are kept.
    """
    organisations = []
    for entity in nlp(text).ents:
        if entity.label_ == "ORG":
            organisations.append(entity.text)
    return organisations

In [16]:
def calculate_accuracy(df):
    """Return the fraction of rows whose extracted companies exactly match the expected ones.

    A row counts as correct when the *sets* built from its 'expected_companies'
    and 'extracted_companies' values are equal, so order and duplicates are
    ignored. A falsy extraction (e.g. None or an empty list) is treated as an
    empty set, matching calculate_precision_recall.

    Args:
        df: DataFrame with list-like 'expected_companies' and
            'extracted_companies' columns.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 for an empty DataFrame instead of
        raising ZeroDivisionError.
    """
    total = len(df)
    if total == 0:
        # Nothing to score: mirror the zero-denominator convention used for
        # precision/recall rather than dividing by zero.
        return 0.0

    # Iterate the two columns directly; cheaper than building a Series per row.
    correct = sum(
        set(expected) == set(extracted or ())
        for expected, extracted in zip(df['expected_companies'], df['extracted_companies'])
    )
    return correct / total

In [17]:
# Precision and recall computed from counts pooled over all rows
def calculate_precision_recall(df):
    """Return (precision, recall) for the extracted companies in `df`.

    For each row, the 'expected_companies' and 'extracted_companies' values are
    turned into sets (a falsy extraction counts as an empty set). True
    positives, false positives and false negatives are summed across all rows
    before dividing, so the result is pooled over the whole frame rather than
    averaged per row. Either value is 0 when its denominator is zero.
    """
    tp_sum = 0
    fp_sum = 0
    fn_sum = 0

    for gold, found in zip(df['expected_companies'], df['extracted_companies']):
        expected = set(gold)
        extracted = set(found or ())

        tp_sum += len(expected & extracted)
        fp_sum += len(extracted - expected)
        fn_sum += len(expected - extracted)

    predicted_count = tp_sum + fp_sum  # everything the extractor returned
    expected_count = tp_sum + fn_sum   # everything it should have returned

    if predicted_count > 0:
        precision = tp_sum / predicted_count
    else:
        precision = 0

    if expected_count > 0:
        recall = tp_sum / expected_count
    else:
        recall = 0

    return precision, recall

In [18]:
# --- Training data: extract companies from the translated text, then persist ---
training_df['extracted_companies'] = list(map(
    extract_companies,
    tqdm(training_df['translated_text'], desc="Extracting companies from training data"),
))
training_df.to_csv('drive/MyDrive/greenwashing/ner_evaluation/training_dataset_with_extracted_companies.csv', index=False)
print("Extracted companies from training data saved to training_dataset_with_extracted_companies.csv")

# --- Evaluation data: extract companies from the 'text' column ---
eval_df['extracted_companies'] = list(map(
    extract_companies,
    tqdm(eval_df['text'], desc="Extracting companies from evaluation data"),
))

# Score the extraction against 'expected_companies', then report the metrics.
row_wise_accuracy = calculate_accuracy(eval_df)
row_wise_precision, row_wise_recall = calculate_precision_recall(eval_df)
print(f"Row-wise Accuracy: {row_wise_accuracy:.2f}")
print(f"Row-wise Precision: {row_wise_precision:.2f}")
print(f"Row-wise Recall: {row_wise_recall:.2f}")

# Persist the evaluation frame together with the extracted companies.
eval_df.to_csv('drive/MyDrive/greenwashing/ner_evaluation/evaluation_output_with_extracted_companies.csv', index=False)
print("Extracted companies from evaluation data saved to evaluation_output_with_extracted_companies.csv")

Extracting companies from training data: 100%|██████████| 2117/2117 [00:31<00:00, 67.03it/s]


Extracted companies from training data saved to training_dataset_with_extracted_companies.csv


Extracting companies from evaluation data: 100%|██████████| 49/49 [00:00<00:00, 91.33it/s]


Row-wise Accuracy: 0.82
Row-wise Precision: 0.96
Row-wise Recall: 0.74
Extracted companies from evaluation data saved to evaluation_output_with_extracted_companies.csv
