In [1]:
#Step 1: Import Required Libraries
import pandas as pd
import json
import joblib
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import spacy



In [2]:
#Step 2: Load ESG Dataset
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ESG_daily_news.csv')
df=df.dropna(subset=['headline', 'text', 'Date']).reset_index(drop=True)
display(df)

Unnamed: 0,Date,headline,text
0,2022-11-28,Top-Ranked Hedge Fund Makes Contrarian Bet on ...,As most techology stocks reel from higher inte...
1,2022-11-27,Deutsche Bank’s DWS CEO Mulls New Legal Setup,DWS Group CEO Stefan Hoops is considering chan...
2,2022-11-24,"JPMorgan, Deutsche Bank Sued by Epstein Accusers",JPMorgan Chase & Co. and Deutsche Bank AG were...
3,2022-11-23,Tech Job Cuts Increase ‘Anxiety’ Across Industry,"After years of exuberant growth and hiring, la..."
4,2022-11-22,"Amundi, DWS Reclassify Funds in Major Industry...",Amundi and Deutsche Bank’s DWS Group are downg...
...,...,...,...
339,2021-10-28,Citi Pitches $1 Billion Social Bond Amid Race ...,Citigroup Inc. is returning to the social bond...
340,2021-10-26,Jet Fuel Surges in Price as Travel Restriction...,Jet fuel is back in a big way. The oil product...
341,2021-10-25,Rich Nations Fail to Meet Climate Target Befor...,Rich countries have failed to meet their pledg...
342,2021-10-24,Negotiators Edge Closer to Global Carbon Marke...,Nations are edging toward a deal that might cr...


In [3]:
#Install Dependancies
!pip install transformers
!pip install torch



In [4]:
#Preview Dataset
display(df)

Unnamed: 0,Date,headline,text
0,2022-11-28,Top-Ranked Hedge Fund Makes Contrarian Bet on ...,As most techology stocks reel from higher inte...
1,2022-11-27,Deutsche Bank’s DWS CEO Mulls New Legal Setup,DWS Group CEO Stefan Hoops is considering chan...
2,2022-11-24,"JPMorgan, Deutsche Bank Sued by Epstein Accusers",JPMorgan Chase & Co. and Deutsche Bank AG were...
3,2022-11-23,Tech Job Cuts Increase ‘Anxiety’ Across Industry,"After years of exuberant growth and hiring, la..."
4,2022-11-22,"Amundi, DWS Reclassify Funds in Major Industry...",Amundi and Deutsche Bank’s DWS Group are downg...
...,...,...,...
339,2021-10-28,Citi Pitches $1 Billion Social Bond Amid Race ...,Citigroup Inc. is returning to the social bond...
340,2021-10-26,Jet Fuel Surges in Price as Travel Restriction...,Jet fuel is back in a big way. The oil product...
341,2021-10-25,Rich Nations Fail to Meet Climate Target Befor...,Rich countries have failed to meet their pledg...
342,2021-10-24,Negotiators Edge Closer to Global Carbon Marke...,Nations are edging toward a deal that might cr...


In [7]:
#Set Up the NER Pipeline using finbert-ner
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

# Initialize NER pipeline
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Company extraction function
def extract_companies(text):
    entities = ner_pipe(text)
    orgs = [ent["word"] for ent in entities if ent["entity_group"] == "ORG"]
    return orgs if orgs else ["unknown"]

Device set to use cpu


In [9]:
#Apply to Headlines

from tqdm import tqdm
tqdm.pandas()

df["companies"] = df["headline"].progress_apply(extract_companies)
df["company"] = df["companies"].apply(lambda x: ", ".join(x))
df.to_csv("ESG_news_with_companies.csv", index=False)

100%|██████████| 344/344 [03:11<00:00,  1.80it/s]


In [10]:
#Preview the dataset
display(df)

Unnamed: 0,Date,headline,text,companies,company
0,2022-11-28,Top-Ranked Hedge Fund Makes Contrarian Bet on ...,As most techology stocks reel from higher inte...,[unknown],unknown
1,2022-11-27,Deutsche Bank’s DWS CEO Mulls New Legal Setup,DWS Group CEO Stefan Hoops is considering chan...,"[ Deutsche Bank, DWS]","Deutsche Bank, DWS"
2,2022-11-24,"JPMorgan, Deutsche Bank Sued by Epstein Accusers",JPMorgan Chase & Co. and Deutsche Bank AG were...,"[ JPMorgan, Deutsche Bank]","JPMorgan, Deutsche Bank"
3,2022-11-23,Tech Job Cuts Increase ‘Anxiety’ Across Industry,"After years of exuberant growth and hiring, la...",[unknown],unknown
4,2022-11-22,"Amundi, DWS Reclassify Funds in Major Industry...",Amundi and Deutsche Bank’s DWS Group are downg...,"[ Amundi, DWS]","Amundi, DWS"
...,...,...,...,...,...
339,2021-10-28,Citi Pitches $1 Billion Social Bond Amid Race ...,Citigroup Inc. is returning to the social bond...,[ Citi],Citi
340,2021-10-26,Jet Fuel Surges in Price as Travel Restriction...,Jet fuel is back in a big way. The oil product...,[unknown],unknown
341,2021-10-25,Rich Nations Fail to Meet Climate Target Befor...,Rich countries have failed to meet their pledg...,[unknown],unknown
342,2021-10-24,Negotiators Edge Closer to Global Carbon Marke...,Nations are edging toward a deal that might cr...,[unknown],unknown


In [12]:
# Create a unified content field for embedding
df['content'] = (
    "Company: " + df['company'].fillna('') + "\n"
    "Date: " + df['Date'].astype(str) + "\n"
    "Headline: " + df['headline'].fillna('') + "\n"
    "Text: " + df['text'].fillna('')
)

In [13]:
display(df)

Unnamed: 0,Date,headline,text,companies,company,content
0,2022-11-28,Top-Ranked Hedge Fund Makes Contrarian Bet on ...,As most techology stocks reel from higher inte...,[unknown],unknown,Company: unknown\nDate: 2022-11-28\nHeadline: ...
1,2022-11-27,Deutsche Bank’s DWS CEO Mulls New Legal Setup,DWS Group CEO Stefan Hoops is considering chan...,"[ Deutsche Bank, DWS]","Deutsche Bank, DWS","Company: Deutsche Bank, DWS\nDate: 2022-11-2..."
2,2022-11-24,"JPMorgan, Deutsche Bank Sued by Epstein Accusers",JPMorgan Chase & Co. and Deutsche Bank AG were...,"[ JPMorgan, Deutsche Bank]","JPMorgan, Deutsche Bank","Company: JPMorgan, Deutsche Bank\nDate: 2022..."
3,2022-11-23,Tech Job Cuts Increase ‘Anxiety’ Across Industry,"After years of exuberant growth and hiring, la...",[unknown],unknown,Company: unknown\nDate: 2022-11-23\nHeadline: ...
4,2022-11-22,"Amundi, DWS Reclassify Funds in Major Industry...",Amundi and Deutsche Bank’s DWS Group are downg...,"[ Amundi, DWS]","Amundi, DWS","Company: Amundi, DWS\nDate: 2022-11-22\nHead..."
...,...,...,...,...,...,...
339,2021-10-28,Citi Pitches $1 Billion Social Bond Amid Race ...,Citigroup Inc. is returning to the social bond...,[ Citi],Citi,Company: Citi\nDate: 2021-10-28\nHeadline: Ci...
340,2021-10-26,Jet Fuel Surges in Price as Travel Restriction...,Jet fuel is back in a big way. The oil product...,[unknown],unknown,Company: unknown\nDate: 2021-10-26\nHeadline: ...
341,2021-10-25,Rich Nations Fail to Meet Climate Target Befor...,Rich countries have failed to meet their pledg...,[unknown],unknown,Company: unknown\nDate: 2021-10-25\nHeadline: ...
342,2021-10-24,Negotiators Edge Closer to Global Carbon Marke...,Nations are edging toward a deal that might cr...,[unknown],unknown,Company: unknown\nDate: 2021-10-24\nHeadline: ...


In [15]:
#Generate Embeddings using SentenceTransformer
# Initialize sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

#Create embeddings from content

embeddings=model.encode(df['content'].tolist(), show_progress_bar=True)

#Save embeddings and metadata

joblib.dump(embeddings, 'esg_embeddings.pkl')
joblib.dump(df.to_dict(orient='records'), "esg_metadata.pkl")

print("✅ Embeddings and metadata saved.")

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

✅ Embeddings and metadata saved.
