In [None]:
# Dependencies - install once in the kernel's environment if they are missing:
#   %pip install torch torchvision
#   %pip install transformers pandas tqdm

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity
import re
from tqdm.auto import tqdm
import os

In [2]:
# Read the scraped articles into a DataFrame and preview the first rows
articles_csv = "C:/Users/Thodoris/Documents/IR_Climate_Change/files/articles_V2.csv"
data = pd.read_csv(articles_csv)
data.head()

Unnamed: 0,link,title,date,category,summary,main_text,headers
0,https://news.un.org//en/story/2023/10/1142562,"Somalia: Insecurity worsens, civilians pay the...",19 October 2023,Peace and Security,"Insecurity persists in Somalia, with extremist...","Briefing Ambassadors in New York, UN Special R...","Women, Peace and Security  Humanitarian crisi..."
1,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 October 2023,Climate and Environment,Sand and dust storms are increasingly threaten...,The UN World Meteorological Organization (WMO)...,Hundreds killed and injured
2,https://news.un.org//en/story/2023/10/1142317,"Cutting disaster risk will boost equality, imp...",13 October 2023,Climate and Environment,Marking the International Day for Disaster Ris...,"According to various estimates, up to 75 per c...",
3,https://news.un.org//en/story/2023/10/1142302,Tackling disasters means safer and fairer futu...,13 October 2023,Climate and Environment,Reducing the risk of disasters will not only s...,"Earthquakes, hurricanes, volcanic activity and...",UN Resident Coordinator:
4,https://news.un.org//en/story/2023/10/1142227,WMO: Global stocktake of water resources needed,12 October 2023,Climate and Environment,The hydrological cycle is spinning out of bala...,Destructive droughts and heavy rains are causi...,


In [3]:
# 1) Handling Missing Values and Dropping Columns
# Rows without a 'summary' or 'main_text' are removed.
data = data.dropna(subset=['summary', 'main_text'])
# Only the 'headers' column is dropped; 'link' is kept.
# errors='ignore' keeps this cell re-runnable once 'headers' is already gone.
data = data.drop(columns=['headers'], errors='ignore')

# 2) Text Normalization
def clean_text(text):
    """Normalize a piece of article text.

    The text is lower-cased, non-ASCII characters are removed, punctuation and
    other special characters are removed, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace trimmed.

    Args:
        text: The string to clean. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        str: The cleaned text, or '' when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lower case
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    # Collapse whitespace last: deleting a free-standing symbol ("a - b")
    # would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalize every free-text column with the same cleaner
for text_column in ('title', 'summary', 'main_text', 'category'):
    data[text_column] = data[text_column].apply(clean_text)

# 3) Only keep Climate and Environment Category
is_climate_article = data['category'] == 'climate and environment'
data = data[is_climate_article]

# Displaying the cleaned data
data.head()


Unnamed: 0,link,title,date,category,summary,main_text
1,https://news.un.org//en/story/2023/10/1142552,world news in brief sandstorm alert albinism a...,19 October 2023,climate and environment,sand and dust storms are increasingly threaten...,the un world meteorological organization wmo s...
2,https://news.un.org//en/story/2023/10/1142317,cutting disaster risk will boost equality impr...,13 October 2023,climate and environment,marking the international day for disaster ris...,according to various estimates up to 75 per ce...
3,https://news.un.org//en/story/2023/10/1142302,tackling disasters means safer and fairer futu...,13 October 2023,climate and environment,reducing the risk of disasters will not only s...,earthquakes hurricanes volcanic activity and o...
4,https://news.un.org//en/story/2023/10/1142227,wmo global stocktake of water resources needed,12 October 2023,climate and environment,the hydrological cycle is spinning out of bala...,destructive droughts and heavy rains are causi...
5,https://news.un.org//en/story/2023/10/1142212,in iceland un deputy chief says all conflict ...,11 October 2023,climate and environment,deputy secretarygeneral amina mohammed has wra...,during her visit which concluded on tuesday ms...


In [4]:
# Load the pre-trained BERT tokenizer and model from the same checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to get BERT embeddings with truncation for long texts
def get_bert_embedding(text):
    """Embed `text` with BERT and return the final hidden state of its first token.

    Args:
        text: Text to embed. Inputs longer than 512 tokens are truncated.

    Returns:
        torch.Tensor: `outputs.last_hidden_state[:, 0, :]` - the last-layer
        vector at position 0 (the [CLS] token for BERT tokenizers) for each
        sequence in the call.
    """
    # padding=True pads only up to the longest sequence in this call, so a
    # single short text (such as a search query) is not inflated to 512 tokens.
    # Padded positions are excluded through the attention mask, so padding to
    # the full length only costs compute.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():  # inference only: no gradients are needed
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]

# Function to save embeddings to a file
def save_embeddings(data, filename):
    """Serialize `data` to the file `filename` with `torch.save`."""
    torch.save(obj=data, f=filename)

# Function to load embeddings from a file
def load_embeddings(filename):
    """Load an object that was previously written with `torch.save`.

    Args:
        filename: Path of the file to read.

    Returns:
        The deserialized object (in this notebook, the pandas Series of
        embedding tensors produced when the embeddings were computed).
    """
    # The cached object is a pickled pandas Series, not a plain tensor/state
    # dict, so it cannot be read with weights_only=True (the default on recent
    # PyTorch releases).
    # NOTE(security): weights_only=False performs full unpickling, which can
    # execute arbitrary code - only load cache files you created yourself.
    return torch.load(filename, weights_only=False)

# Location of the cached embeddings on disk
embeddings_file = r'C:\Users\Thodoris\Documents\IR_Climate_Change\files\new_articles_embeddings.pt'

# Reuse the cached embeddings whenever the file is present; otherwise compute
# them for every article and write the cache. The cache is used as-is: nothing
# here checks that it still matches the current `data`.
if os.path.isfile(embeddings_file):
    embeddings = load_embeddings(embeddings_file)
else:
    tqdm.pandas(desc="Computing Embeddings")
    embeddings = data['main_text'].progress_apply(get_bert_embedding)
    save_embeddings(embeddings, embeddings_file)


In [5]:
# Function to compute cosine similarity and return a numerical value
def compute_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain Python float."""
    similarity = cosine_similarity(embedding1, embedding2, dim=1)
    # .item() unwraps the one-element result tensor into a Python number
    return similarity.item()

# Search function
def search(query, embeddings, top_k=5):
    """Return the articles whose embeddings are most similar to `query`.

    Args:
        query: Free-text search query; it is embedded with `get_bert_embedding`.
        embeddings: pandas Series of article embeddings. Its index labels are
            used to look rows up in the global `data` frame, so they must be
            labels of `data`.
        top_k: Number of best-matching articles to return (default 5).

    Returns:
        pandas.DataFrame: The matching rows of `data`, most similar first.
    """
    query_embedding = get_bert_embedding(query)
    # One cosine score per article, keyed by the article's index label
    similarities = embeddings.apply(
        lambda emb: compute_similarity(emb, query_embedding)
    ).astype(float)
    best_matches = similarities.nlargest(top_k).index
    return data.loc[best_matches]

In [16]:
# Example usage: retrieve the articles closest to a one-word query
query = "forest"
search_results = search(query=query, embeddings=embeddings)

In [15]:
# Rich notebook display instead of print(): the plain-text repr wraps the
# wide columns across several blocks.
search_results

                                               link  \
602   https://news.un.org//en/story/2021/08/1098212   
696   https://news.un.org//en/story/2021/03/1087592   
1143  https://news.un.org//en/story/2018/11/1025591   
459   https://news.un.org//en/story/2021/12/1108272   
607   https://news.un.org//en/story/2021/08/1097912   

                                                  title              date  \
602   from the field extinguishing the practice of c...    22 August 2021   
696             from the field saving corals in crisis      19 March 2021   
1143  united states un chief deeply saddened by dead...  13 November 2018   
459   champions of the earth mia mottley versus the ...  23 December 2021   
607   from the field human damage on the frontline o...    16 August 2021   

                     category  \
602   climate and environment   
696   climate and environment   
1143  climate and environment   
459   climate and environment   
607   climate and environment   

       

In [None]:
# Initialize the tokenizer from the pre-trained checkpoint
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# Example text
text = "The UN World Meteorological Organization (WMO) said in a new report on Thursday that every year, around 2,000 million tonnes of dust enter the atmosphere, â€œdarkening skies and harming air quality in regions that can be thousands of kilometres awayâ€.   WMO chief Petteri Taalas said this was partly due to poor water and land management. The phenomenon was also exacerbated by higher temperatures and drought brought on by a warming climate, leading to higher evaporation and drier soils.   WMO said that exposure to dust particles has been associated with heart attacks, cardiovascular disease and lung cancer. Sand and dust storms also pose risks to aviation and ground transportation as well as agriculture.   According to WMO in 2022, hotspots with significantly higher dust concentrations were identified in Central and South America, most of Central Africa, Spain, the Red Sea, the Arabian Peninsula, as well as in Iran, south Asia and northwest China.   Prof. Taalas stressed WMOâ€™s commitment to help countries improve dust storm forecasting skills and early warning services. He also underscored that more needed to be done in the face of continuing environmental degradation and fast-advancing climate change.   And staying with climate change: its impacts on skin cancer in people with albinism are both deadly and largely overlooked, a UN-appointed independent rights expert said on Thursday.   Muluka-Anne Miti-Drummond, the Special Rapporteur on albinism issues, said that in Africa alone, persons with albinism are up to 1,000 times more likely to develop skin cancer, with many dying by the age of 40.   She underscored she has campaigned tirelessly for sunscreen to be made freely available to persons with albinism, as a â€œlife-saving medical product that can prolong and improve the quality of life for many who don't have the means to afford itâ€.   
People with albinism also have visual impairment, the expert said, and as such are disproportionately affected by climate-related disasters.   Ms. Miti-Drummond called for the inclusion of people with albinism in all fora related to climate change and disaster management, insisting that for many of them, climate change is â€œa matter of life and deathâ€.   Peruvian authorities must undertake meaningful reforms to ensure human rights are respected during protests and demonstrations, following an alarming increase in the use of force.   Independent human rights experts issued the alert in a new report on Thursday calling for â€œdecentralized and inclusive national dialogueâ€.   The report focuses on the conduct of security forces during nationwide protests between December 2022 and March 2023.   It concludes that Peruvian authorities unduly restricted demonstratorsâ€™ human rights.   Security forces used unnecessary and disproportionate force, including lethal force, outside of the circumstances permitted by international human rights standards, the report states.   It also documents the use of less lethal weapons, incompatible with international standards, that resulted in protesters being seriously and - in some cases - fatally injured.   Rights office OHCHR, recorded that 50 people were killed and 821 injured in the context of protests from 7 December to 31 March, allegedly by security forces. some 208 members of the security forces were injured.   Criminal investigations were opened against 241 people who took part in the demonstrations. Of these, at least 221 have since been closed due to lack of evidence. This includes 192 people who had been arrested in San Marcos University in Lima on 21 January.   In April 2023, the authorities set up a dedicated team within the Public Prosecutorâ€™s Office to investigate alleged crimes committed in the context of the protests.   
â€œThose responsible for human rights violations must be held accountable, through fair judicial proceedings,â€ UN rights chief Volker TÃ¼rk said. â€œAccountability is paramount if we want to start healing wounds and creating trust in the institutions of the State. Victims should be provided with full reparation.â€   â€œIt is paramount that the grievances and concerns across the whole spectrum of the Peruvian society are addressed. An inclusive national dialogue is needed.This is the only way forward. Everyone needs to feel heard and represented in society to stop endless political and social crises,â€ Mr. TÃ¼rk said"

# Split the example text into BERT word-piece tokens
tokens = tokenizer.tokenize(text)

# Report how many tokens the text produces
token_count = len(tokens)
print("Total tokens:", token_count)

In [None]:
# Write the cleaned, filtered articles to disk. The frame built in this
# notebook is named `data`; index=False omits the index column from the CSV.
data.to_csv('../files/preprocessed_articles.csv', index=False)