In [1]:
# install and import the necessary modules
# I found the reference to DistilBert in this Huggingface article: https://huggingface.co/blog/sentiment-analysis-python
# Also on the documentation of Huggingface.co; https://huggingface.co/transformers/v3.0.2/model_doc/distilbert.html
import json
# we need to install Pytorch and transformers since they are not preinstalled
# For this project, we will use sentence transformer and the pretrained distilbert model
# ! pip install torch 
import torch
# ! pip install transformers
from transformers import DistilBertTokenizer, DistilBertModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re

# Renewable Energy Innovations article summary
1. The data is from https://storage.googleapis.com/ds--tasks-datasets/renewable-energy.zip and should be uploaded into the data folder within the same folder of this file to run the code.
2. We will use built in model from DistilBERT for this project
3. According to the definition of sentence transformer, senetence embeddings calculation is optimized in the [sentence transformer models](https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b). Therefore, we will utilize the sentence transformers and the pre-trained DistilBert for this task.
4. Code is inspired from: https://www.pinecone.io/learn/series/nlp/sentence-embeddings/
5. In the project, the key objectives are to extract the key sentences from each article and also compare the similaries and differences between two articles

### First, we need to preprcoess the data and read the file
* The goal is to read each .txt file into a list and remove unncessary special character or empty space
* The data is downloaded and stored in the data folder

In [2]:
# We need to clean the text and remove unncessary special character and whitespace 
# We can also re

def clean_data(text):
    text = text.replace("\n","").replace("##","").replace("*","").strip()
    # remove Sources and hyperlink
    pattern = r'\[[^\]]+\]\(([^)]+)\)'
    text_without_link = re.sub(pattern, '', text)
    text_without_sources = re.sub(r'Sources:.*',"",text_without_link)
    return text_without_sources

In [3]:
# We will read the .txt file by name, clean the data and stored in a list

def read_data_into_list(articleId: str):
    with open(f"data/renewable-energy/{articleId}.txt", "r", encoding="utf-8") as file:
        # read the first line(the header)
        first_line = file.readline()
        # read the remaining of the .txt file
        lines = file.readlines()
    # join the header and the body by "." so that it can be split into list later
    full_article = "".join(first_line) + "." + "".join(lines)
    
    # clean the full_article text file
    #print(clean_data(full_article))
    # check if there is hyperlink by substitue
    article = clean_data(full_article).split(".")
    
    # remove the extracwhite space and empty string in the article 
    clean_article = []
    for i in article:
        if i != "":
            clean_article.append(i.strip())
    #print(clean_article)
    return clean_article


### The next step is to use the sentence-transformers/msmarco-distilbert-base-tas-b model to generate the senetence embeddings and cosine similarity scores
* References for sentence-transformers
* [HuggingFace](https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b)
* [BERT for Sentence Similarity](https://www.pinecone.io/learn/series/nlp/sentence-embeddings)
* There is a depreciated model sentence-transformers/distilbert-base-nli-mean-tokens that HuggingFace does not recommend. [Documentation](https://huggingface.co/sentence-transformers/distilbert-base-nli-mean-tokens)
* We will use the average similarities for a sentence across all senetence to determine the significance for the senetence
* We generate  key sentences from each article(without considering hyperlinks and including headers)

In [4]:
#Load the model according to HuggingFace documentation(https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b)

model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

# After this step, I tested how this model works using the above reference articles in test2()


### Then we implement the extractKeySentences(articleId: str) function to obtain the top 5 most important senetences for each article


In [37]:
# Next, let's implement first function: extractKeySentences(articleId: str)
# This function takes an articleid and returns the key sentence extracted from the article
# References: https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b
# (https://www.pinecone.io/learn/series/nlp/sentence-embeddings)
# I already tested the code in test2() function below so just move things in here

def extractKeySentences(articleId: str):
    
    # First, read in the cleaned sentences
    sentences = read_data_into_list(articleId)

    # Generate embedding
    embeddings = model.encode(sentences)

    # Generate the similarity matrix
    sim = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        sim[i:,i] = util.cos_sim(embeddings[i], embeddings[i:])

    # We need a complete  similarity matrix to calculate the the average similarity of a sentence with regards to all other senetnces
    sim += sim.T - np.diag(sim.diagonal())

    # Calculate the mean for each senetence
    mean_matrix = np.mean(sim, axis =0)

    # rank it based on the highest average similarities with regards to all sentences
    # reference: https://www.geeksforgeeks.org/how-to-use-numpy-argsort-in-descending-order-in-python/
    desc_index = np.argsort(-mean_matrix)

    # decode back to the sentence, get top 5 senetnces
    key_sentences = []
    for i in desc_index[0:5]:
        key_sentences.append(sentences[i-1])
    
    #generate response in the required JSON format
    response ={
        "articleId": articleId,
        "keySentences": key_sentences
    }
    return json.dumps(response)
    




### We then use the above extractKeySentences(articleId: str) function to compare the similarities and differences between two articles.

In [51]:
# 2. Implement compareArticles(articleId1: str, articleId2: str) function
# To compare the similaries and differences between 2 articles, we use the result from the extractKeySentences() function above
# We compute the similarity score for the 5 key sentences between the two articles and rank them based on how similar/different they are
def compareArticles(articleId1: str, articleId2: str):
    #First, we obtain 5 key sentences from the two articles
    key_sentences1 = json.loads(extractKeySentences(articleId1))["keySentences"]
    key_sentences2 = json.loads(extractKeySentences(articleId2))["keySentences"]

    # Then the same steps as the first function, we compute the sentence embedding for the 3 sentences
    # the same references apply
    # First, we encode both group of key senetences
    embeddings1 = model.encode(key_sentences1)
    embeddings2 = model.encode(key_sentences2)

    # Generate the similarity matrix
    sim = np.zeros((len(key_sentences1), len(key_sentences2)))
    for i in range(len(key_sentences1)):
        for j in range(len(key_sentences2)):
            similarity = util.cos_sim(embeddings1[i], embeddings2[j])
            sim[i, j] = similarity
            #print(f"Article 1 Sentence {i+1} has similarity {similarity} with Article 2 Sentence {j+1}")

    #print(sim)
    #print("max_sim",np.unravel_index(np.argmax(sim), sim.shape))
    # The similarity matrix has similar value, so we print out the most similar and most different
    max_index = np.unravel_index(np.argmax(sim), sim.shape)
    similar_sentence = [key_sentences1[max_index[0]],key_sentences2[max_index[1]]]
    min_index = np.unravel_index(np.argmin(sim), sim.shape)
    different_sentence = [key_sentences1[min_index[0]],key_sentences2[min_index[1]]]

    # Generate the response
    response ={
        "articleId1": articleId1,
        "articleId2": articleId2,
        "MostSimilar": similar_sentence,
        "MostDifferent": different_sentence
    }

    return json.dumps(response)
    

## Conclusion
* The similarity comparison between articles works well; however, since the articles have similar topics, the difference between then are minor
* We summarized based on the key senetences because they are the essence of the meaning of the articles. Future works could include summarize based on all senetences and compare similarity and differences.

#### Test
---- This section is for testing the above code

In [6]:
# Test the clean data and read data functions
def test1():
    #articleId = "1"
    #articleId = "5"
    #articleId = "28"
    articleId = "30"
    sentences = read_data_into_list(articleId)
    return sentences

In [7]:
# Looks good with articleID 1, 5, 28, 30
# 30 has Sources and hyperlinks, need to remove them
test1()

['The Global Rise of Renewables: Statistics, Outlook, Challenges, and Opportunities',
 'The global landscape of energy is undergoing a significant transformation, with renewable energy taking center stage',
 "Here's a glimpse into the current trends, future outlook, challenges, and opportunities:Statistics paint a promising picture: Global renewable energy capacity: It has grown exponentially in recent years, surpassing 2,900 gigawatts (GW) in 2023 [1]",
 'Share of global electricity generation: Renewables currently account for over 30%, with projections for it to reach nearly 50% by 2030 [2]',
 'Future outlook: Exponential growth: The International Renewable Energy Agency (IRENA) predicts doubling of global renewable energy capacity by 2030 [3]',
 'Emerging markets lead the charge: Developing countries are expected to rapidly adopt renewables, bypassing traditional fossil fuel infrastructure [4]',
 'Challenges to address: Grid integration: Integrating variable renewable sources like s

In [35]:
# This test is to use the output for the previous cell(clean text) for the sentence transformer model
# tested with articleID 1, 5, 28, 30
# Reference: https://www.pinecone.io/learn/series/nlp/sentence-embeddings/
# https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b
def test2():
    sentences = test1()
    # Generate embedding
    embeddings = model.encode(sentences)
    print(embeddings.shape)
    # Generate the similarity matrix
    sim = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        sim[i:,i] = util.cos_sim(embeddings[i], embeddings[i:])
    print(sim.shape)
    print(sim)
    # a printout test
    # We can sorte this matrix by obtaining the maximun similarity between 

    #flattened_sim = sim.flatten()
    #sorted_indices = np.argsort(flattened_sim)[::-1]
    #for idx in sorted_indices:
        #row = idx // len(sentences)  # Calculate the row index
        #col = idx % len(sentences)   # Calculate the column index
        #if (row != col) and (flattened_sim[idx] != 0.0):
            #similarity = flattened_sim[idx]
            #print(f"Sentence {row} and Sentence {col} have similarity: {similarity}")
    # We need a complete  similarity matrix to calculate the the average similarity with regards to all other senetnces
    sim += sim.T - np.diag(sim.diagonal())
    print(sim)
    # Calculate the mean for each senetence
    mean_matrix = np.mean(sim, axis =0)
    print("mean",mean_matrix)
    # rank it based on the highest average similarities with regards to all sentences
    # reference: https://www.geeksforgeeks.org/how-to-use-numpy-argsort-in-descending-order-in-python/
    desc_index = np.argsort(-mean_matrix)
    print(desc_index[0:5])
    # decode back to the sentence, get top 5 senetnces
    key_sentences = []
    for i in desc_index[0:5]:
        key_sentences.append(sentences[i-1])
    print("key sentences", key_sentences)



In [36]:
test2()

(14, 768)
(14, 14)
[[1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.84058553 0.9999997  0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.86522269 0.82144064 1.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.86214864 0.80927432 0.8461082  0.99999994 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.83005738 0.79084533 0.83754724 0.82878906 0.99999988 0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.83903873 0.79067636 0.793212   0.81566989 0.79433572 1.00000024
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.81585222 0.78366804 0.76487404 0.7595439  0.73025995 0.78751814
  1.00000024 0.         0.         

In [38]:
# Test 3 is to test extractKeySentences() function
# see if the result is the same as test2()
def test3():
    #articleId = "1"
    #articleId = "5"
    #articleId = "28"
    articleId = "30"
    key_sentences = extractKeySentences(articleId)
    print(key_sentences)


In [39]:
test3()

{"articleId": "30", "keySentences": ["While challenges remain, the opportunities for growth are immense, paving the way for a more sustainable and secure energy future", "Job creation: The renewable energy sector is expected to create millions of new jobs globally, contributing to economic growth and development", "Investment opportunities: The transition to renewables presents significant investment opportunities across the entire value chain, attracting private and public capital", "In conclusion, the global adoption of renewable energy is experiencing a significant surge, driven by advancements in technology, growing awareness of climate change, and supportive policies", "Opportunities for growth: Technological advancements: Continued advancements in battery storage, solar panel efficiency, and offshore wind technology will further enhance the economic viability and scalability of renewables"]}


In [60]:
# Test the compareArticles() function
def test4():
    #articleId1 = "1"
    articleId1 = "10"
    articleId2 = "28"
    #articleId2 = "30"
    return compareArticles(articleId1,articleId2)

In [61]:
# Tested article 1 and article 5
# Tested Artcile 10 and 30
# Tested Artcile 10 and 28
test4()

'{"articleId1": "10", "articleId2": "28", "MostSimilar": ["As we delve deeper and leverage smarter systems, geothermal is poised to play a larger role in our clean and sustainable energy future", "Embracing these opportunities through continued technological advancements, supportive policies, and responsible implementation holds the key to unlocking a clean, sustainable, and secure energy future for generations to come"], "MostDifferent": ["Geothermal: Delving Deeper with New Exploration and Systems", "Looking ahead, the outlook is promising: IEA\'s Net Zero by 2050 scenario: This influential roadmap predicts renewables supplying nearly 80% of global electricity by 2050, highlighting their crucial role in achieving climate goals"]}'