In [1]:
import re
import pandas as pd
import os
from nltk.tokenize import sent_tokenize  # Import the sentence tokenizer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# **Preprocessing of sentences**

In [2]:
def remove_urls(sentences_list):
    """Return the sentences that contain no URL.

    A sentence is dropped entirely (not edited) when any of its
    whitespace-separated tokens contains an ``http://``/``https://`` link or
    a ``www.`` address. The order of the surviving sentences is preserved.

    Args:
        sentences_list: iterable of strings.

    Returns:
        A new list with the URL-free sentences.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def has_url(text):
        # Check token by token, mirroring a whitespace split of the sentence
        return any(url_pattern.search(token) for token in text.split())

    return [text for text in sentences_list if not has_url(text)]

In [3]:
# Path of the annual report to process
file_path = "/content/drive/MyDrive/fns2020_dataset/training/annual_reports/92.txt"

# NOTE: despite the name, nothing here lower-cases the text; the name is kept
# because later cells read `sentences_lower`.
sentences_lower = []

try:
    # Decode explicitly as UTF-8: the report contains non-ASCII characters
    # (typographic ligatures and quotes), and without `encoding=` Python falls
    # back to a platform-dependent default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read the lines of the file into a list
        file_lines = file.readlines()

    # Remove leading and trailing whitespace from every line
    sentences = [line.strip() for line in file_lines]

    # Drop every line that contains a URL
    sentences_lower = remove_urls(sentences)

    for i, sentence_lower in enumerate(sentences_lower):
        print(f" {i+1} : {sentence_lower}")

except FileNotFoundError:
    print(f"File {file_path} not found.")
except Exception as e:
    # Best-effort cell: report the problem and leave `sentences_lower` as-is
    print(f"An error occurred: {e}")


 1 : UBC MEDIA GROUP PLC
 2 : ANNUAL REPORT & ACCOUNTS 2005
 3 : UBC MEDIA GROUP PLC  ANNUAL REPORT AND ACCOUNTS 2005
 4 : UBC Media Group PLC
 5 : 50 Lisson Street
 6 : London NW1 5DF
 7 : T: 020 7453 1600
 8 : F: 020 7723 6132
 9 : 01 Financial highlights
 10 : 02 Chairman’s statement
 11 : 03 Chief Executive’s statement
 12 : 04 Commissioned programming
 13 : 06 Syndicated programming
 14 : 08 Radio stations
 15 : 10 Radio services
 16 : 12 Financial review
 17 : 16 Board of directors
 18 : 16 Directors and advisors
 19 : 17 Report of the directors
 20 : 19 Corporate governance statement
 21 : 21 Statement of directors’ responsibilities
 22 : 22 Remuneration report
 23 : 23 Independent auditors’ report
 24 : 24 Consolidated proﬁt and loss account
 25 : 25 Consolidated balance sheet
 26 : 26 Company balance sheet
 27 : 27 Consolidated cash ﬂow statement
 28 : 28 Notes to the ﬁnancial statements
 29 : 46 Notice of AGM
 30 : *Before goodwill and digital licences
 31 : 01
 32 : -519
 33

In [4]:
def remove_invalid_sentences(sentences_list):
    """Filter out lines that are rejected by three textual heuristics.

    A line is discarded when at least one of the following holds:
      * the whole line is one unbroken run of ASCII letters/digits that
        contains at least one letter (e.g. a lone word or code);
      * it contains a standalone number of one to three digits (optionally
        followed by ``,ddd`` groups) or of exactly four digits;
      * it contains the pound sign.

    Everything else, including the empty string, is kept in its original
    order.

    Args:
        sentences_list: iterable of strings.

    Returns:
        A new list holding the lines that passed all three checks.
    """
    lone_token = re.compile(r'^[0-9a-zA-Z]*[a-zA-Z][0-9a-zA-Z]*$')
    standalone_number = re.compile(r'\b\d{1,3}(,\d{3})*\b|\b\d{4}\b')

    def is_valid(sentence):
        # Single alphanumeric token with a letter in it
        if lone_token.match(sentence):
            return False
        # Any free-standing short number or 4-digit number (years, amounts)
        if standalone_number.search(sentence):
            return False
        # Currency lines
        return '£' not in sentence

    return list(filter(is_valid, sentences_list))
# Filter the URL-free lines through the validity heuristics
clean_sentences = remove_invalid_sentences(sentences_lower)

# Show the surviving lines, numbered from 1
print("Valid Sentences:")
for position, text in enumerate(clean_sentences, start=1):
    print(f" {position}: {text}")


Valid Sentences:
 1: UBC MEDIA GROUP PLC
 2: UBC Media Group PLC
 3: London NW1 5DF
 4: *Before goodwill and digital licences
 5: GROUP TURNOVER
 6: OPERATING PROFIT/(LOSS)*
 7: RADIO SECTOR. UBC’S RADIO STATIONS INCLUDE THE NATIONAL DIGITAL
 8: STATIONS, CLASSIC GOLD DIGITAL AND ONEWORD RADIO. UBC IS ALSO A
 9: LEADING SUPPLIER OF RADIO SERVICES, INCLUDING SOFTWARE TO OPERATE
 10: DIGITAL RADIO DATA SERVICES. IN ADDITION, UBC RANKS AS THE LEADING
 11: INDEPENDENT SUPPLIER OF RADIO PROGRAMMES TO THE BBC AND THE
 12: COMMERCIAL RADIO INDUSTRY.
 13: GOLD DIGITAL
 14: GOLD DIGITAL
 15: UBC MEDIA GROUP PLC
 16: CHAIRMAN’S STATEMENT
 17: John Hodson Chairman
 18: This is the ﬁrst opportunity I have had to write to the shareholders of
 19: UBC Media Group since my appointment as Chairman of the Company in
 20: As the new Chairman I think it is helpful to reiterate the strategy that has
 21: to maximise the returns from UBC’s traditional businesses as a means to ﬁnance
 22: the Group’s digita

In [5]:
def remove_empty_lists(input_list):
    """Return a new list containing only the truthy items of ``input_list``.

    Empty strings, empty lists, ``None``, ``0`` and any other falsy values
    are dropped; the order of the remaining items is preserved.
    """
    # filter(None, ...) keeps exactly the items whose truth value is True
    return list(filter(None, input_list))

# Drop falsy entries (such as empty strings) from the filtered lines
clean_sentences = remove_empty_lists(clean_sentences)

# Show the remaining lines, numbered from 1
for position, text in enumerate(clean_sentences, start=1):
    print(f" {position}: {text}")

 1: UBC MEDIA GROUP PLC
 2: UBC Media Group PLC
 3: London NW1 5DF
 4: *Before goodwill and digital licences
 5: GROUP TURNOVER
 6: OPERATING PROFIT/(LOSS)*
 7: RADIO SECTOR. UBC’S RADIO STATIONS INCLUDE THE NATIONAL DIGITAL
 8: STATIONS, CLASSIC GOLD DIGITAL AND ONEWORD RADIO. UBC IS ALSO A
 9: LEADING SUPPLIER OF RADIO SERVICES, INCLUDING SOFTWARE TO OPERATE
 10: DIGITAL RADIO DATA SERVICES. IN ADDITION, UBC RANKS AS THE LEADING
 11: INDEPENDENT SUPPLIER OF RADIO PROGRAMMES TO THE BBC AND THE
 12: COMMERCIAL RADIO INDUSTRY.
 13: GOLD DIGITAL
 14: GOLD DIGITAL
 15: UBC MEDIA GROUP PLC
 16: CHAIRMAN’S STATEMENT
 17: John Hodson Chairman
 18: This is the ﬁrst opportunity I have had to write to the shareholders of
 19: UBC Media Group since my appointment as Chairman of the Company in
 20: As the new Chairman I think it is helpful to reiterate the strategy that has
 21: to maximise the returns from UBC’s traditional businesses as a means to ﬁnance
 22: the Group’s digital expansion, incl

# **Preprocessing Completed**

In [6]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m122.9/132.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


# **Sentence Embedding**

In [7]:
from sentence_transformers import SentenceTransformer

def sentence_embedding(sentences, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode ``sentences`` with a pre-trained SentenceTransformer model.

    The model named by ``model_name`` is loaded on every call and its
    ``encode`` method is applied to ``sentences``.

    Args:
        sentences: the sentences to embed, passed straight to ``encode``.
        model_name: identifier of the SentenceTransformer model to load.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``sentences``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(sentences)


# Embed the cleaned sentences using the function's default model
embeddings_result = sentence_embedding(clean_sentences)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Creating Similarity Graph**

In [8]:

# Step 3: Create a graph using cosine similarity
def create_similarity_graph(embeddings):
    """Build a complete weighted graph over sentence embeddings.

    Every sentence becomes a node (numbered by its position in
    ``embeddings``) and every pair of sentences is joined by an edge whose
    ``weight`` is the cosine distance ``1 - cosine_similarity``.

    Args:
        embeddings: sequence of embedding vectors, one per sentence
            (assumed to be convertible to a 2-D numeric array).

    Returns:
        A ``networkx.Graph`` with ``len(embeddings)`` nodes. With fewer than
        two sentences the graph has no edges.
    """
    G = nx.Graph()
    num_sentences = len(embeddings)

    # Register nodes explicitly so that a lone sentence still appears in the
    # graph (adding edges alone would leave it empty).
    G.add_nodes_from(range(num_sentences))
    if num_sentences < 2:
        return G

    # One vectorised call for the full similarity matrix instead of one
    # call per sentence pair.
    similarity_matrix = cosine_similarity(np.asarray(embeddings))

    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            # Store the distance, so a smaller weight means more similar
            G.add_edge(i, j, weight=(1 - similarity_matrix[i, j]))

    return G

# Build the sentence graph from the embeddings computed earlier
graph = create_similarity_graph(embeddings_result)


# **Ranking Sentences**

In [9]:
# Step 4: Rank sentences by degree centrality
def rank_sentences(graph):
    """Map every node to the total ``weight`` of the edges incident to it.

    Args:
        graph: object exposing ``nodes()`` and ``edges(node, data=True)``;
            each edge's data dict must carry a ``'weight'`` entry.

    Returns:
        A dict ``{node: sum of incident edge weights}`` in node iteration
        order.
    """
    return {
        node: sum(attrs['weight'] for _, _, attrs in graph.edges(node, data=True))
        for node in graph.nodes()
    }

# Total incident edge weight for every sentence node
ranked_sentences = rank_sentences(graph)

# Same mapping, re-keyed in ascending order of total weight (ties keep
# their original relative order)
sorted_ranked_sentences = {
    node: ranked_sentences[node]
    for node in sorted(ranked_sentences, key=ranked_sentences.get)
}

# **Summary Generation**

In [10]:
def generate_summary(sentences, ranked_sentences, x):
    """Pick the first ``x`` ranked sentences and return them in document order.

    Args:
        sentences: list of sentences, indexable by the keys of
            ``ranked_sentences``.
        ranked_sentences: dict whose key order encodes the ranking; only
            its first ``x`` keys are used.
        x: number of sentences to select.

    Returns:
        The selected sentences as a list, ordered by ascending index.
    """
    chosen = list(ranked_sentences)[:x]
    # Restore the original reading order of the chosen sentences
    return [sentences[position] for position in sorted(chosen)]

In [12]:
# Number of top-ranked sentences kept in the summary
SUMMARY_SENTENCE_COUNT = 65

# Select the top-ranked sentences and print them as one space-joined string
summary = generate_summary(clean_sentences, sorted_ranked_sentences, SUMMARY_SENTENCE_COUNT)
sentence_string = ' '.join(summary)
print(sentence_string)

the Group’s digital expansion, including the signiﬁcant programme of investment the Financial Review. businesses within the Group. The Company is led by an established management The main ﬁnancial highlights of UBC Media Group for the year to –  Operating Proﬁt for the year before goodwill and digital licences UBC’s Production Division encompasses both the Group’s commissioned independent production company, Smooth Operations, which UBC from the Group’s facilities and studios business as we continue to focus expenses) for the Company. The net proceeds of the placing were As a result of a review of its accounting policies it is UBC Media Group’s the Unique Group of companies, a privately owned media and communications Remuneration committees. Financial Public Relations DIRECTORS AND ADVISORS REPORT OF THE DIRECTORS The principal business of the Group is the ownership and operation of digital and analogue commercial radio stations, radio programming and the provision of Directors and the