In [1]:
import re
import pandas as pd
import os
from nltk.tokenize import sent_tokenize  # Import the sentence tokenizer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
def remove_urls(sentences_list):
    """Return the sentences that do not mention a URL.

    A sentence is discarded as a whole (it is not edited) as soon as one of
    its whitespace-separated tokens contains ``http://``, ``https://`` or
    ``www.`` followed by at least one non-space character.

    Args:
        sentences_list: Iterable of sentence strings.

    Returns:
        A new list with the URL-free sentences, in their original order.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def has_url(text):
        # Inspect the sentence token by token.
        for token in text.split():
            if url_pattern.search(token):
                return True
        return False

    return [text for text in sentences_list if not has_url(text)]

In [3]:
def remove_invalid_sentences(sentences_list):
    """Filter out sentences that the summariser should not consider.

    A sentence is rejected when any of the following holds:

    * the whole sentence is one unbroken run of ASCII letters/digits that
      contains at least one letter (i.e. a lone token such as ``"hello"``);
    * it contains a stand-alone number: one to three digits optionally
      followed by ``,ddd`` groups, or exactly four digits;
    * it contains the pound sign.

    Args:
        sentences_list: Iterable of sentence strings.

    Returns:
        A new list with the accepted sentences, in their original order.
    """
    lone_token = re.compile(r'^[0-9a-zA-Z]*[a-zA-Z][0-9a-zA-Z]*$')
    standalone_number = re.compile(r'\b\d{1,3}(,\d{3})*\b|\b\d{4}\b')

    accepted = []
    for text in sentences_list:
        if lone_token.match(text):
            continue
        if standalone_number.search(text):
            continue
        if '£' in text:
            continue
        accepted.append(text)

    return accepted
# Call the function to remove invalid sentences
# clean_sentences = remove_invalid_sentences(sentences_lower)

# Print the valid sentences
# print("Valid Sentences:")
# for i, clean_sentence in enumerate(clean_sentences):
#     print(f" {i+1}: {clean_sentence}")

In [4]:
def remove_empty_lists(input_list):
    """Return a list holding only the truthy items of ``input_list``.

    Empty strings, empty lists, ``None`` and other falsy values are dropped;
    the order of the remaining items is unchanged.
    """
    return list(filter(None, input_list))

# clean_sentences = remove_empty_lists(clean_sentences)

# for i, clean_sentence in enumerate(clean_sentences):
#     print(f" {i+1}: {clean_sentence}")

In [5]:
!pip install sentence-transformers



In [6]:
from sentence_transformers import SentenceTransformer

# Already-loaded models keyed by model name, so that repeated calls (the
# driver calls this once per document) reuse the same instance.
_MODEL_CACHE = {}


def sentence_embedding(sentences, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode sentences with a pre-trained Sentence-BERT model.

    The model for a given ``model_name`` is constructed on first use and then
    reused by later calls instead of being loaded again each time.

    Args:
        sentences: Sentences to embed; passed unchanged to ``model.encode``.
        model_name: Name of the ``SentenceTransformer`` model to load.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``sentences``.
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        # Load pre-trained Sentence-BERT model (only once per model name)
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model

    # Encode sentences to get embeddings
    return model.encode(sentences)


# embeddings_result = sentence_embedding(clean_sentences)

In [7]:
# Step 3: Create a graph using cosine similarity
def create_similarity_graph(embeddings):
    """Build a graph of sentences linked by cosine distance.

    Every sentence index ``0 .. len(embeddings) - 1`` becomes a node, and every
    pair of distinct sentences is joined by an edge whose ``weight`` is
    ``1 - cosine_similarity`` of their embeddings.

    Args:
        embeddings: One embedding vector per sentence (a 2-D array or a
            sequence of equal-length 1-D vectors).

    Returns:
        An ``nx.Graph``. With fewer than two sentences it has no edges, but it
        still contains a node for each sentence so that a single-sentence
        document is not lost by later ranking steps.
    """
    G = nx.Graph()
    num_sentences = len(embeddings)

    # Register all nodes up front; relying on add_edge alone would leave a
    # one-sentence document with an empty graph.
    G.add_nodes_from(range(num_sentences))
    if num_sentences < 2:
        return G

    # Compute the full pairwise similarity matrix in a single call instead of
    # invoking cosine_similarity once per pair.
    similarities = cosine_similarity(np.asarray(embeddings))

    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            # Add an edge to the graph with cosine distance as weight
            G.add_edge(i, j, weight=(1 - similarities[i, j]))

    return G

# graph = create_similarity_graph(embeddings_result)


In [8]:
# Step 4: Score each sentence by the summed weight of its edges (weighted degree); sorting happens later
def rank_sentences(graph):
    """Compute, for every node, the total weight of the edges incident to it.

    Args:
        graph: Graph whose edges carry a ``'weight'`` attribute.

    Returns:
        Dict mapping each node (in ``graph.nodes()`` order) to the sum of the
        ``'weight'`` values of its edges. The dict is not sorted by score.
    """
    return {
        node: sum(attrs['weight'] for *_, attrs in graph.edges(node, data=True))
        for node in graph.nodes()
    }

# ranked_sentences = rank_sentences(graph)

In [9]:
# Step 5: Keep a certain number of sentences to create the summary
def generate_summary(sentences, ranked_sentences, x):
    """Pick the first ``x`` ranked sentences and return them in document order.

    Args:
        sentences: Sequence of sentences, indexable by the keys of
            ``ranked_sentences``.
        ranked_sentences: Dict keyed by sentence index; its iteration order is
            taken as the ranking (the caller is expected to have sorted it).
        x: Number of leading entries of ``ranked_sentences`` to keep.

    Returns:
        List of the selected sentences, ordered by ascending sentence index.
    """
    chosen = []
    for position in list(ranked_sentences.keys())[:x]:
        chosen.append((position, sentences[position]))

    # Restore the original reading order of the selected sentences.
    chosen.sort(key=lambda pair: pair[0])
    return [text for _, text in chosen]

In [10]:
def save_sentences_to_file(sentences, file_path):
    """Write each sentence on its own line to ``file_path`` (UTF-8).

    The file is created or overwritten. Any exception raised while opening or
    writing is caught and reported on stdout instead of being propagated.

    Args:
        sentences: Iterable of sentence strings.
        file_path: Destination path of the text file.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(sentence + '\n' for sentence in sentences)
    except Exception as e:
        print(f"Error: {e}")
    else:
        print(f"Sentences saved to {file_path} successfully.")

In [11]:
import os

# Raw strings keep the literal backslash of the "NLP Dataset\" directory name
# without relying on "\/", which is not a valid escape sequence in a normal
# string literal.
folder_path = r"/home/student/swoyam/NLP Dataset\/s"
output_directory = r"/home/student/swoyam/NLP Dataset\/ss"

# Maximum number of sentences kept in each generated summary.
SUMMARY_SENTENCE_COUNT = 380


try:
    # List all .txt files in the input folder (sorted: os.listdir order is arbitrary)
    files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))
except FileNotFoundError:
    print(f"Folder {folder_path} not found.")
    files = []
except Exception as e:
    print(f"An error occurred: {e}")
    files = []

# Iterate through each file
for file_name in files:
    file_path = os.path.join(folder_path, file_name)

    # Errors are handled per file so that one bad document does not abort the
    # rest of the batch.
    try:
        # Read with the same encoding the summaries are written in, and close
        # the handle before the embedding step runs.
        with open(file_path, 'r', encoding='utf-8') as file:
            file_lines = file.readlines()

        sentences = [line.strip() for line in file_lines]
        # NOTE: despite its name, sentences_lower is not lower-cased; it is the
        # input with URL-bearing sentences removed.
        sentences_lower = remove_urls(sentences)
        clean_sentences = remove_invalid_sentences(sentences_lower)
        clean_sentences = remove_empty_lists(clean_sentences)
        embeddings_result = sentence_embedding(clean_sentences)
        graph = create_similarity_graph(embeddings_result)
        ranked_sentences = rank_sentences(graph)
        # Ascending by score: the smallest summed edge weight comes first.
        sorted_ranked_sentences = dict(sorted(ranked_sentences.items(), key=lambda x: x[1]))
        summary = generate_summary(clean_sentences, sorted_ranked_sentences, SUMMARY_SENTENCE_COUNT)

        output_file_name = f"fns{file_name.split('.')[0]}_{file_name}"

        # Construct the full path for the output file
        output_file_path = os.path.join(output_directory, output_file_name)

        save_sentences_to_file(summary, output_file_path)
    except Exception as e:
        print(f"An error occurred while processing {file_name}: {e}")


Sentences saved to /home/student/swoyam/NLP Dataset\/ss/fns92_92.txt successfully.
