In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd

import networkx as nx

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def read_document(file_path):
    """
    Read a document from either a PDF or txt file and extract sentences.
    
    Args:
    file_path (str): Path to the document file.
    
    Returns:
    list: A list of sentences extracted from the document.
    """
    sentences = []
    
    # Read the document
    if file_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ''
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
            sentences = sent_tokenize(text)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences = sent_tokenize(text)
    else:
        print("Unsupported file format. Please provide either a PDF or txt file.")
    
    return sentences

In [4]:

def create_embeddings(sentences):
    """
    Create embeddings for a list of sentences using a pre-trained transformer model (BERT).
    
    Args:
    sentences (list): A list of sentences.
    
    Returns:
    torch.Tensor: Embeddings for each sentence.
    """
    # Load pre-trained BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    
    # Tokenize and convert sentences to input tensors
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    
    # Get the embeddings from the BERT model
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state
    
    sentence_embeddings = torch.mean(embeddings, dim=1)
    
    return sentence_embeddings

In [5]:
import torch
from torch.utils.data import DataLoader

def create_embeddings_parallel_gpu(sentences):
    """
    Create embeddings for a list of sentences using a pre-trained transformer model (BERT) in parallel on GPU.
    
    Args:
    sentences (list): List of sentences.
    
    Returns:
    torch.Tensor: Sentence-level embeddings.
    """
    # Load pre-trained BERT model
    model = BertModel.from_pretrained('bert-base-uncased').to('cuda')
    
    # Tokenize and convert sentences to input tensors
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to('cuda')
    
    # Create DataLoader for parallel processing
    dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=16)
    
    # Generate embeddings in parallel on GPU
    embeddings = []
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
            output = model(input_ids, attention_mask=attention_mask)
            embeddings.append(output.last_hidden_state)
    
    # Concatenate embeddings from batches
    sentence_embeddings = torch.cat(embeddings, dim=0)

    sentence_embeddings = torch.mean(sentence_embeddings, dim=1)
    
    return sentence_embeddings


In [6]:

def calculate_similarity(sentence_embeddings):
    """
    Calculate semantic similarity between pairs of sentences using cosine similarity.
    
    Args:
    sentence_embeddings (torch.Tensor): Sentence embeddings.
    
    Returns:
    torch.Tensor: Similarity matrix between pairs of sentences.
    """
    # Normalize embeddings
    normalized_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    
    # Calculate cosine similarity matrix
    similarity_matrix = torch.matmul(normalized_embeddings, normalized_embeddings.T)
    
    return similarity_matrix


In [7]:
def visualize_connectivity(sentences, similarity_matrix):
    """
    Create a pandas DataFrame to visualize the connectivity between sentences based on the similarity matrix.
    
    Args:
    sentences (list): List of sentences.
    similarity_matrix (torch.Tensor): Similarity matrix between pairs of sentences.
    
    Returns:
    pd.DataFrame: DataFrame representing the connectivity between sentences.
    """
    # Convert similarity matrix to a numpy array
    similarity_array = similarity_matrix.cpu().numpy()
    
    # Create DataFrame with similarity values
    df = pd.DataFrame(similarity_array, index=sentences, columns=sentences)
    
    return df


In [8]:
# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

def lexical_overlap_similarity(sentences):
    """
    Calculate lexical overlap similarity between pairs of sentences using Jaccard similarity.
    
    Args:
    sentences (list): List of sentences.
    
    Returns:
    numpy.ndarray: Lexical overlap similarity matrix between pairs of sentences.
    """
    # Tokenize sentences
    tokenized_sentences = [set(nlp(sentence.lower())) for sentence in sentences]
    
    # Calculate Jaccard similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i, tokens1 in enumerate(tokenized_sentences):
        for j, tokens2 in enumerate(tokenized_sentences):
            if i != j:
                similarity_matrix[i, j] = len(tokens1.intersection(tokens2)) / len(tokens1.union(tokens2))
    
    return similarity_matrix


In [9]:
def dependency_parsing_similarity(sentences):
    """
    Calculate syntactic similarity between pairs of sentences using dependency parsing.
    
    Args:
    sentences (list): List of sentences.
    
    Returns:
    numpy.ndarray: Dependency parsing similarity matrix between pairs of sentences.
    """
    # Parse sentences and extract dependency relations
    parsed_sentences = [nlp(sentence) for sentence in sentences]
    dependency_relations = [[(token.dep_, token.head.text.lower()) for token in doc] for doc in parsed_sentences]
    
    # Calculate dependency parsing similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i, relations1 in enumerate(dependency_relations):
        for j, relations2 in enumerate(dependency_relations):
            if i != j:
                # Convert dependency relations to strings
                str_relations1 = ' '.join(['{}-{}'.format(rel[0], rel[1]) for rel in relations1])
                str_relations2 = ' '.join(['{}-{}'.format(rel[0], rel[1]) for rel in relations2])
                # Calculate Jaccard similarity between dependency relations
                similarity_matrix[i, j] = len(set(str_relations1.split()).intersection(set(str_relations2.split()))) / \
                                          len(set(str_relations1.split()).union(set(str_relations2.split())))
    
    return similarity_matrix



In [10]:
def topic_modeling_similarity(sentences, num_topics=5):
    """
    Calculate similarity between pairs of sentences based on their topic distributions using Latent Dirichlet Allocation (LDA).
    
    Args:
    sentences (list): List of sentences.
    num_topics (int): Number of topics for LDA.
    
    Returns:
    numpy.ndarray: Topic modeling similarity matrix between pairs of sentences.
    """
    # Vectorize sentences using bag-of-words representation
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    
    # Perform Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda.fit_transform(X)
    
    # Calculate cosine similarity between topic distributions
    similarity_matrix = cosine_similarity(topic_distributions)
    
    return similarity_matrix

In [11]:
def combine_similarity_matrices(sentences, sentence_embeddings):
    """
    Combine similarity matrices from semantic similarity, lexical overlap, dependency parsing, and topic modeling methods.
    
    Args:
    sentences (list): List of sentences.
    
    Returns:
    numpy.ndarray: Combined similarity matrix between pairs of sentences.
    """
    # Calculate similarity matrices using different methods
    semantic_similarity_matrix = calculate_similarity(sentence_embeddings)
    lexical_overlap_matrix = lexical_overlap_similarity(sentences)
    dependency_parsing_matrix = dependency_parsing_similarity(sentences)
    topic_modeling_matrix = topic_modeling_similarity(sentences)
    
    # Normalize each matrix
    # semantic_similarity_matrix /= np.max(semantic_similarity_matrix)
    # lexical_overlap_matrix /= np.max(lexical_overlap_matrix)
    # dependency_parsing_matrix /= np.max(dependency_parsing_matrix)
    # topic_modeling_matrix /= np.max(topic_modeling_matrix)
    
    # Combine matrices by averaging element-wise
    combined_matrix = (semantic_similarity_matrix + lexical_overlap_matrix + 
                       dependency_parsing_matrix + topic_modeling_matrix) / 4
    
    return combined_matrix


In [12]:
def construct_graph(similarity_matrix):
    """
    Construct a graph representation of the document based on the similarity matrix.
    
    Args:
    similarity_matrix (numpy.ndarray): Combined similarity matrix between pairs of sentences.
    
    Returns:
    networkx.Graph: Graph representation of the document.
    """
    # Create an empty graph
    graph = nx.Graph()
    
    # Add nodes to the graph
    num_sentences = len(similarity_matrix)
    graph.add_nodes_from(range(num_sentences))
    
    # Add edges to the graph based on the similarity matrix
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            graph.add_edge(i, j, weight=similarity_matrix[i, j])
    
    return graph


In [13]:
import networkx as nx
from concurrent.futures import ThreadPoolExecutor

def construct_graph_parallel(similarity_matrix):
    """
    Construct a graph representation of the document based on the similarity matrix with parallelization.
    
    Args:
    similarity_matrix (numpy.ndarray): Combined similarity matrix between pairs of sentences.
    
    Returns:
    networkx.Graph: Graph representation of the document.
    """
    # Create an empty graph
    graph = nx.Graph()
    
    # Add nodes to the graph
    num_sentences = len(similarity_matrix)
    graph.add_nodes_from(range(num_sentences))
    
    # Define a function to add edges between pairs of sentences
    def add_edges(i):
        for j in range(i + 1, num_sentences):
            graph.add_edge(i, j, weight=similarity_matrix[i, j])
    
    # Parallelize edge addition using a ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        executor.map(add_edges, range(num_sentences))
    
    return graph


In [14]:
def graph_traversal(graph, sentences, starting_node, threshold):
    """
    Traverse the graph starting from a specific node to collect connected sentences, considering edges above the threshold.
    
    Args:
    graph (networkx.Graph): Graph representation of the document.
    sentences (list): List of sentences.
    starting_node (int): Index of the starting node for traversal.
    threshold (float): Threshold for connectivity. Edges with similarity above this threshold will be considered.
    
    Returns:
    list: List of connected sentences starting from the given node.
    """
    connected_sentences = []
    visited = set()
    
    def dfs(node, connected):
        if node not in visited:
            visited.add(node)
            connected.append(sentences[node])
            for neighbor in graph.neighbors(node):
                if graph.edges[node, neighbor]['weight'] >= threshold:
                    dfs(neighbor, connected)
    
    dfs(starting_node, connected_sentences)
    
    return connected_sentences


In [15]:
from concurrent.futures import ThreadPoolExecutor

def graph_traversal_parallel(graph, sentences, starting_node, threshold):
    """
    Traverse the graph starting from a specific node to collect connected sentences, considering edges above the threshold, with parallelization.
    
    Args:
    graph (networkx.Graph): Graph representation of the document.
    sentences (list): List of sentences.
    starting_node (int): Index of the starting node for traversal.
    threshold (float): Threshold for connectivity. Edges with similarity above this threshold will be considered.
    
    Returns:
    list: List of connected sentences starting from the given node.
    """
    connected_sentences = []
    visited = set()
    
    def dfs(node, connected):
        if node not in visited:
            visited.add(node)
            connected.append(sentences[node])
            for neighbor in graph.neighbors(node):
                if graph.edges[node, neighbor]['weight'] >= threshold:
                    dfs(neighbor, connected)
    
    # Define a function to perform DFS traversal starting from the specified node
    def traverse():
        dfs(starting_node, connected_sentences)
    
    # Parallelize DFS traversal using a ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        executor.submit(traverse).result()
    
    return connected_sentences


In [16]:
doc_path = "../../data/test/ww2.pdf"

sents = read_document(doc_path)

In [17]:
sents

['World War II, 1939–1945\nPreviewing Main Ideas\nGermany, Italy, and Japan tried to build empires.',
 'They\nbegan their expansion by conquering other nations and dominating them\npolitically and economically.',
 'Geography What areas did the Axis powers control at the height of \ntheir power?',
 'Far-reaching developments in science and\ntechnology changed the course of World War II.',
 'Improvements in aircraft,tanks, and submarines and the development of radar and the atomic bombdrastically altered the way wars were fought.',
 'Geography Why might submarines have been a key weapon for the Axis\npowers in their fight against Great Britain?',
 'Fighting the Axis terror weakened the economies of Great\nBritain, the Soviet Union, and other European countries.',
 'In contrast, when theUnited States entered the war, its economy grew sharply.',
 'The strength of theAmerican economy bolstered the Allied war effort.',
 'Geography In terms of location, why was the American economy able to\nf

In [18]:
len(sents)

1086

In [19]:
%%time

sent_embeds = create_embeddings_parallel_gpu(sents)

CPU times: total: 15.9 s
Wall time: 1min 31s


In [20]:
sent_embeds.shape

torch.Size([1086, 768])

In [21]:
%%time

sim_matrix = calculate_similarity(sent_embeds)

CPU times: total: 0 ns
Wall time: 4.01 ms


In [22]:
pd.set_option('display.max_colwidth', None)

In [23]:
connec_df = visualize_connectivity(sents, sim_matrix)

In [24]:
connec_df.head()

Unnamed: 0,"World War II, 1939–1945\nPreviewing Main Ideas\nGermany, Italy, and Japan tried to build empires.",They\nbegan their expansion by conquering other nations and dominating them\npolitically and economically.,Geography What areas did the Axis powers control at the height of \ntheir power?,Far-reaching developments in science and\ntechnology changed the course of World War II.,"Improvements in aircraft,tanks, and submarines and the development of radar and the atomic bombdrastically altered the way wars were fought.",Geography Why might submarines have been a key weapon for the Axis\npowers in their fight against Great Britain?,"Fighting the Axis terror weakened the economies of Great\nBritain, the Soviet Union, and other European countries.","In contrast, when theUnited States entered the war, its economy grew sharply.",The strength of theAmerican economy bolstered the Allied war effort.,"Geography In terms of location, why was the American economy able to\nfunction at a high level while the European economies struggled?ECONOMICSSCIENCE AND TECHNOLOGYEMPIRE BUILDING\n488\n•Interactive Maps\n•Interactive Visuals\n•Interactive Primary Sources\nVIDEOPatterns of Interaction:\nModern and Medieval\nWeapons INTERNET RESOURCES\nGo to classzone.com for:\n•Research Links •Maps\n•Internet Activities •Test Practice\n•Primary Sources •Current Events\n•Chapter Quiz489Under what circumstances \nis war justified?",...,I took just as many as could be packed in our [hotel] rooms.,.,..1,..2,"The next day dragged wearily along, everybody\nwaiting, living only to hear better news.",The city wasrapidly filling with refugees.,"In one place, an old convent, they were given a roof to sleep under, and hot tea.","DOCUMENT-BASED QUESTION\nUnder what conditions did the Polish refugees fleefrom the Germans?Internment Camps\nAfter Pearl Harbor, thousands of\nJapanese Americans were sent tointernment camps mainly located in the western United States.","DOCUMENT-BASED QUESTION\nJudging from the photograph, what was the government’s attitude towardJapanese Americans?",525
"World War II, 1939–1945\nPreviewing Main Ideas\nGermany, Italy, and Japan tried to build empires.",1.0,0.805818,0.80448,0.816161,0.841495,0.82563,0.841063,0.82317,0.820499,0.767116,...,0.706264,0.535857,0.535857,0.535857,0.723805,0.745968,0.659667,0.813989,0.765771,0.575723
They\nbegan their expansion by conquering other nations and dominating them\npolitically and economically.,0.805818,1.0,0.768532,0.79593,0.830561,0.747193,0.822113,0.810873,0.819812,0.685545,...,0.720315,0.537269,0.537269,0.537269,0.72048,0.752635,0.680907,0.758648,0.716001,0.578743
Geography What areas did the Axis powers control at the height of \ntheir power?,0.80448,0.768532,1.0,0.764458,0.800609,0.927956,0.776941,0.765889,0.766312,0.765654,...,0.727083,0.534693,0.534693,0.534693,0.723234,0.772037,0.684505,0.768753,0.837936,0.622159
Far-reaching developments in science and\ntechnology changed the course of World War II.,0.816161,0.79593,0.764458,1.0,0.9278,0.779943,0.858468,0.828736,0.872368,0.712012,...,0.717956,0.580826,0.580826,0.580826,0.758054,0.767645,0.663294,0.770476,0.734367,0.584801
"Improvements in aircraft,tanks, and submarines and the development of radar and the atomic bombdrastically altered the way wars were fought.",0.841495,0.830561,0.800609,0.9278,1.0,0.832401,0.878267,0.838749,0.874408,0.729619,...,0.714302,0.55539,0.55539,0.55539,0.732883,0.761133,0.675493,0.76261,0.715161,0.560806


In [25]:
%%time

graph = construct_graph_parallel(sim_matrix)

CPU times: total: 11.1 s
Wall time: 18.3 s


In [26]:
%%time

summary = graph_traversal_parallel(graph, sents, 0, 0.8)

CPU times: total: 3min 29s
Wall time: 3min 58s


In [27]:
summary

['World War II, 1939–1945\nPreviewing Main Ideas\nGermany, Italy, and Japan tried to build empires.',
 'They\nbegan their expansion by conquering other nations and dominating them\npolitically and economically.',
 'Improvements in aircraft,tanks, and submarines and the development of radar and the atomic bombdrastically altered the way wars were fought.',
 'Geography What areas did the Axis powers control at the height of \ntheir power?',
 'Geography Why might submarines have been a key weapon for the Axis\npowers in their fight against Great Britain?',
 'Fighting the Axis terror weakened the economies of Great\nBritain, the Soviet Union, and other European countries.',
 'Far-reaching developments in science and\ntechnology changed the course of World War II.',
 'In contrast, when theUnited States entered the war, its economy grew sharply.',
 'The strength of theAmerican economy bolstered the Allied war effort.',
 'Each time the Nazi dictator grabbed new territory, he\nwould declare an

In [28]:
len(summary)

1075