In [None]:
%pip install nltk transformers torch annoy seaborn matplotlib scikit-learn PyPDF2 plotly


In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from annoy import AnnoyIndex
import PyPDF2
import os
import pickle
from tqdm import tqdm


# Resources needed by word_tokenize and stopwords.words. Newer NLTK releases
# look up the Punkt tokenizer data under 'punkt_tab' while older ones use
# 'punkt'; request both so tokenization works whichever release %pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [3]:
#LOADING THE FILE TEXT IN

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, separated by single spaces. Pages for which
        ``extract_text()`` returns ``None`` are skipped.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract each page exactly once: calling extract_text() for the
        # filter and again for the value would double the parsing work.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text is not None)
    return text




In [5]:
#PREPROCESSING THE TEXT

# Module-level set of NLTK's English stop words, built when this cell runs.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize ``text`` and drop English stop words.

    Args:
        text: Raw document text.

    Returns:
        List of tokens produced by ``word_tokenize`` in document order,
        excluding every token whose lowercase form is in the module-level
        ``stop_words`` set. Original casing is kept; only stop words are
        filtered, so punctuation tokens remain.
    """
    # Reuse the module-level stop-word set instead of rebuilding it from the
    # corpus on every call.
    return [token for token in word_tokenize(text) if token.lower() not in stop_words]


In [4]:
from collections import defaultdict

# Load the pretrained tokenizer and encoder once, when this cell runs.
# output_hidden_states=True makes the model return its per-layer hidden
# states, which embed_words_batch reads through outputs.hidden_states.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

# Embed a batch of strings with a single BERT forward pass
def embed_words_batch(words):
    """Return one BERT embedding per input string.

    Args:
        words: List of strings to embed together. They are padded to a common
            length and truncated to the tokenizer's maximum length.

    Returns:
        ``numpy.ndarray`` with one row per input string, taken from position 0
        (the ``[CLS]`` slot for BERT tokenizers) of the last hidden-state
        layer. An empty list or tuple yields an array with zero rows.
    """
    import torch  # local import: the notebook does not import torch at the top

    if isinstance(words, (list, tuple)) and not words:
        # Nothing to embed; avoid sending an empty batch through the model.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(words, padding=True, return_tensors='pt', truncation=True)
    # Inference only: without no_grad every call records an autograd graph,
    # which wastes memory and time across thousands of batches.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()

# Creating embeddings in batches, one embedding per distinct word
def create_embedding_dictionary_batch(file_path, batch_size=10):
    """Embed every distinct preprocessed word of one PDF.

    Args:
        file_path: Path of the PDF to read.
        batch_size: Number of words passed to ``embed_words_batch`` per call;
            must be a positive integer.

    Returns:
        Dict mapping each distinct word to
        ``{'embedding': <vector>, 'file': file_path}``. Keys appear in order
        of first occurrence in the document.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    document_text = read_pdf(file_path)
    words = preprocess_text(document_text)

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # composition of each batch is reproducible; iterating a set() of strings
    # can give a different order on every interpreter run.
    unique_words = list(dict.fromkeys(words))

    embeddings_dict = {}
    for start in range(0, len(unique_words), batch_size):
        batch_words = unique_words[start:start + batch_size]
        batch_embeddings = embed_words_batch(batch_words)

        # Each word is embedded exactly once, so the result can be filled in
        # directly without a second pass over every occurrence.
        for word, embedding in zip(batch_words, batch_embeddings):
            embeddings_dict[word] = {
                'embedding': embedding,
                'file': file_path
            }

    return embeddings_dict


def process_multiple_pdfs(folder_path, n, batch=10):
    """Embed the words of the first ``n`` PDFs found in ``folder_path``.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends in ``.pdf``, case-insensitively.
        n: Maximum number of PDFs to process, taken in sorted file-name order.
        batch: Batch size forwarded to ``create_embedding_dictionary_batch``.

    Returns:
        Dict mapping word -> ``{'embedding': ..., 'file': ...}`` merged over
        all processed files. When a word occurs in several files, the entry
        from the file processed last replaces the earlier ones.
    """
    # os.listdir returns names in arbitrary order; sort so that "the first n
    # files" means the same files on every run and every machine.
    all_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))
    processed_files = all_files[:n]  # Process only the first n files

    all_embeddings = {}
    for file in tqdm(processed_files, desc="Processing PDFs"):
        file_path = os.path.join(folder_path, file)
        try:
            embeddings_dict = create_embedding_dictionary_batch(file_path, batch)
            all_embeddings.update(embeddings_dict)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run,
            # but report the cause so the failure can be diagnosed.
            print(f"Error reading PDF: {file}. Skipping this file. ({type(e).__name__}: {e})")
            continue

    return all_embeddings

In [5]:
def build_annoy_index(embeddings_dict, n_trees=10, metric='angular'):
    """Build an Annoy index over every embedding in ``embeddings_dict``.

    Item ``i`` of the index is the ``i``-th entry of ``embeddings_dict`` in
    iteration order; ``query_similar_words`` relies on that to map ids back
    to words.

    Args:
        embeddings_dict: Mapping word -> ``{'embedding': 1-D array, ...}``.
            All embeddings are expected to share the length of the first one.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
        metric: Distance metric passed to ``AnnoyIndex``.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        ValueError: If ``embeddings_dict`` is empty.
    """
    if not embeddings_dict:
        raise ValueError("embeddings_dict is empty; there is nothing to index")

    # Read the vector length from the first entry without copying all values.
    first_entry = next(iter(embeddings_dict.values()))
    dimension = first_entry['embedding'].shape[0]

    annoy = AnnoyIndex(dimension, metric)
    for item_id, data in enumerate(embeddings_dict.values()):
        annoy.add_item(item_id, data['embedding'])
    annoy.build(n_trees)
    return annoy

def query_similar_words(query, index, embeddings_dict, top_n=5):
    """Return the indexed words closest to ``query`` with their source file.

    Args:
        query: String to embed and look up.
        index: Index exposing ``get_nns_by_vector``; its item ids must be
            positions in the iteration order of ``embeddings_dict``.
        embeddings_dict: Mapping word -> ``{'embedding': ..., 'file': path}``.
        top_n: Number of neighbours requested from the index.

    Returns:
        List of ``(word, file_name)`` tuples in the order returned by the
        index, where ``file_name`` is the last path component of the word's
        ``'file'`` entry.
    """
    query_embedding = embed_words_batch([query])[0]  # Embed the query word
    nearest_ids = index.get_nns_by_vector(query_embedding, top_n)

    # Build the id -> word lookup once; rebuilding the key list for every
    # neighbour costs a full pass over the dictionary each time.
    vocabulary = list(embeddings_dict)

    similar_words_with_titles = []
    for item_id in nearest_ids:
        word = vocabulary[item_id]
        # basename handles the platform's path separators, unlike split('/').
        title = os.path.basename(embeddings_dict[word]['file'])
        similar_words_with_titles.append((word, title))

    return similar_words_with_titles



In [None]:
# Folder holding the PDFs. Set the PAPERS_DIR environment variable to point
# the notebook at another location; the fallback is the original local path.
folder_path = os.environ.get('PAPERS_DIR', '/Users/aayushgupta/Desktop/PapersDirectory/papers')
num_papers_to_process = 100  # Replace with the number of papers you want to process
embedding_batch_size = 40  # Words embedded per model call

embeddings_dict = process_multiple_pdfs(folder_path, num_papers_to_process, embedding_batch_size)


# # Alternative: create the embeddings dictionary from a single PDF
# file_path = 'path'
# embeddings_dict = create_embedding_dictionary_batch(file_path, 40)

In [13]:
# Persist embeddings_dict to 'embeddings.pkl' in the current working directory
# so later sessions can skip the embedding step.
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)


In [9]:
# Reload the embeddings dictionary from 'embeddings.pkl'.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# a file that you created yourself.
with open('embeddings.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)


In [10]:
# Build the Annoy index; item ids follow the iteration order of embeddings_dict,
# so rebuild the index whenever embeddings_dict is replaced.
annoy_index = build_annoy_index(embeddings_dict)

In [None]:
# Query for similar words: each result is a (word, source file name) tuple.
query_word = "few-shot"  # Replace with your query word
similar_words = query_similar_words(query_word, annoy_index, embeddings_dict, top_n=25)
print(f"Words similar to '{query_word}': {similar_words}")

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.cluster import KMeans

def optimized_plot_embeddings(embeddings_dict, query, index, top_n=5, random_state=None):
    """Plot a 2-D t-SNE map of all embeddings with the query and its neighbours.

    Points are coloured by K-Means cluster, the query is drawn as a blue
    ``X``, and its nearest neighbours (from ``index``) are circled in red and
    labelled with their word.

    Args:
        embeddings_dict: Mapping word -> ``{'embedding': vector or None, ...}``.
        query: Query string. Its stored embedding is used when present,
            otherwise it is embedded with ``embed_words_batch``.
        index: Index exposing ``get_nns_by_vector`` whose item ids are
            positions in the iteration order of ``embeddings_dict``.
        top_n: Number of neighbours to highlight.
        random_state: Optional seed forwarded to t-SNE and K-Means so the
            figure can be reproduced. ``None`` keeps their default behaviour.

    Raises:
        ValueError: If ``embeddings_dict`` contains no usable embedding.
    """
    # Remember which plotted row belongs to which index item id. Entries
    # without an embedding are skipped, so row numbers and item ids can differ.
    words, embeddings, row_of_item = [], [], {}
    for item_id, (word, data) in enumerate(embeddings_dict.items()):
        if data['embedding'] is None:
            continue
        row_of_item[item_id] = len(words)
        words.append(word)
        embeddings.append(data['embedding'])

    if not embeddings:
        raise ValueError("embeddings_dict contains no embeddings to plot")

    query_embedding = embeddings_dict.get(query, {'embedding': None})['embedding']

    # Handle case where query embedding is not pre-computed
    if query_embedding is None:
        query_embedding = embed_words_batch([query])[0]

    # The query is appended as the last row
    extended_embeddings = np.vstack(embeddings + [query_embedding])
    n_samples = len(extended_embeddings)

    # t-SNE needs perplexity < n_samples; clamp it so small inputs still plot.
    tsne_kwargs = dict(
        n_components=2,
        verbose=1,
        perplexity=min(40, n_samples - 1),
        random_state=random_state,
    )
    try:
        tsne = TSNE(max_iter=300, **tsne_kwargs)
    except TypeError:
        # Older scikit-learn releases only accept the previous name, n_iter.
        tsne = TSNE(n_iter=300, **tsne_kwargs)
    tsne_results = tsne.fit_transform(extended_embeddings)

    # K-Means cannot form more clusters than there are samples.
    kmeans = KMeans(n_clusters=min(5, n_samples), random_state=random_state)
    clusters = kmeans.fit_predict(extended_embeddings)

    # Find nearest neighbors using the pre-built index
    indices = index.get_nns_by_vector(query_embedding, top_n)

    # Plotting
    sns.set(style='whitegrid')
    plt.figure(figsize=(12, 8))

    # Plot all points with cluster coloring
    plt.scatter(tsne_results[:-1, 0], tsne_results[:-1, 1], c=clusters[:-1], alpha=0.5, cmap='viridis')

    # Highlight the query point
    plt.scatter(tsne_results[-1, 0], tsne_results[-1, 1], color='blue', marker='X')

    # Highlight nearest neighbors with red outline
    for item_id in indices:
        row = row_of_item.get(item_id)
        if row is None:
            # The neighbour has no plotted point; nothing to highlight.
            continue
        plt.scatter(tsne_results[row, 0], tsne_results[row, 1], facecolors='none', edgecolors='red', s=100)
        plt.annotate(words[row], (tsne_results[row, 0], tsne_results[row, 1]), textcoords="offset points", xytext=(0,10), ha='center')

    # Label the query
    plt.annotate("Query", (tsne_results[-1, 0], tsne_results[-1, 1]), textcoords="offset points", xytext=(0,10), ha='center')

    plt.title('Optimized t-SNE Visualization of Word Embeddings with Query and Nearest Neighbors')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.show()


# Example query word. It does not have to be a key of embeddings_dict:
# optimized_plot_embeddings embeds it on the fly when no stored vector exists.
query_word = "gpt4"  # Replace with your query word

# Requires embeddings_dict and annoy_index from the earlier cells.
optimized_plot_embeddings(embeddings_dict, query_word, annoy_index, top_n=25)



In [None]:
import plotly.graph_objects as go
import numpy as np
import seaborn as sns

def plot_embeddings_3d_interactive(embeddings_dict, query, index, top_n=5, random_state=None):
    """Show an interactive 3-D t-SNE map of all embeddings with Plotly.

    Every word is drawn as a small marker with its own colour, the query as a
    labelled blue marker, and its nearest neighbours (from ``index``) as
    labelled red markers.

    Args:
        embeddings_dict: Mapping word -> ``{'embedding': vector or None, ...}``.
        query: Query string. Its stored embedding is used when present,
            otherwise it is embedded with ``embed_words_batch``.
        index: Index exposing ``get_nns_by_vector`` whose item ids are
            positions in the iteration order of ``embeddings_dict``.
        top_n: Number of neighbours to highlight.
        random_state: Optional seed forwarded to t-SNE so the layout can be
            reproduced. ``None`` keeps the default behaviour.

    Raises:
        ValueError: If ``embeddings_dict`` contains no usable embedding.
    """
    # Remember which plotted row belongs to which index item id. Entries
    # without an embedding are skipped, so row numbers and item ids can differ.
    words, embeddings, row_of_item = [], [], {}
    for item_id, (word, data) in enumerate(embeddings_dict.items()):
        if data['embedding'] is None:
            continue
        row_of_item[item_id] = len(words)
        words.append(word)
        embeddings.append(data['embedding'])

    if not embeddings:
        raise ValueError("embeddings_dict contains no embeddings to plot")

    query_embedding = embeddings_dict.get(query, {'embedding': None})['embedding']

    if query_embedding is None:
        query_embedding = embed_words_batch([query])[0]

    # The query is appended as the last row
    extended_embeddings = np.vstack(embeddings + [query_embedding])
    n_samples = len(extended_embeddings)

    # t-SNE needs perplexity < n_samples; clamp it so small inputs still plot.
    tsne_kwargs = dict(
        n_components=3,
        verbose=1,
        perplexity=min(40, n_samples - 1),
        random_state=random_state,
    )
    try:
        tsne = TSNE(max_iter=300, **tsne_kwargs)
    except TypeError:
        # Older scikit-learn releases only accept the previous name, n_iter.
        tsne = TSNE(n_iter=300, **tsne_kwargs)
    tsne_results = tsne.fit_transform(extended_embeddings)

    # Convert Seaborn colors to RGB format acceptable by Plotly
    colors = sns.color_palette("hsv", len(tsne_results))
    colors_rgb = ['rgb' + str(tuple(int(x*255) for x in color)) for color in colors]

    fig = go.Figure()

    # Draw the whole point cloud as ONE trace with a per-point colour list.
    # One trace per point makes figure construction and rendering extremely
    # slow once there are thousands of words.
    fig.add_trace(go.Scatter3d(
        x=tsne_results[:-1, 0],
        y=tsne_results[:-1, 1],
        z=tsne_results[:-1, 2],
        mode='markers',
        marker=dict(size=5, color=colors_rgb[:-1], opacity=0.5),
        text=words,
        hoverinfo='text'
    ))

    # Highlight and label the query point
    fig.add_trace(go.Scatter3d(
        x=[tsne_results[-1, 0]],
        y=[tsne_results[-1, 1]],
        z=[tsne_results[-1, 2]],
        mode='markers+text',
        marker=dict(size=8, color='blue'),
        text=[query],
        textposition="bottom center"
    ))

    # Find, highlight, and label nearest neighbors
    indices = index.get_nns_by_vector(query_embedding, top_n)
    for item_id in indices:
        row = row_of_item.get(item_id)
        if row is None:
            # The neighbour has no plotted point; nothing to highlight.
            continue
        fig.add_trace(go.Scatter3d(
            x=[tsne_results[row, 0]],
            y=[tsne_results[row, 1]],
            z=[tsne_results[row, 2]],
            mode='markers+text',
            marker=dict(size=8, color='red'),
            text=[words[row]],
            textposition="bottom center"
        ))

    fig.update_layout(
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(
            xaxis_title='t-SNE Component 1',
            yaxis_title='t-SNE Component 2',
            zaxis_title='t-SNE Component 3'
        )
    )

    fig.show()


# Requires embeddings_dict and annoy_index from the earlier cells.
# Example query word; it is embedded on the fly when it has no stored vector.
query_word = "gpt4"  # Replace with your query word

# Plotting the embeddings in 3D with an interactive plot and unique colors
plot_embeddings_3d_interactive(embeddings_dict, query_word, annoy_index, top_n=25)
