In [None]:
# Install third-party packages into the running kernel's environment (%pip targets the kernel, unlike !pip).
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases — consider pinning.
# NOTE(review): tqdm and huggingface_hub are imported below but not listed here; presumably they arrive
# as transitive dependencies — confirm.
%pip install nltk transformers torch annoy seaborn matplotlib scikit-learn PyPDF2 plotly


In [2]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import PyPDF2
import os
import pickle
from tqdm import tqdm
from huggingface_hub import snapshot_download
import os
import numpy as np
import PyPDF2
from annoy import AnnoyIndex



from collections import defaultdict
from tqdm import tqdm
from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.nn import DataParallel
import torch
from paper_processing_for_embeddings import preprocess_and_read 

# Fetch the NLTK 'punkt' and 'stopwords' packages; NLTK reports "already up-to-date" when they are present.
# NOTE(review): presumably required by the word_tokenize / stopwords imports above — no visible cell
# calls them directly, so confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aayushgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayushgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained uncased BERT-base tokenizer and encoder. The encoder is
# asked to return every layer's hidden states because embed_words_batch reads
# `outputs.hidden_states`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# With more than one GPU, wrap the encoder in DataParallel first and then move
# it to the selected device; otherwise move the bare model.
if torch.cuda.device_count() > 1:
    model = DataParallel(model).to(device)
else:
    model = model.to(device)



In [4]:
def embed_words_batch(words):
    """
    Embed a batch of strings with the notebook-level BERT `tokenizer` and `model`.

    Args:
        words: The strings (words or phrases) to embed, passed to the tokenizer
            as one batch.

    Returns:
        A NumPy array with one row per input string: the vector at sequence
        position 0 of the last entry in the model's `hidden_states`.
    """
    # Pad to the longest item and truncate over-long items so the batch is rectangular.
    inputs = tokenizer(words, padding=True, return_tensors='pt', truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the appropriate device
    # Inference only: disable autograd so no computation graph is built and
    # retained for every batch (the result is converted to NumPy right away).
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()


In [5]:
def build_annoy_index(embeddings_dict, n_trees=10, metric='angular'):
    """
    Build an Annoy index containing every embedding in `embeddings_dict`.

    Item ids are assigned by the dict's iteration order (0, 1, 2, ...), so an id
    returned by a query maps back to a word only through that same ordering.
    Do not add, remove or reorder entries between building and querying.

    Args:
        embeddings_dict: Mapping of word -> record, where record['embedding'] is
            a vector exposing `.shape` (e.g. a NumPy array). The index dimension
            is taken from the first record; all vectors are assumed to share
            that length — TODO confirm against the stored data.
        n_trees: Number of trees passed to `AnnoyIndex.build` (default 10, the
            previously hard-coded value).
        metric: Metric name passed to `AnnoyIndex` (default 'angular', the
            previously hard-coded value).

    Returns:
        The built AnnoyIndex.

    Raises:
        ValueError: If `embeddings_dict` is empty, since the embedding
            dimension cannot be inferred.
    """
    if not embeddings_dict:
        raise ValueError("embeddings_dict is empty; cannot infer the embedding dimension")

    # Peek at the first record without copying every value into a list.
    dimension = next(iter(embeddings_dict.values()))['embedding'].shape[0]
    index = AnnoyIndex(dimension, metric)
    for item_id, record in enumerate(embeddings_dict.values()):
        index.add_item(item_id, record['embedding'])
    index.build(n_trees)
    return index

def query_similar_words(query, index, embeddings_dict, top_n=5):
    """
    Return the indexed words closest to `query`, each with its source file name.

    Args:
        query: The word or phrase to look up; it is embedded with
            `embed_words_batch`.
        index: An index exposing `get_nns_by_vector(vector, n)`, whose item ids
            are positions in `embeddings_dict`'s iteration order.
        embeddings_dict: Mapping of word -> record; record['file'] is a
            '/'-separated path string.
        top_n: Number of neighbours to request from the index.

    Returns:
        A list of (word, file_name) tuples in the order the index returned them,
        where file_name is the last '/'-separated component of record['file'].
    """
    query_embedding = embed_words_batch([query])[0]  # Embed the query word
    nearest_ids = index.get_nns_by_vector(query_embedding, top_n)

    # Build the id -> word lookup once; rebuilding the key list for every
    # neighbour made each lookup cost O(len(embeddings_dict)).
    words = list(embeddings_dict)

    similar_words_with_titles = []
    for item_id in nearest_ids:
        word = words[item_id]
        title = embeddings_dict[word]['file'].split('/')[-1]  # Extract the file name
        similar_words_with_titles.append((word, title))

    return similar_words_with_titles



In [6]:
# Load the precomputed embeddings mapping; the path is relative to the working directory.
# Later cells read embeddings_dict[word]['embedding'] and embeddings_dict[word]['file'].
# NOTE(review): pickle.load can execute arbitrary code — only load a file you produced or trust.
with open('word_embeddings.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)


In [7]:
# Build the Annoy index over all loaded embeddings. Item ids follow the iteration
# order of embeddings_dict, so the dict must not be modified after this point.
annoy_index = build_annoy_index(embeddings_dict)

In [8]:
import re

def normalize_text(text):
    """
    Canonicalize a technique name so spelling variants compare equal.

    The text is lowercased, hyphens and underscores become spaces, every
    character other than a-z, 0-9 or whitespace is dropped, and runs of
    whitespace collapse to single spaces with the ends trimmed.
    """
    separators_spaced = re.sub(r'[\-\_]', ' ', text.lower())
    alphanumeric_only = re.sub(r'[^a-z0-9\s]', '', separators_spaced)
    tokens = alphanumeric_only.split()  # split() also discards leading/trailing whitespace
    return ' '.join(tokens)

def user_approval(technique):
    """
    Prompt on stdin for a yes/no decision about `technique`.

    Returns True only when the reply, after trimming whitespace and
    lowercasing, is exactly 'y'; any other reply counts as a rejection.
    """
    prompt = f"Approve technique '{technique}'? (y/n): "
    answer = input(prompt)
    return answer.strip().lower() == 'y'

def find_nearest_neighbors(techniques, seen_techniques, embeddings_dict, annoy_index, depth=0, max_depth=10, approve=None):
    """
    Grow a set of techniques by repeatedly adding approved nearest neighbours.

    At each depth level the techniques added by the previous level (initially
    all of `techniques`) are queried against the index. Every neighbour whose
    normalized name has not been seen is recorded as seen and offered for
    approval; approved names join the result and form the next level's frontier.
    Techniques from earlier levels are not queried again: their neighbours are
    already in `seen_techniques`, so re-querying could not surface anything new.

    Args:
        techniques: Iterable of starting technique names (a set is not required).
        seen_techniques: Set of names already considered. It is mutated in
            place: normalized forms of the starting techniques and every
            candidate offered for approval are added.
        embeddings_dict: Embeddings mapping forwarded to `query_similar_words`.
        annoy_index: Index forwarded to `query_similar_words`.
        depth: Level to start counting from (default 0).
        max_depth: Expansion stops once `depth` reaches this value.
        approve: Optional callable taking a normalized name and returning a
            truthy value to accept it. Defaults to the interactive
            `user_approval`, so existing callers are unaffected.

    Returns:
        A new set holding the starting techniques plus all approved neighbours.
    """
    if approve is None:
        # Resolved at call time so this definition does not depend on cell order.
        approve = user_approval

    all_techniques = set(techniques)

    # Candidates are compared in normalized form, so register the normalized
    # starting names too; otherwise a seed such as "Zero-shot-CoT" would be
    # offered again as "zero shot cot".
    seen_techniques.update(normalize_text(technique) for technique in all_techniques)

    frontier = set(all_techniques)
    while depth < max_depth:
        new_techniques = set()
        for technique in frontier:
            similar_words = query_similar_words(technique, annoy_index, embeddings_dict, top_n=10)
            for word, _ in similar_words:
                normalized_word = normalize_text(word)
                # Skip names that normalize to nothing (e.g. pure punctuation)
                # and anything that has already been considered.
                if not normalized_word or normalized_word in seen_techniques:
                    continue
                seen_techniques.add(normalized_word)
                if approve(normalized_word):
                    new_techniques.add(normalized_word)

        if not new_techniques:
            print("No new approved techniques found")
            return all_techniques

        all_techniques |= new_techniques
        print(f"Depth {depth}: Found {len(new_techniques)} new approved techniques")
        frontier = new_techniques
        depth += 1

    print(f"Maximum depth reached: {max_depth}")
    return all_techniques

# Example usage: expand a seed list of prompting techniques via nearest-neighbour search.
initial_techniques = {
    "Chain of Thought [CoT]",
    "Zero-shot-CoT",
    "Few-Shot-Chain-of-Thought",
    "Plan-and-Solve Prompting",
    "OPRO",
    "Tree-of-Thought",
    "Skeleton-of-Thought",
    "Active-Prompt",
    "Contrastive Chain of Thought",
    "Complexity-based prompting",
    "Faithful Chain-of-Thought",
    "Memory-of-Thought",
    "Recursion of Thought",
    "Auto-Cot",
    "Automate-CoT",
    "Program-of-Thoughts",
    "Tab-CoT",
    "Think Aloud",
    "Golden CoT",
    "ICAP",
    "Graph-of-Thoughts",
    "Self-Evaluation",
    "Self-refine",
    "Verify-and-edit",
    "CRITIC",
    "AuRoRA",
    "Self-Ask",
    "Iterative Prompts",
    "Prompt Mining",
    "Prompt Paraphrasing",
    "Self-improvement framework",
    "Least-to-most prompting",
    "Maieutic Prompting",
    "Directional-stimulus prompting",
    "Automatic Prompt Generation",
    "Self-Instruct",
    "Cumulative Reasoning",
    "In-Context Learning",
    "Few-shot learning (FSL)",
    "Few-shot prompting (FSP)",
    "Input-Label Pairing format",
    "Label Space",
    "Input Distribution",
    "Input-Label mapping",
    "Demonstration Ensembling (DENSE)",
    "Self-Consistency",
    "DiVeRSe",
    "Zero-Shot Prompt",
    "Role Prompting",
    "Style Prompting",
    "Emotion Prompting",
    "Re-reading",
    "Negative Prompting"
}

# Candidates are compared against seen_techniques in normalized form, so seed it
# with the normalized names as well as the raw ones. With only the raw names,
# every initial technique would be offered for approval again (e.g. "Zero-shot-CoT"
# reappearing as "zero shot cot").
seen_techniques = set(initial_techniques) | {normalize_text(name) for name in initial_techniques}
max_depth = 2  # You can adjust this to control the recursion depth

# Interactive: each newly discovered candidate prompts for y/n approval.
all_techniques = find_nearest_neighbors(initial_techniques, seen_techniques, embeddings_dict, annoy_index, max_depth=max_depth)

print("All approved techniques found:")
# Sorted so the listing is stable between runs (set iteration order is arbitrary).
for technique in sorted(all_techniques):
    print(technique)


In [None]:
# NOTE(review): looks like a leftover scratch cell — a bare `print` only references the
# built-in function without calling it. Likely safe to delete; confirm with the author.
print 