In [5]:
!pip install gensim



In [6]:
!pip install sentence-transformers



In [7]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [8]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25ldone
[?25h  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=c5a654586020ff0f1ff6f3a95e800416297cf32aca24f3b1a44f094328cd37f7
  Stored in directory: /home/yash/.cache/pip/wheels/6e/62/11/dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [10]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


In [1]:
import os
import numpy as np
from nltk import word_tokenize
import copy
from collections import defaultdict
from tqdm import tqdm
import math
import json
from gensim.utils import simple_preprocess
import joblib
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
from transformers import ViTModel, ViTFeatureExtractor, ViTImageProcessor
from PIL import Image
import torch
from fpdf import FPDF
from datetime import datetime
import fitz

  from tqdm.autonotebook import tqdm, trange


In [2]:
import random

def get_documents():
    ids = []
    documents = []
    with open('../Dataset.mini_wiki_collection.json', "r") as f:
        data = json.load(f)
        for item in data:
            ids.append(item["_id"])
            temp = ""
            for text in item["text"]:
                if(text[-1] == '\n'):
                    temp = temp + text[:-1] + " "
                else:
                    temp = temp + text + " "
            documents.append(temp)
    return ids, documents

In [3]:
def get_documents_from_scores(scores):
    rankings = []
    for score in scores:
        rankings.append(score[0])
    return rankings

In [None]:
def train_tf_idf(documents_tokenized):
    # documents_tokenized = get_documents_tokenized(path)
    vocab = {}
    tf = defaultdict(lambda: {})
    freq = defaultdict(lambda: 0)
    for i in tqdm(range(len(documents_tokenized))):
        tf[i] = defaultdict(lambda: 0)
        tokens = documents_tokenized[i]
        for token in tokens:
            freq[token] += 1
            tf[i][token] += 1
            if token not in vocab:
                vocab[token] = 1

    for i in tqdm(range(len(documents_tokenized))):
        for token in tf[i].keys():
            tf[i][token] = tf[i][token] / len(documents_tokenized[i])
    
    idf = defaultdict(lambda: 0)
    ndoc = defaultdict(lambda: 0)
    for i in tqdm(range(len(documents_tokenized))):
        temp = defaultdict(lambda: 0)
        tokens = documents_tokenized[i]
        for token in tokens:
            if(temp[token] == 0):
                idf[token] += 1
                temp[token] += 1

    for token in tqdm(idf.keys()):
        ndoc[token] = idf[token]
        idf[token] = math.log(len(documents_tokenized) / idf[token])

    tf_idf = defaultdict(lambda: 0)
    for i in tqdm(range(len(documents_tokenized))):
        tf_idf[i] = defaultdict(lambda: 0)
        for token in documents_tokenized[i]:
            tf_idf[i][token] = tf[i][token] * idf[token]
    
    return tf_idf, idf, ndoc, tf, vocab

def get_tf_query(query):
    k = len(query)
    tf_query = defaultdict(lambda: 0)
    for i in range(k):
        tf_query[query[i]] += 1
    for token in tf_query.keys():
        tf_query[token] /= k
    return tf_query

def get_tf_idf_query(query, idf_dict):
    query = simple_preprocess(query)
    tf_idf_query = defaultdict(lambda: 0)
    tf_query = get_tf_query(query)
    for token in tf_query.keys():
        tf_idf_query[token] = tf_query[token] * idf_dict[token]
    return tf_idf_query
    
def get_tf_idf_vector(tf_idf_instance, vocab):
    temp = []
    for key in vocab.keys():
        temp.append(tf_idf_instance[key])
    return temp
    
def cosine_similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    if(np.linalg.norm(v1) != 0 and np.linalg.norm(v2) != 0):
        sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    else:
        sim = 0
    return sim

def print_non_zero(dict):
    for key in dict.keys():
        if(dict[key] > 0):
            print(dict[key])

def get_document_vectors(tf_idf_dict, vocab):
    document_vectors = []
    for i in tqdm(range(len(list(tf_idf_dict.keys())))):
        document_vector = get_tf_idf_vector(tf_idf_dict[i], vocab)
        document_vectors.append(document_vector)
    return document_vectors


def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
    query_vector = np.reshape(np.array(get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)), (1, -1))
    scores = []
    dot_products = document_matrix @ query_vector.T

    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)
    cosine_similarities = dot_products / (doc_norms * query_norm)
    cosine_similarities = cosine_similarities.flatten()
    rankings = np.argsort(cosine_similarities)[::-1]
    rankings = rankings[:k]
    scores = []
    for rank in rankings:
        scores.append(cosine_similarities[rank])
    # scores = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)
    # scores = scores[:k]
    # rankings = get_documents_from_scores(scores)
    return rankings, scores

def tf_idf_pipeline(query, idf_dict_path, tf_idf_dict_path, vocab_path, document_matrix_path, ids_path, k):
    idf_dict = joblib.load(idf_dict_path)
    print("idf loaded...")
    tf_idf_dict = joblib.load(tf_idf_dict_path)
    print("tf-idf loaded...")
    vocab = joblib.load(vocab_path)
    print("vocab loaded...")
    document_matrix = joblib.load(document_matrix_path)
    print("document_matrix loaded...")
    ids = joblib.load(ids_path)
    print("ids loaded")
    rankings, scores = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k)
    rankings2 = []
    for ranking in tqdm(rankings):
        rankings2.append(ids[ranking])
    return rankings2

In [None]:
tf_idf_pipeline("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?", 'idf.pkl', 'tf_idf_dict.pkl', 'vocab.pkl', 'document_matrix.pkl', 'ids.pkl', 5)

TypeError: tf_idf_pipeline() missing 6 required positional arguments: 'idf_dict_path', 'tf_idf_dict_path', 'vocab_path', 'document_matrix_path', 'ids_path', and 'k'

In [6]:
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')



In [7]:
def get_open_source_embeddings(documents):
    documents_embeddings = []
    for document in tqdm(documents):
        documents_embeddings.append(model.encode(document))
    return documents_embeddings
    
def open_source_rankings(query, document_embeddings, k):
    query_embedding = model.encode(query)
    scores = []
    for idx, embedding in enumerate(document_embeddings):
        scores.append((idx, cosine_similarity(query_embedding, embedding)))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[:k]
    rankings = get_documents_from_scores(scores)
    return rankings, scores
def open_source_pipeline(query, documents_embeddings_path, ids_path, k):
    document_embeddings = joblib.load(documents_embeddings_path)
    ids = joblib.load(ids_path)
    rankings, scores = open_source_rankings(query, document_embeddings, k)
    rankings2 = []
    for ranking in tqdm(rankings):
        rankings2.append(ids[ranking])
    return rankings2

In [8]:
open_source_pipeline("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?", 'open_source_embeddings.pkl', 'ids.pkl', 5)

100%|██████████| 5/5 [00:00<00:00, 93622.86it/s]


['54376', '2015257', '31739', '24113', '31187327']

In [9]:
def bm25_pipeline(query, bm25_path, ids_path, k):
    bm25 = joblib.load(bm25_path)
    ids = joblib.load(ids_path)
    ranking = bm25.get_scores(simple_preprocess(query))
    ranking = np.argsort(np.array(ranking))[::-1]
    ranking = ranking[:k]
    for j in range(len(ranking)):
        ranking[j] = ids[ranking[j]]
    return ranking

In [10]:
bm25_pipeline("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?", 'bm25-1_0.pkl', 'ids.pkl', 5)

array([ 260962,  232530, 3414021,   54376, 2015257])

In [11]:
def pdf_to_image(pdf_path, zoom=2.0):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Create a list to store image paths
    image_paths = []
    
    # Create an 'Images' directory if it doesn't exist
    os.makedirs("Images", exist_ok=True)
    
    # Iterate over PDF pages and convert each to an image
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)  # Load the page
        
        # Set zoom level to improve quality
        mat = fitz.Matrix(zoom, zoom)  # Create a transformation matrix with the zoom level
        pix = page.get_pixmap(matrix=mat)  # Render the page to an image with the specified zoom
        
        image_file = f'Images/{os.path.basename(pdf_path)}_page_{page_num}.png'
        pix.save(image_file)  # Save the image as PNG
        image_paths.append(image_file)
    
    # Return the list containing paths of all images
    return image_paths

In [12]:
def create_pdf(input_text):
    # Create instance of FPDF class
    pdf = FPDF()
    
    # Add a page
    pdf.add_page()
    
    # Set font
    pdf.set_font("Arial", size=10)
    
    # Split the input text into multiple lines if necessary
    # This ensures that the text fits the page and multiple pages are handled
    pdf.multi_cell(0, 5, txt=input_text)
    
    # Create a unique file name with the current time
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f"PDFs/Aditya_{timestamp}.pdf"
    
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    
    # Save the PDF
    pdf.output(file_name)
    
    # Return the file path
    return file_name

In [13]:
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

In [14]:
import re

def sanitize_text(text):
    """
    Cleans and standardizes text by keeping only alphanumeric characters and spaces.
    Args:
        text (str): Text to sanitize.
    Returns:
        str: Sanitized text.
    """
    if isinstance(text, str):
        # Use regex to keep only alphanumeric characters and spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Optionally, collapse multiple spaces into a single space
        text = re.sub(r'\s+', ' ', text).strip()
    return text

def text_to_images(text):
    text = sanitize_text(text)
    pdf_path = create_pdf(text)
    image_paths = pdf_to_image(pdf_path)
    return image_paths

In [15]:
def documents_to_images(path):
    document_set = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if os.path.isfile(file_path):
            with open(file_path, "r") as f:
                content = f.read()
                document_set.append(content)
    document_image_paths = []
    for document in document_set:
        image_paths = text_to_images(document)
        document_image_paths.append(image_paths)
    return document_image_paths

def single_unit_embedding(text):
    image_paths = text_to_images(text)
    temp = []
    for image_path in image_paths:
        image = Image.open(image_path)
        inputs = processor(images=image, return_tensors="pt")
        outputs = model(**inputs)
        vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        temp.append(vector)
    return np.mean(np.array(temp), axis=0)

def single_image_embedding(image):
    inputs = processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return vector

def documents_to_vision_embeddings(documents):
    document_vision_embeddings = []
    for document in tqdm(documents):
        vector = single_unit_embedding(document)
        document_vision_embeddings.append(vector)
    return document_vision_embeddings

def queries_to_vision_embeddings(queries):
    query_vision_embeddings = []
    for query in tqdm(queries):
        vector = single_unit_embedding(query)
        query_vision_embeddings.append(vector)
    return query_vision_embeddings

def vision_rankings(query_embedding, document_embeddings, k):
    # query_embedding = single_unit_embedding(query)
    scores = []
    for idx, embedding in enumerate(document_embeddings):
        scores.append((idx, cosine_similarity(query_embedding[0], embedding[0])))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[:k]
    rankings = get_documents_from_scores(scores)
    return rankings, scores

def vision_pipeline(query, document_embeddings_path, ids_path, k):
    # document_embeddings = joblib.load(document_embeddings_path)
    ids = joblib.load(ids_path)
    documents_vision_embeddings2 = []
    with open("document-vision-embeddings.json", "r") as f:
        document_vision_embeddings2 = json.load(f)
    document_vision_embeddings = []
    for embedding in tqdm(document_vision_embeddings2):
        document_vision_embeddings.append(np.array(embedding))
    print("loaded embeddings")
    query_embedding = single_unit_embedding(query)
    rankings, scores = vision_rankings(query_embedding, document_vision_embeddings, k)
    rankings2 = []
    for ranking in rankings:
        rankings2.append(ids[ranking])
    return rankings2

In [16]:
vision_pipeline("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?", 'document-vision-embeddings.json', 'ids.pkl', 5)

100%|██████████| 1000/1000 [00:00<00:00, 35600.46it/s]


loaded embeddings


['48272667', '53470812', '50170741', '19865700', '39790870']