In [None]:
import os
import numpy as np
from nltk import word_tokenize
import copy
from collections import defaultdict
from tqdm import tqdm
import math

In [None]:
def preprocess(unit):
    """Clean a raw text string and return its lowercase tokens.

    ASCII punctuation and digits are deleted outright (so "don't" becomes
    "dont" and "well-known" becomes "wellknown"); newlines and curly quotes
    are replaced by a single space. The cleaned text is split with
    ``word_tokenize`` and each resulting token is lowercased.
    """
    deleted = '.,!?[]{}()*&#:;"\'-+_0123456789'
    blanked = '\n”“’'
    # Single translation table: every `blanked` char -> ' ', every `deleted` char -> removed.
    table = str.maketrans(blanked, ' ' * len(blanked), deleted)
    return [token.lower() for token in word_tokenize(unit.translate(table))]

In [None]:
def get_documents_tokenized(path):
    """Read every regular file directly under ``path`` and tokenize its text.

    Args:
        path: Directory holding one document per file. Sub-directories are
            skipped, not descended into.

    Returns:
        A list with one token list (as produced by ``preprocess``) per file,
        in the order ``os.listdir`` reports the files. That order is
        filesystem-dependent, so a list position is only a stable document id
        for an unchanged directory.
    """
    documents_tokenized = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        # Decode explicitly as UTF-8: the platform default encoding would turn
        # the curly quotes that preprocess() strips into mojibake on systems
        # whose locale is not UTF-8.
        with open(file_path, "r", encoding="utf-8") as f:
            # Tokenize right away so the raw texts are not all held in memory.
            documents_tokenized.append(preprocess(f.read()))
    return documents_tokenized

In [None]:
def train_tf_idf(path):
    """Build TF, IDF and TF-IDF tables for the documents found under ``path``.

    A document is identified by its position in the list returned by
    ``get_documents_tokenized(path)``.

    Returns:
        A 5-tuple ``(tf_idf, idf, ndoc, tf, documents_tokenized)``:

        * ``tf_idf``: document index -> {token: tf * idf}
        * ``idf``: token -> ``log(N / ndoc[token])``, N = number of documents
        * ``ndoc``: token -> number of documents containing the token
        * ``tf``: document index -> {token: occurrences / document length}
          (length-normalised, not a raw count)
        * ``documents_tokenized``: the list of token lists

        ``idf``, ``ndoc`` and the per-document dicts are defaultdicts that
        read as 0 for unknown tokens. Note that indexing them with an unknown
        token also inserts it.
    """
    documents_tokenized = get_documents_tokenized(path)
    n_docs = len(documents_tokenized)

    # One pass per document yields both its term frequencies and the
    # document-frequency contribution of each distinct token.
    tf = defaultdict(dict)
    ndoc = defaultdict(int)
    for i, tokens in enumerate(tqdm(documents_tokenized)):
        counts = defaultdict(int)
        for token in tokens:
            counts[token] += 1
        for token, count in counts.items():
            # Only values are reassigned, so iterating .items() stays valid.
            counts[token] = count / len(tokens)
            ndoc[token] += 1
        tf[i] = counts

    idf = defaultdict(int)
    for token, containing in tqdm(ndoc.items()):
        idf[token] = math.log(n_docs / containing)

    tf_idf = defaultdict(int)
    for i in tqdm(range(n_docs)):
        weights = defaultdict(int)
        for token, frequency in tf[i].items():
            weights[token] = frequency * idf[token]
        tf_idf[i] = weights

    return tf_idf, idf, ndoc, tf, documents_tokenized

In [None]:
def get_tf_query(query):
    """Return the relative frequency of each token in a tokenized query.

    ``query`` is a sequence of tokens. The result is a defaultdict mapping
    each distinct token to ``occurrences / len(query)``; tokens that are not
    in the query read as 0.
    """
    total = len(query)
    frequencies = defaultdict(lambda: 0)
    for term in query:
        frequencies[term] += 1
    # Turn the raw counts into proportions of the query length.
    for term in frequencies:
        frequencies[term] = frequencies[term] / total
    return frequencies

def get_tf_idf_query(query, idf_dict):
    """Turn a raw query string into a sparse TF-IDF vector.

    Args:
        query: Raw query text; it is tokenized with ``preprocess``.
        idf_dict: Mapping token -> inverse document frequency. It is only
            read, never modified.

    Returns:
        A defaultdict mapping each query token to ``tf * idf``. Tokens without
        an entry in ``idf_dict`` get weight 0, and tokens that are not in the
        query read as 0.
    """
    tokens = preprocess(query)
    tf_query = get_tf_query(tokens)
    tf_idf_query = defaultdict(lambda: 0)
    for token, frequency in tf_query.items():
        # .get() rather than indexing: indexing a defaultdict would insert
        # every unseen query token into the trained IDF table, and indexing a
        # plain dict would raise KeyError.
        tf_idf_query[token] = frequency * idf_dict.get(token, 0)
    return tf_idf_query

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1, v2: Either two 1-D array-likes of equal length, or two dicts
            mapping feature -> weight (sparse vectors; a feature missing from
            one dict counts as 0 there).

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero length, so callers never receive NaN as a sort key.
    """
    if isinstance(v1, dict) and isinstance(v2, dict):
        # Align both sparse vectors on a shared, deterministic key order.
        # `in` and .get() do not insert into defaultdicts.
        keys = list(v1) + [key for key in v2 if key not in v1]
        v1 = [v1.get(key, 0) for key in keys]
        v2 = [v2.get(key, 0) for key in keys]
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

def _sparse_cosine(a, b):
    """Cosine similarity of two {token: weight} mappings; 0.0 if either is all-zero."""
    norm_a = math.sqrt(sum(weight * weight for weight in a.values()))
    norm_b = math.sqrt(sum(weight * weight for weight in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Iterate over the smaller mapping; .get() leaves defaultdicts untouched.
    if len(b) < len(a):
        a, b = b, a
    dot = sum(weight * b.get(token, 0) for token, weight in a.items())
    return dot / (norm_a * norm_b)


def tf_idf_rankings(query, idf_dict, tf_idf_dict):
    """Rank documents by TF-IDF cosine similarity to ``query``.

    Args:
        query: Raw query text.
        idf_dict: Mapping token -> inverse document frequency.
        tf_idf_dict: Mapping document index -> {token: tf-idf weight}; its
            keys are expected to be ``0 .. len(tf_idf_dict) - 1``.

    Returns:
        A list of ``(document_index, score)`` tuples sorted by descending
        score. Documents sharing no weighted token with the query score 0.0.
    """
    query_vector = get_tf_idf_query(query, idf_dict)
    scores = []
    for i in tqdm(range(len(tf_idf_dict))):
        # Both vectors are sparse dicts keyed by token, so they are compared
        # directly by token instead of being converted to dense arrays.
        scores.append((i, _sparse_cosine(query_vector, tf_idf_dict[i])))
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores

In [1]:
!pip install sentence-transformers



In [2]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model once, at notebook
# scope. get_open_source_embeddings() and open_source_rankings() read this
# global `model`, so this cell must run before either of them is called.
model = SentenceTransformer('all-MiniLM-L6-v2')



In [None]:
def get_open_source_embeddings(path):
    """Embed the text of every regular file directly under ``path``.

    Args:
        path: Directory holding one document per file. Sub-directories are
            skipped, not descended into.

    Returns:
        Whatever the global ``model.encode`` returns for the list of document
        texts, one entry per file in ``os.listdir`` order.
    """
    documents = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        # Decode explicitly as UTF-8 so the text handed to the model does not
        # depend on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            documents.append(f.read())
    return model.encode(documents)
    
def open_source_rankings(query, document_embeddings):
    """Rank documents by embedding cosine similarity to ``query``.

    Args:
        query: Raw query text, encoded with the global ``model``.
        document_embeddings: Iterable of 1-D document embedding vectors, in
            document-index order.

    Returns:
        A list of ``(score, document_index)`` tuples sorted by descending
        score. Note the tuple layout is the reverse of ``tf_idf_rankings``,
        which yields ``(document_index, score)``.
    """
    query_embedding = np.asarray(model.encode(query))
    query_norm = np.linalg.norm(query_embedding)
    scores = []
    for idx, embedding in enumerate(document_embeddings):
        embedding = np.asarray(embedding)
        denominator = query_norm * np.linalg.norm(embedding)
        # A zero-length vector has no direction; score it 0.0 instead of NaN
        # so the sort below stays well-defined.
        if denominator == 0:
            score = 0.0
        else:
            score = float(np.dot(query_embedding, embedding) / denominator)
        scores.append((score, idx))
    # The score is element 0 of each tuple; sorting on element 1 would order
    # the results by document index instead of by relevance.
    scores.sort(key=lambda pair: pair[0], reverse=True)
    return scores

In [None]:
def IDF(token, ndoc_dict, document_set):
    """Return the BM25 inverse document frequency of ``token``.

    Computes ``ln((N - n + 0.5) / (n + 0.5) + 1)`` where ``N`` is
    ``len(document_set)`` and ``n`` is ``ndoc_dict``'s entry for ``token``
    (0 when the token has no entry).

    Args:
        token: The term to weight.
        ndoc_dict: Mapping token -> number of documents containing it. It is
            only read, never modified.
        document_set: The corpus; only its length is used.
    """
    # .get() keeps the lookup side-effect free: indexing a defaultdict would
    # insert the unseen token, and indexing a plain dict would raise KeyError.
    containing = ndoc_dict.get(token, 0)
    k = math.log(((len(document_set) - containing + 0.5) / (containing + 0.5)) + 1)
    return k

def get_avgdl(document_set):
    """Return the mean number of tokens per document in ``document_set``.

    Raises ``ZeroDivisionError`` when ``document_set`` is empty.
    """
    total_tokens = sum(map(len, document_set))
    return total_tokens / len(document_set)

# BM25 tuning parameters, read as globals by bm25():
#   k1 scales the length-normalisation term in the denominator and, as
#      (k1 + 1), the term-frequency numerator;
#   b  weights the document-length ratio len(document) / avgdl.
k1 = 1.75
b = 0.75

def bm25(query, document_number, document_set, tf_dict, ndoc_dict, avgdl):
    """Score one document against a tokenized query with the BM25 formula.

    Args:
        query: Iterable of query tokens.
        document_number: Index of the document in ``document_set`` / ``tf_dict``.
        document_set: List of tokenized documents; used for the document
            length and passed on to ``IDF``.
        tf_dict: Mapping document index -> {token: term frequency}.
        ndoc_dict: Mapping token -> number of documents containing it.
        avgdl: Average document length of the corpus.

    Returns:
        The sum over query tokens of
        ``IDF * f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc) / avgdl))``
        using the module-level ``k1`` and ``b``.

    NOTE(review): textbook BM25 takes ``f`` as a raw term count. If
    ``tf_dict`` holds length-normalised frequencies, scores will differ from
    the textbook values. Confirm which one callers pass.
    """
    doc_tf = tf_dict[document_number]
    doc_len = len(document_set[document_number])
    # The length normalisation is the same for every query token, so compute
    # it once. A corpus of only empty documents has avgdl == 0; treat the
    # ratio as 0 there instead of dividing by zero.
    length_ratio = doc_len / avgdl if avgdl else 0.0
    length_norm = k1 * (1 - b + (b * length_ratio))

    score = 0
    for token in query:
        # .get() does not insert zero entries into a defaultdict and also
        # works when the per-document table is a plain dict.
        f = doc_tf.get(token, 0)
        idf_query = IDF(token, ndoc_dict, document_set)
        score = score + ((idf_query * f * (k1 + 1)) / (f + length_norm))
    return score

def get_bm25_rankings(query, documents_tokenized, tf_dict, ndoc_dict=None):
    """Rank every document against ``query`` with BM25.

    Args:
        query: Raw query text; it is tokenized with ``preprocess``.
        documents_tokenized: List of tokenized documents.
        tf_dict: Mapping document index -> {token: term frequency}.
        ndoc_dict: Optional mapping token -> number of documents containing
            it. When omitted it is derived from ``documents_tokenized``.

    Returns:
        A list of ``(document_index, score)`` tuples sorted by descending
        score; an empty list for an empty corpus.
    """
    if not documents_tokenized:
        # No documents to rank, and no average length to compute.
        return []

    if ndoc_dict is None:
        # bm25() needs document frequencies; count each token once per document.
        ndoc_dict = defaultdict(int)
        for document in documents_tokenized:
            for token in set(document):
                ndoc_dict[token] += 1

    avgdl = get_avgdl(documents_tokenized)
    query = preprocess(query)
    scores = []
    for idx in range(len(documents_tokenized)):
        scores.append((idx, bm25(query, idx, documents_tokenized, tf_dict, ndoc_dict, avgdl)))
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores
