In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from math import log10, sqrt,log

In [20]:
from tqdm import tqdm

In [3]:
# Load the WikiHow sample from the working directory; later cells read its
# 'summary' and 'title' columns.
WikiHow_sample_all = pd.read_csv('WikiHow_sample_all_withsummary.csv')

In [4]:
def preprocess(documents):
    """Turn a raw text string into a list of normalized tokens.

    Steps, in order: split the text on runs of word characters, keep only the
    purely alphabetic pieces (lower-cased), drop English stop words through
    ``remove_stopwords``, and finally lemmatize every survivor as a verb.

    Parameters
    ----------
    documents : str
        The text to tokenize.

    Returns
    -------
    list of str
        The normalized tokens, in their original order.
    """
    raw_words = RegexpTokenizer(r'\w+').tokenize(documents)
    alphabetic = []
    for word in raw_words:
        # Anything containing a digit or underscore is discarded entirely.
        if word.isalpha():
            alphabetic.append(word.lower())
    content_words = remove_stopwords(alphabetic)
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word, pos='v') for word in content_words]

In [7]:
def remove_stopwords(tokens):
    """Return a new list holding the tokens that are not English stop words.

    The stop-word list comes from NLTK's ``stopwords`` corpus; the relative
    order of the surviving tokens is unchanged.
    """
    english_stops = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in english_stops]

In [5]:
def get_inverted_index(data):
    """Collect per-term statistics over every row that has a real summary.

    Rows whose ``summary`` equals the placeholder string ``'empty'`` are
    skipped; every other summary is run through ``preprocess`` and counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``summary`` column of text.

    Returns
    -------
    dict
        Maps each preprocessed term to a two-element list
        ``[document_frequency, collection_frequency]``: the number of
        summaries containing the term, and its total number of occurrences
        across those summaries.
    """
    # Select by boolean mask instead of `data.loc[:n, :]`: a label slice
    # includes its end point (one row too many when some summaries are
    # 'empty') and is only right if all non-empty rows happen to come first.
    summaries = data.loc[data['summary'] != 'empty', 'summary']
    inverted_index = {}
    for summary in summaries:
        tokens_dist = nltk.FreqDist(preprocess(summary))
        for voc, count in tokens_dist.items():
            if voc in inverted_index:
                inverted_index[voc][0] += 1
                inverted_index[voc][1] += count
            else:
                inverted_index[voc] = [1, count]
    return inverted_index

In [8]:
# Term statistics ([document frequency, total occurrences] per term) built
# from the preprocessed summaries; BM25 reads the document frequency from it.
inverted_index = get_inverted_index(WikiHow_sample_all)

In [12]:
def length(data):
    """Compute the mean preprocessed summary length and the number of rows.

    Each row is looked up by the labels ``0 .. len(data) - 1`` and its
    ``summary`` is tokenized with ``preprocess``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``summary`` column of text.

    Returns
    -------
    tuple
        ``(average_token_count, row_count)``.

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no rows.
    """
    row_count = len(data)
    token_total = sum(
        len(preprocess(data.loc[row, 'summary'])) for row in range(row_count)
    )
    return token_total / row_count, row_count

# avdl: average number of preprocessed tokens per summary; N: number of rows.
# Both are passed to BM25 below.
avdl, N = length(WikiHow_sample_all)

In [21]:
def BM25(data, inverted_index,avdl,N, k1 = 1.2, k2 = 100,b = 0.75):
    """Score each row's summary against that row's title with BM25.

    The title plays the role of the query and the summary the role of the
    document. Both are tokenized with ``preprocess``. Only title terms that
    are present in ``inverted_index`` contribute to the score.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``summary`` and ``title`` text columns. It is modified in
        place: a float ``bm25`` column is added (or overwritten).
    inverted_index : dict
        Maps a term to a sequence whose first element is the term's document
        frequency.
    avdl : float
        Average document length in tokens; must be non-zero.
    N : int
        Number of documents in the collection.
    k1, k2, b : float
        BM25 parameters: document term-frequency saturation, query
        term-frequency saturation, and length-normalization strength.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying the ``bm25`` column.
    """
    # Pull the text out once and walk it positionally, so the function does
    # not depend on the frame having a 0..n-1 label index.
    summaries = data['summary'].tolist()
    titles = data['title'].tolist()
    scores = np.zeros(len(data))
    for i in tqdm(range(len(data))):
        tokens_p = preprocess(summaries[i])
        tokens_q = preprocess(titles[i])
        f_p = nltk.FreqDist(tokens_p)
        f_q = nltk.FreqDist(tokens_q)
        dl = len(tokens_p)
        K = k1*((1-b)+b*(dl/avdl))
        bm25 = 0.0
        for token, qf in f_q.items():
            stats = inverted_index.get(token)
            tf = f_p[token]
            # A term missing from the summary contributes exactly zero, so
            # skip it; this also avoids 0/0 when K is zero (b == 1 and an
            # empty summary).
            if stats is None or tf == 0:
                continue
            doc_freq = stats[0]
            term1 = log((N-doc_freq+0.5)/(doc_freq+0.5))
            term2 = (k1+1)*tf/(K+tf)
            term3 = (k2+1)*qf/(k2+qf)
            bm25 += term1 *term2 *term3
        scores[i] = bm25
    # One column assignment instead of a `.loc` write per row.
    data['bm25'] = scores
    return data

In [22]:
# Score every row's summary against its title. BM25 adds a 'bm25' column to
# the frame it receives and returns that same frame; the keyword values
# repeat the function's defaults.
WikiHow_sample_all = BM25(WikiHow_sample_all, inverted_index,avdl,N, k1 = 1.2, k2 = 100,b = 0.75)

100%|███████████████████████████████████| 49642/49642 [00:36<00:00, 1374.88it/s]


In [27]:
# Persist the scored frame. This overwrites the file the notebook loads at the
# top, so index=False keeps the row index out of it; otherwise every
# read_csv/to_csv round trip adds another 'Unnamed: 0' column.
WikiHow_sample_all.to_csv('WikiHow_sample_all_withsummary.csv', index=False)