In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import re

In [23]:
# Load the news dataset; lines=True tells pandas the file holds one JSON object per line.
df = pd.read_json('News_Dataset.json', lines = True)

In [24]:
# Optional row limit (disabled): uncomment to keep only the first 50,000 rows.
# NOTE(review): presumably meant to keep the dense term matrix built further down small — confirm.
#df = df[:50000]

In [25]:
# Peek at the last five rows to check the columns and their contents.
df.tail(5)

Unnamed: 0,link,headline,category,short_description,authors,date
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28
209526,https://www.huffingtonpost.com/entry/dwight-ho...,Dwight Howard Rips Teammates After Magic Loss ...,SPORTS,The five-time all-star center tore into his te...,,2012-01-28


In [26]:
# Count missing values per column.
# isna() only flags NaN/None: empty strings (such as the blank `authors` cells
# visible in the tail above) are not counted as missing.
df.isna().sum() 

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [27]:
# Remove rows that are exact duplicates across all columns.
# The surviving rows keep their original index labels, so the index has gaps afterwards.
df = df.drop_duplicates() 

In [28]:
# Number of distinct values in each column.
df.nunique()

link                 209486
headline             207996
category                 42
short_description    187022
authors               29169
date                   3890
dtype: int64

In [29]:
# Summary of row count, dtypes and memory usage after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209514 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209514 non-null  object        
 1   headline           209514 non-null  object        
 2   category           209514 non-null  object        
 3   short_description  209514 non-null  object        
 4   authors            209514 non-null  object        
 5   date               209514 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 11.2+ MB


In [30]:
# Bag-of-words term counts over the short descriptions, dropping scikit-learn's
# built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
# documents_vectorized: document-by-term count matrix (its shape is printed in the next cell).
documents_vectorized = vectorizer.fit_transform(df['short_description'])
# vocabulary: the term belonging to each column of documents_vectorized.
vocabulary = vectorizer.get_feature_names_out()
# Display the vocabulary as the cell output.
vocabulary

array(['00', '000', '0000', ..., 'ﬁrst', 'ﬁx', 'ﬂavors'], dtype=object)

In [31]:
# Report corpus size: shape unpacks to (number of documents, number of vocabulary terms).
print ('We have a {} document corpus with a {} term vocabulary'.format(*documents_vectorized.shape))

We have a 209514 document corpus with a 75420 term vocabulary


In [32]:
# Expand the count matrix into a dense DataFrame: one row per document, one column per term.
# NOTE: this rebinds `df`, so the news DataFrame loaded earlier is no longer reachable by that name.
# NOTE(review): .toarray() materialises every cell (209514 x 75420 according to the shape
# printed above) — confirm this fits in memory, or re-enable the row limit near the top.
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
# Row labels of the new frame. No index is passed above, so these are consecutive positions,
# not the (gapped) labels the news frame had after drop_duplicates().
# doc_ids is not used in the visible part of this notebook.
doc_ids = df.index.values


In [33]:
# Preview the term counts of the first 100 documents.
df[:100]

Unnamed: 0,00,000,0000,000x,001,0010,0011,002,004,006,...,плохая,семья,финансирования,харьковского,আইভ,ಠ_ಠ,ﬁnd,ﬁrst,ﬁx,ﬂavors
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
def BM25_IDF_df(df, k_1=1.2, b=0.8):
  """
  Pre-compute BM25 x IDF weights for every (document, term) cell.

  For a term t in document d the weight is

      idf(t) * (k_1 + 1) * tf(t, d) / (k_1 * ((1 - b) + b * dl(d) / avgdl) + tf(t, d))

  with idf(t) = -log(df(t) / N), where df(t) is the number of documents
  containing t, N the number of documents, dl(d) the summed term counts of
  document d and avgdl the mean of dl over all documents.

  Parameters
  ----------
  df : pandas.DataFrame
      Term counts, one row per document and one column per term.
  k_1 : float, default 1.2
      Term-frequency saturation parameter.
  b : float, default 0.8
      Strength of the document-length normalisation (0 disables it).

  Returns
  -------
  pandas.DataFrame
      Same shape, column labels and index as `df`, holding the weights.
      Terms that occur in no document, and cells whose denominator is zero,
      get weight 0 instead of NaN.
  """
  tf = np.asarray(df, dtype=float)
  n_docs = tf.shape[0]

  # Document frequency per term; a term absent from every document gets idf 0
  # so that 0 * inf never turns into NaN.
  doc_freqs = (tf > 0).sum(axis=0)
  with np.errstate(divide='ignore', invalid='ignore'):
    idfs = np.where(doc_freqs > 0, -np.log(doc_freqs / n_docs), 0.0)

  # Document lengths relative to the average; fall back to 1 when the corpus
  # is empty or contains only empty documents (avgdl == 0).
  dls = tf.sum(axis=1)
  avgdl = dls.mean() if n_docs else 0.0
  if avgdl > 0:
    relative_dl = dls / avgdl
  else:
    relative_dl = np.ones_like(dls)

  numerator = (k_1 + 1) * tf
  # reshape(-1, 1) turns the per-document norm into a column so it broadcasts over terms.
  denominator = (k_1 * ((1 - b) + b * relative_dl)).reshape(-1, 1) + tf

  # Cells with a zero denominator (only possible for tf == 0) stay at 0.
  BM25_tf = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator > 0)

  BM25_score = BM25_tf * idfs

  # Label the result from the input frame rather than from a notebook-level global.
  index = df.index if hasattr(df, 'index') else None
  columns = df.columns if hasattr(df, 'columns') else None
  return pd.DataFrame(BM25_score, index=index, columns=columns)


In [None]:
# Pre-compute the BM25-IDF weight of every term in every document.
bm25_df = BM25_IDF_df(df)  # a dataframe with BM25-idf weights
# Show the weights of the first five documents.
bm25_df[:5]

In [None]:
# Example queries, keyed by integer query id.
queries = {
  0: 'ship wreck',
  1: 'little boat',
}

def retrieve_ranking(query, bm25_df):
  """
  Rank all documents for a free-text query by summed BM25-IDF weight.

  The query is split on any run of whitespace. A term is looked up among the
  columns of `bm25_df` as written and, failing that, in lower case; terms
  that match no column are ignored instead of raising KeyError. A term that
  appears several times in the query is counted that many times.

  Parameters
  ----------
  query : str
      Whitespace-separated query terms.
  bm25_df : pandas.DataFrame
      Pre-computed weights, one row per document and one column per term.

  Returns
  -------
  list of (document id, score)
      One tuple per row of `bm25_df`, using its index labels as document
      ids, sorted by descending score. Documents with equal scores keep
      their original order. If no query term is known, every score is 0.
  """
  known_terms = []
  for term in query.split():  # split() without argument drops empty tokens
    if term in bm25_df.columns:
      known_terms.append(term)
    elif term.lower() in bm25_df.columns:
      known_terms.append(term.lower())
    # otherwise: out-of-vocabulary term, contributes nothing to the score

  score_q_d = bm25_df[known_terms].sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values),
                key=lambda tup: tup[1],
                reverse=True)


# Let's look at the first few scores for our query and document combinations.
# Only the top 10 (document id, score) pairs are printed: the full ranking has
# one entry per document in the corpus.
for query_id, query in queries.items():
  print(f'Query {query_id}: {query}')
  print('')
  print(retrieve_ranking(query, bm25_df)[:10])
  print('')

In [None]:

def precision_at_k(query_id, k=5, relevant_docs=None):
  """
  Precision of the k top-ranking documents for one query.

  Parameters
  ----------
  query_id : key of the notebook-level `queries` dict
      Identifies the query to evaluate.
  k : int, default 5
      Number of top-ranking documents to consider.
  relevant_docs : iterable of document ids, optional
      Ids (index labels of `bm25_df`) judged relevant for this query.
      When omitted, no document is treated as relevant.

  Returns
  -------
  (TP, FP, precision)
      TP: retrieved documents that are relevant; FP: retrieved documents that
      are not; precision: TP / (TP + FP), or 0.0 when nothing was retrieved.
  """
  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)

  # take only the document id, rather than score
  retrieved = [doc[0] for doc in doc_ranking[:k]]

  relevant = set() if relevant_docs is None else set(relevant_docs)

  TP = sum(1 for doc_id in retrieved if doc_id in relevant)  # number of true positives
  FP = len(retrieved) - TP  # number of false positives

  # guard against an empty result list (k == 0 or an empty corpus)
  precision = TP / len(retrieved) if retrieved else 0.0

  return TP, FP, precision


# Let's see what we get when we consider the top 5 ranking documents:
def print_precision_for_all_queries(k=5, relevance_judgments=None):
  """
  Print precision@k for every query in the notebook-level `queries` dict.

  Parameters
  ----------
  k : int, default 5
      Number of top-ranking documents to consider per query.
  relevance_judgments : dict, optional
      Maps a query id to an iterable of relevant document ids. Queries
      without an entry are evaluated with no relevant documents.
  """
  judgments = relevance_judgments or {}
  for query_id, query in queries.items():
    TP, FP, precision = precision_at_k(query_id, k=k,
                                       relevant_docs=judgments.get(query_id))
    print(f'retrieved query "{query}" with precision @ {k}: {precision} (TP: {TP}, FP: {FP})')