In [1]:
from __future__ import unicode_literals
from numpy.core.numeric import NaN
from collections import Counter
from regex.regex import escape
import regex, copy, heapq, math, operator, random
import pandas as pd
import numpy as np
import sys,os,sys

In [2]:
# pip install hazm
from hazm import *

In [4]:
docs_links = pd.read_csv('IR00_3_11k News.csv')
unlabeled = pd.read_csv('IR_Spring2021_ph12_7k.csv')
docs_links = docs_links[docs_links['content'].notnull()]

documents = pd.read_csv('IR_Spring2021_ph12_7k.csv').iloc[:,1].values.tolist()
links = pd.read_csv('IR_Spring2021_ph12_7k.csv').iloc[:,2].values.tolist()

punctuations = pd.read_csv("punctuations.csv")
punctuations_to_remove = ''.join(punctuations["punctuations"].values.tolist()).replace(" ","")
stopwords_to_remove = ''.join(stopwords_list()).replace(" ","")

In [5]:
files = [pd.read_csv('IR00_3_11k News.csv'), pd.read_csv('IR00_3_17k News.csv'), pd.read_csv('IR00_3_20k News.csv')]
offset=int(files[0].tail(1)["id"])
files[1]["id"]=files[1]["id"]+offset
offset=int(files[1].tail(1)["id"])
files[2]["id"]=files[2]["id"]+offset

In [6]:
data=pd.read_csv('IR00_3_11k News.csv')
data["topic"].replace({"political": "politics", "sport": "sports"}, inplace=True)
data=pd.concat(files)
data.index=data.id
data = data[data['content'].notnull()]
docs_links = data 

## Tokenization

In [7]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()
N=len(docs_links["content"])
docs_len = {}

def tokenize(id, document):
    document = str(document)
    document = document.replace("انتهای پیام","")
    n_document = str(normalizer.normalize(document))
    #remove punctuations
    n_document = n_document.translate(str.maketrans('','',punctuations_to_remove))
    #remove stop words
    tokens = list(map(lambda t:lemmatizer.lemmatize(t),word_tokenize(n_document))) 
    if id is not None:
        docs_len[id] = len(tokens)
    dictionary = dict(Counter(tokens))
    for k, tf in dictionary.items():
        dictionary[k] = 1 + math.log10(tf)
    
    if id is not None:
        docs_len[id] = dictionary
    return dictionary

## Build Weighted Inverted Index

In [8]:
def build_weighted_inv_ind(docs_links):
    dictionary = {}
    for id, doc in zip(docs_links["id"], docs_links["content"]):
        counts = tokenize(id, doc)
        for k,v in counts.items():
            if len(k) < 3:
                continue
            if k in dictionary.keys():
                dictionary[k][0] += 1
                dictionary[k][1][id] = v
            else:
                dictionary[k]=[1,{id:v}]
    # calc idfs 
    for k in dictionary.keys():
        dictionary[k][0] = math.log10(N/dictionary[k][0])
    
    for id, dl in docs_len.items():
        size_d = 0
        for t, w in dl.items():
            if t in dictionary:
                size_d += dictionary[t][0]*w
        docs_len[id] = math.sqrt(size_d)
    return dictionary

In [9]:
weighted_inv_ind = build_weighted_inv_ind(docs_links)

## Build document term frequency

In [10]:
def build_doc_term_feq(docs_links):
    dictionary = {}
    for id, doc in zip(docs_links["id"], docs_links["content"]):
        counts = tokenize(None, doc)
        dictionary[id] = counts
    return dictionary

In [11]:
unlabeled = build_doc_term_feq(unlabeled)

## k-nearest neighbors (knn) classification

In [12]:
N=len(docs_links["content"])
def do_knn(doc_toks, weighted_inv_ind, k=5):
    similarities = {}
    for t,v in doc_toks.items():
        if t not in weighted_inv_ind:
            continue
        idf = weighted_inv_ind[t][0]
        for docid, w in weighted_inv_ind[t][1].items():
            if docid in similarities.keys():
                similarities[docid] += idf*w*v
            else:
                similarities[docid] = idf*w*v
    for doc_id, sim in similarities.items():
        similarities[doc_id] = sim/docs_len[doc_id]
    
    scores = similarities
    if len(scores) < k:
        k = len(scores)
    best_scores =[]
    heap = []
    for docid,score in scores.items():
        heapq.heappush(heap,(-score, docid))
    for _ in range(k):
        best_scores.append(heapq.heappop(heap))
    cats = []
    for doc in best_scores:
        doc_id = doc[1]
        cat = docs_links.loc[docs_links.index[docs_links['id'] == doc_id].tolist()[0]]["topic"]
        cats.append(cat)
    if len(cats) == 0:
        return 'none'
    return max(set(cats), key=cats.count)

In [13]:
labeled = {}
c = 0
for id, doc in unlabeled.items():
    labeled[id] = do_knn(doc, weighted_inv_ind)

In [14]:
len(docs_links)

49856

In [15]:
rplc = {"political": "politics", "sport": "sports"}
labeled = {k: rplc.get(v, v) for k, v in labeled.items()}
cats = Counter(labeled.values())
cats

Counter({'sports': 1664,
         'health': 1474,
         'culture': 147,
         'politics': 1558,
         'economy': 2153,
         'none': 4})

In [16]:
docs_links.loc[docs_links.index[docs_links['id'] == 5412].tolist()[0]]["topic"]

'economy'

# Query

In [17]:
import copy
unlabeled = pd.read_csv('IR_Spring2021_ph12_7k.csv')
df = copy.deepcopy(unlabeled)
df['topic'] = df['id'].map(labeled)
df.to_csv('IR_Spring2021_ph12_7k_labeled.csv', index=False)

In [18]:
df = pd.DataFrame.from_dict(labeled.items())
df.columns = ['id', 'topic']
df.to_csv('IR_Spring2021_ph12_7k_labeled.csv', index=False)

## Compute Similarity

In [19]:
N=len(docs_links["content"])
def compute_similarity(nd_q, weighted_inv_ind, picked_docs):
    similarities = {}
    for k,v in nd_q.items():
        if k not in weighted_inv_ind:
            continue
        idf = weighted_inv_ind[k][0]
        for docid, w in weighted_inv_ind[k][1].items():
            if docid not in picked_docs:
                continue
            if docid in similarities.keys():
                similarities[docid] += idf*w*v
            else:
                similarities[docid] = idf*w*v
    for doc_id, sim in similarities.items():
        similarities[doc_id] = sim/docs_len[doc_id]
    return similarities

## Get the best scores by calculating the most similar indexes to the query

In [20]:
def get_k_most_sim(query, weighted_inv_ind, picked_docs, k=5):
    scores = compute_similarity(tokenize(None, query), weighted_inv_ind, picked_docs)
    best_scores =[]
    heap = []
    for docid,score in scores.items():
        heapq.heappush(heap,(-score, docid))
    if len(heap) < k:
        k = len(heap)
    for _ in range(k):
        best_scores.append(heapq.heappop(heap))
    return best_scores

In [21]:
import re
def answer_query(query, df, k):
    q_toks = re.split(' ', query)
    topic = q_toks[0][4:]
    filtered_df = df[df['topic'] == topic]
    picked_docs = filtered_df['id'].tolist()
    
    answer = get_k_most_sim(query, weighted_inv_ind, picked_docs, k)
    return answer

In [25]:
def get_links(top_k, docs_links):
    results=[]
    for i in top_k:
        results.append(docs_links.loc[docs_links.index[docs_links['id'] == i[1]].tolist()[0]]["url"])
    return results

In [26]:
df = pd.read_csv('IR_Spring2021_ph12_7k_labeled.csv')

## Examples:

In [27]:
top_k = answer_query('cat:sports استقلال', df,5)
top_k

[(-0.3262301921084849, 915),
 (-0.30892837434627746, 1480),
 (-0.3065800814325119, 801),
 (-0.30161753103948996, 418),
 (-0.29989379038081343, 920)]

In [28]:
get_links(top_k, docs_links)

['https://www.isna.ir/news/99111309632/شکایت-استقلال-در-کمیته-انضباطی-فدراسیون-کشتی-بررسی-می-شود',
 'https://www.isna.ir/news/98090805350/حضور-فتحی-در-اردوی-استقلال-پیش-از-بازی-با-سپاهان-آبی-ها-پاداش',
 'https://www.isna.ir/news/99100805979/استقلال-وارد-لیگ-برتر-تکواندو-شد',
 'https://www.isna.ir/news/99060605427/جلسه-فوری-کادر-مدیریتی-استقلال-برای-استعفای-مجیدی',
 'https://www.isna.ir/news/99111511845/پنجره-نقل-و-انتقالات-باشگاه-استقلال-بسته-شد']

In [29]:
labeled[75]

'economy'

In [30]:
top_k = answer_query('cat:health کرونا', df,5)
get_links(top_k, docs_links)

['https://www.isna.ir/news/99082315432/واکسن-های-ایرانی-کرونا-در-فهرست-کاندیداهای-واکسن-سازمان-بهداشت',
 'https://www.isna.ir/news/99041108441/تعطیلی-بازارهای-سرپوشیده-در-۴-شهرستان-استان-بوشهر',
 'https://www.isna.ir/news/99010200782/تکذیب-یک-ادعا-درباره-جانباختگان-کرونا-در-ایران',
 'https://www.isna.ir/news/99092821716/کاهش-موارد-بستری-به-۷۵-بیمار-ثبت-یک-روز-بدون-فوتی-کرونا',
 'https://www.isna.ir/news/99050100731/اعلام-اسامی-مراکز-غربالگری-کرونا-در-استان-تهران']

In [37]:
top_k = answer_query('cat:politics کرونا', df,5)
get_links(top_k, docs_links)

['https://www.isna.ir/news/99061309882/تکذیب-شایعه-ابتلای-آیت-الله-جنتی-به-کرونا',
 'https://www.isna.ir/news/99050806366/لاریجانی-دوباره-کرونا-گرفت',
 'https://www.isna.ir/news/99091108977/چنارانی-و-دلخوش-به-کرونا-مبتلا-شدند',
 'https://www.isna.ir/news/99011608234/محمدرضا-خاتمی-از-بیمارستان-مرخص-شد',
 'https://www.isna.ir/news/99041410245/زاهدی-از-بیمارستان-مرخص-شد']

In [39]:
top_k = answer_query('cat:economy کرونا', df,5)
get_links(top_k, docs_links)

['https://www.isna.ir/news/99010703339/تزریق-بیش-از-۳۸۳۳-میلیارد-تومان-دیگر-توسط-دولت-به-نظام-سلامت',
 'https://www.isna.ir/news/99021410064/کرونا-پشت-ثروتمندترین-کشورهای-دنیا-را-خم-کرده-است',
 'https://www.isna.ir/news/99022013695/کاهش-۵۰-درصدی-حقوق-کارمندان-اتحادیه-جهانی-کشتی',
 'https://www.isna.ir/news/99031911937/بازگشت-جمعی-از-ایرانیان-مقیم-قزاقستان-با-پرواز-فوق-العاده-به',
 'https://www.isna.ir/news/99060906501/چین-نخستین-اقتصادی-که-از-کرونا-عبور-کرد']