In [1]:
from __future__ import unicode_literals
from numpy.core.numeric import NaN
from regex.regex import escape
from collections import Counter
import regex, copy, heapq, math, operator, random
import pandas as pd
import numpy as np
import sys,os,sys
import operator

In [2]:
# pip install hazm
from hazm import *

## Reading data

In [3]:
docs_links = pd.read_csv('IR00_3_11k News.csv')
docs_links = docs_links[docs_links['content'].notnull()]

punctuations = pd.read_csv("punctuations.csv")
punctuations_to_remove = ''.join(punctuations["punctuations"].values.tolist()).replace(" ","")
stopwords_to_remove = ''.join(stopwords_list()).replace(" ","")

In [4]:
files = [pd.read_csv('IR00_3_11k News.csv'), pd.read_csv('IR00_3_17k News.csv'), pd.read_csv('IR00_3_20k News.csv')]

In [5]:
offset=int(files[0].tail(1)["id"])
files[1]["id"]=files[1]["id"]+offset
offset=int(files[1].tail(1)["id"])
files[2]["id"]=files[2]["id"]+offset

In [6]:
data = pd.read_csv('IR00_3_11k News.csv')
data["topic"].replace({"political": "politics", "sport": "sports"}, inplace=True)
data = pd.concat(files)
data.index = data.id
data = data[data['content'].notnull()]
docs_links = data 

## Tokenization

In [7]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()
docs_len = {}

def tokenize(id, document):
    document = str(document)
    document = document.replace("انتهای پیام","")
    n_document = str(normalizer.normalize(document))
    #remove punctuations
    n_document = n_document.translate(str.maketrans('','',punctuations_to_remove))
    #remove stop words
    tokens = list(map(lambda t:lemmatizer.lemmatize(t),word_tokenize(n_document))) 
    if id is not None:
        docs_len[id] = len(tokens)

    dictionary = dict(Counter(tokens))
    for k, tf in dictionary.items():
        dictionary[k] = 1 + math.log10(tf)
    
    if id is not None:
        docs_len[id] = dictionary
    return dictionary

## Build Weighted Inverted Index

In [8]:
N=len(docs_links["content"])
def build_weighted_inv_ind(docs_links):
    dictionary = {}
    for id, doc in zip(docs_links["id"], docs_links["content"]):
        counts = tokenize(id, doc)
        for k,v in counts.items():
            if len(k) < 3:
                continue
            if k in dictionary.keys():
                dictionary[k][0] += 1
                dictionary[k][1][id] = v
            else:
                dictionary[k]=[1,{id:v}]
    # calc idfs 
    for k in dictionary.keys():
        dictionary[k][0] = math.log10(N/dictionary[k][0])
    
    for id, dl in docs_len.items():
        size_d = 0
        for t, w in dl.items():
            if t in dictionary:
                size_d += dictionary[t][0]*w
        docs_len[id] = math.sqrt(size_d)
    return dictionary

## Build Champion Lists

In [9]:
def build_champion_lists(weighted_inv_ind):
    champion_lists = {}
    for t, ws in weighted_inv_ind.items():
        champion_lists[t] = []
        champion_lists[t].append(ws[0])
        champion_lists[t].append(dict(sorted(ws[1].items(), key=operator.itemgetter(1), reverse=True)[:10]))
    return champion_lists


In [10]:
%%time
weighted_inv_ind = build_weighted_inv_ind(docs_links)

CPU times: user 56.1 s, sys: 634 ms, total: 56.7 s
Wall time: 56.7 s


## Build document term frequency

In [11]:
def build_doc_term_feq(docs_links):
    dictionary = {}
    for id, doc in zip(docs_links["id"], docs_links["content"]):
        counts = tokenize(None, doc)
        dictionary[id] = [sum(counts.values()), counts]
    return dictionary

In [12]:
doc_term_feq = build_doc_term_feq(docs_links)

## Compute Similarity

In [13]:
N=len(docs_links["content"])
def compute_similarity(nd_q, weighted_inv_ind):
    similarities = {}
    for k,v in nd_q.items():
        if k not in weighted_inv_ind:
            continue
        idf = weighted_inv_ind[k][0]
        for docid, w in weighted_inv_ind[k][1].items():
            if docid in similarities.keys():
                similarities[docid] += idf*w*v
            else:
                similarities[docid] = idf*w*v
    for doc_id, sim in similarities.items():
        similarities[doc_id] = sim/docs_len[doc_id]
    return similarities

## k-means clustering

In [14]:
def do_kmeans(doc_term_freq):
    sims_dict = {}
    initial_centers = []
    for _ in range(10):
        initial_centers.append(doc_term_freq[random.randint(1,N+1)][1])
    
    clusters = {}
    for _ in range(30):
        clusters = {}
        for i in range(len(initial_centers)):
            sims = compute_similarity(initial_centers[i], weighted_inv_ind)
            for doc_id, sim in sims.items():
                sims_dict[(i, doc_id)] = sim
        
        # clustering
        doc_cluster = {}
        for ids in sims_dict.keys():
            max_sim = -math.inf
            max_cen = None
            for i in range(len(initial_centers)):
                cen_id = i
                if (cen_id, ids[1]) in sims_dict:
                    if sims_dict[(cen_id, ids[1])] > max_sim:
                        max_sim = sims_dict[(cen_id, ids[1])]
                        max_cen = cen_id
            doc_cluster[ids[1]] = max_cen
        
        #clusters = {}
        for doc_id, cen_id in doc_cluster.items():
            if cen_id in clusters:
                clusters[cen_id].append(doc_id)
            else:
                clusters[cen_id] = [doc_id]

        # calc avg
        new_cens = []
        for cen_id, doc_ids in clusters.items():
            avg_tf = {}
            for doc_id in doc_ids:
                term_freq = doc_term_freq[doc_id][1]
                for t, f in term_freq.items():
                    if t in avg_tf:
                        avg_tf[t] += (1+math.log(f))/len(clusters[cen_id])
                    else:
                        avg_tf[t] = (1+math.log(f))/len(clusters[cen_id])
            new_cens.append(avg_tf)
        initial_centers = new_cens
    return initial_centers, clusters

In [15]:
import tqdm
def calc_RSS(centers, clusters, center_len):
    rss_l = [0 for i in range(len(centers))]
    for k,v in clusters.items():
        for i in tqdm(v):
            for word in set(centers[k].keys()).union(set(docs_links[i].keys())):
                rss_l[k] += abs(centers.get(word,0) - docs_links[i].get(word,0))**2
        if center_len[k]!=0:
            rss_l[k] /= center_len[k]
    return sum(rss_l)

In [16]:
centers, clusters = do_kmeans(doc_term_feq)

In [17]:
clusters.keys()

dict_keys([0, 1, 2, 3, 5, 6, 7, 8, 9, 4])

In [18]:
for k, v in clusters.items():
    print(len(v))

10124
26106
2386
4900
2177
314
1306
994
76
1150


# Query

In [19]:
N=len(docs_links["content"])
def compute_similarity(nd_q, weighted_inv_ind):
    similarities = {}
    for k,v in nd_q.items():
        if k not in weighted_inv_ind:
            continue
        idf = math.log(N/weighted_inv_ind[k][0])
        for docid, w in weighted_inv_ind[k][1].items():
            if docid in similarities.keys():
                similarities[docid] += idf*w*v
            else:
                similarities[docid] = idf*w*v
    for doc_id, sim in similarities.items():
        similarities[doc_id] = sim/docs_len[doc_id]
    return similarities

In [20]:
def pick_center(nd_q, centers, weighted_inv_ind):
    similarities = {}
    for k,v in nd_q.items():
        if k not in weighted_inv_ind:
            continue
        idf = weighted_inv_ind[k][0]
        for i in range(len(centers)):
            if k in centers[i]:
                if i in similarities:
                    similarities[i] += idf*centers[i][k]*v
                else:
                    similarities[i] = idf*centers[i][k]*v
    return max(similarities.items(), key=operator.itemgetter(1))[0]



In [34]:
def get_k_most_sim(query, clusters, doc_term_feq, weighted_inv_ind, k=5):
    q_tokens = tokenize(None, query)
    picked_center = pick_center(q_tokens, centers, weighted_inv_ind)
    sims = {}
    for t, v in q_tokens.items():
        if t not in weighted_inv_ind:
            continue
        idf = weighted_inv_ind[t][0]
        for doc_id in clusters[str(picked_center)]:
            #print(doc_id)
            doc_terms = doc_term_feq[doc_id][1]
            #print(doc_terms)
            if t in doc_terms:
                if doc_id in sims:
                    sims[doc_id] += idf*doc_terms[t]*v
                else:
                    sims[doc_id] = idf*doc_terms[t]*v
    scores = sims
    if len(scores) < k:
        k = len(scores)
    best_scores =[]
    heap = []
    for docid,score in scores.items():
        heapq.heappush(heap,(-score, docid))
    for _ in range(k):
        best_scores.append(heapq.heappop(heap))
    return best_scores

In [35]:
top_k = get_k_most_sim("اعزام کاروان های اردوی راهیان نور", clusters, doc_term_feq, weighted_inv_ind, 5)
top_k

<class 'int'>


[(-14.58628306529145, 24911),
 (-13.949702708906685, 10090),
 (-13.23003045700133, 10031),
 (-13.23003045700133, 10043),
 (-12.036480758470455, 10692)]

In [36]:
def get_links(top_k, docs_links):
    results=[]
    for i in top_k:
        results.append(docs_links.loc[docs_links.index[docs_links['id'] == i[1]].tolist()[0]]["url"])
    return results

In [37]:
get_links(top_k, docs_links)

['https://www.farsnews.ir/news/13991215000577/تکلیف-اردوهای-راهیان-نور-در-دومین-نوروز-کرونایی-چه-می\u200cشود',
 'https://www.isna.ir/news/98043015977/تشریح-جزئیات-حضور-مردم-اردبیل-در-یادمان-های-شمال-غرب-کشور',
 'https://www.isna.ir/news/98041910370/خوزستانی-ها-به-مناطق-عملیاتی-غرب-کشور-می-روند',
 'https://www.isna.ir/news/98041910370/خوزستانی-ها-به-مناطق-عملیاتی-غرب-کشور-می-روند',
 'https://www.isna.ir/news/98081408553/اعلام-جزئیات-حضور-دانش-آموزان-گیلانی-در-اردوهای-راهیان-نور']

In [38]:
import json
a_file = open("clusters.json", "w")
a_file = json.dump(clusters, a_file)

In [39]:
a_file = open("clusters.json", "r")
clusters = json.load(a_file)
clusters

for k, v in clusters.items():
    print(len(v))

10124
26106
2386
4900
2177
314
1306
994
76
1150


In [40]:
top_k = get_k_most_sim("سرمایه گذاری در بازار بورس", clusters, doc_term_feq, weighted_inv_ind, 5)
get_links(top_k, docs_links)

<class 'int'>


['https://www.farsnews.ir/news/13991105000917/دولت-بازار-سرمایه-را-سیاسی-کرد-فروش-سهام-عدالت-از-سوی-کارگزاری\u200cها-و',
 'https://www.farsnews.ir/news/13990224001102/تامین-مالی-000-میلیارد-تومانی-در-بورس-تهران-مردم-در-2000-نقطه-به-بازار',
 'https://www.farsnews.ir/news/13990916000390/سازمان-بورس-آماده-برخورد-سریع-و-قاطع-با-متخلفان-تضعیف-مقام-ناظر-بدعتی',
 'https://www.farsnews.ir/news/13991217000737/بورس-ایران-تحلیل-پذیری-خود-را-از-دست-داده-است-ریسک-رمزارزها-برای',
 'https://www.farsnews.ir/news/13990528000138/ورود-0-هزار-میلیارد-تومان-نقدینگی-تازه-واردها-به-بازار-سرمایه-مسئولیت']