

---

**IMPORTS**

---



In [1]:
import math
import nltk
import glob
import numpy as np
import pandas as pd
import re, string, unicodedata
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages requested by name below (punkt, wordnet, stopwords).
# Presumably these back word_tokenize, WordNetLemmatizer and stopwords.words used in
# the preprocessing cell — confirm against the NLTK version in use.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True



---

**READING DATASET**

---



In [2]:
# Remote JSON-lines file (one JSON object per line); running this cell needs network access.
json_url = 'https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl'

dataset = pd.read_json(json_url, lines=True)
# Replace every missing value in the frame with 0.
dataset = dataset.fillna(0)

# Column values as arrays: claim ids and the claim texts that get indexed later.
id_list = dataset["id"].values
text_list = dataset["claim"].values

dataset.head()

Unnamed: 0,id,verifiable,label,claim,evidence
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, Nikolaj_Coster-Waldau, 7], [..."
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, Roman_Atwood, 1]], [[174271..."
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, History_of_art, 2]]]"
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, Adrienne_Bailon, 0]]]"
4,83235,NOT VERIFIABLE,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,"[[[100277, None, None, None]]]"




---

**TEXT PREPROCESSING**

---



In [3]:
def normalize(text):
    """Return a cleaned, lower-case, ASCII-only version of ``text``.

    Steps, in order: lower-case, drop digit runs, decompose with NFKD and
    discard every non-ASCII code point, drop anything that is neither a
    word character nor whitespace, then trim surrounding whitespace.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then silently drops the marks.
    ascii_bytes = unicodedata.normalize('NFKD', without_digits).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    return re.sub(r'[^\w\s]', '', ascii_text).strip()
     
def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def lemmatization(tokens):
    """Return a list with every token passed through a ``WordNetLemmatizer``."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

def remove_stop_word(tokens):
    """Return the tokens that are not in NLTK's English stop-word list, in order."""
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token not in english_stops:
            kept.append(token)
    return kept

def preprocess(text):
    """Turn raw ``text`` into a list of index terms.

    Pipeline: normalize -> tokenize -> lemmatize -> remove stop words.
    """
    return remove_stop_word(lemmatization(tokenize(normalize(text))))



---

**BUILDING INDEX**

---



In [4]:
def get_collection(doc):
    """Preprocess every text in ``doc`` and return the token lists in the same order."""
    return [preprocess(raw_text) for raw_text in doc]

In [5]:
def build_ldoc(coll):
  """Map each document's position in ``coll`` to its number of tokens."""
  lengths = {}
  for position, tokens in enumerate(coll):
    lengths[position] = len(tokens)
  return lengths

def build_index(coll):
  """Build an inverted index over a tokenised collection.

  Parameters
  ----------
  coll : sequence of token lists; a document's id is its position in ``coll``.

  Returns
  -------
  dict mapping each term to ``[total, (doc_id, count), ...]`` where ``total``
  is the term's frequency over the whole collection and each tuple gives the
  term's frequency inside one document. Terms appear in order of first
  occurrence and postings in increasing ``doc_id``.
  """
  res = dict()

  for doc_id, tokens in enumerate(coll):
    # One Counter per document replaces recounting the token and rescanning
    # the postings list for every single token occurrence. Counter keeps
    # first-occurrence order, so the index layout is unchanged.
    for term, count in Counter(tokens).items():
      postings = res.setdefault(term, [0])
      postings[0] += count
      postings.append((doc_id, count))

  return res

In [6]:
# Only the first 500 claims are preprocessed and indexed.
collection = get_collection(text_list.tolist()[:500])
index = build_index(collection)
# Document position -> token count; the same mapping build_ldoc(collection) returns.
doc_lengths = {k:len(v) for k, v in zip(range(len(collection)), collection)}

# NOTE(review): these print the entire index and length table; printing a small sample would keep the output readable.
print(index)
print(doc_lengths)

{'nikolaj': [1, (0, 1)], 'costerwaldau': [1, (0, 1)], 'worked': [7, (0, 1), (24, 1), (96, 1), (254, 1), (277, 1), (355, 1), (457, 1)], 'fox': [1, (0, 1)], 'broadcasting': [1, (0, 1)], 'company': [2, (0, 1), (31, 1)], 'roman': [1, (1, 1)], 'atwood': [1, (1, 1)], 'content': [1, (1, 1)], 'creator': [2, (1, 1), (301, 1)], 'history': [3, (2, 1), (141, 1), (422, 1)], 'art': [4, (2, 2), (106, 1), (141, 1)], 'includes': [9, (2, 1), (89, 1), (98, 1), (146, 1), (148, 1), (187, 1), (267, 1), (290, 1), (492, 1)], 'architecture': [1, (2, 1)], 'dance': [2, (2, 1), (33, 1)], 'sculpture': [1, (2, 1)], 'music': [4, (2, 1), (20, 1), (350, 1), (375, 1)], 'painting': [1, (2, 1)], 'poetry': [1, (2, 1)], 'literature': [1, (2, 1)], 'theatre': [1, (2, 1)], 'narrative': [2, (2, 1), (345, 1)], 'film': [36, (2, 1), (9, 1), (18, 1), (25, 1), (26, 1), (38, 1), (50, 1), (71, 1), (101, 2), (133, 1), (152, 1), (154, 1), (179, 1), (180, 1), (185, 1), (203, 1), (244, 1), (254, 1), (258, 1), (277, 1), (280, 1), (292, 1)



---

**TERM-DOCUMENT MATRIX (TDM)**

---



In [7]:
def tdm(ldocs, index, tindex):
  """Build the dense document-by-term count matrix.

  Parameters
  ----------
  ldocs  : dict with one entry per document; only its size is used, and
           documents are addressed as rows ``0 .. len(ldocs) - 1``.
  index  : inverted index, ``index[term] == [total, (doc_id, count), ...]``.
  tindex : dict mapping column number ``0 .. len(tindex) - 1`` to its term.

  Returns
  -------
  ``np.float64`` array of shape ``(len(ldocs), len(tindex))`` whose entry
  ``[i, j]`` is the count of term ``tindex[j]`` in document ``i`` (0 when the
  term has no posting for that document).
  """
  n_docs, n_terms = len(ldocs), len(tindex)
  res = np.zeros((n_docs, n_terms), np.float64)
  for j in range(n_terms):
    # Fill each column straight from the term's postings instead of searching
    # the postings once per (document, term) cell. Going back to front means
    # that if a doc_id were ever repeated, the earliest posting is kept.
    for doc_id, count in reversed(index[tindex[j]][1:]):
      # Postings outside the row range are ignored rather than wrapped around.
      if 0 <= doc_id < n_docs:
        res[doc_id, j] = count
  return res

In [8]:
ldocs = build_ldoc(collection)
# NOTE(review): `index` was already built from the same `collection` in the earlier cell; this line rebuilds it.
index = build_index(collection)
# Column number -> term, following the index's key order; tdm() looks terms up through it.
tindex = {i:v for i, v in enumerate(index.keys())}

# A has one row per document and one column per term.
A = tdm(ldocs, index, tindex)



---

**PCA & NORMALIZATION**

---



In [9]:
def norm_vectors(A):
    """Return a copy of ``A`` in which every row vector has unit Euclidean length.

    The earlier version divided the whole matrix by one number (its Frobenius
    norm), so no individual row was actually normalised. Each row is now
    divided by its own L2 norm. Rows that are entirely zero stay zero rather
    than becoming NaN. A 1-D input is treated as a single vector.

    The input array is never modified.
    """
    An = np.asarray(A, dtype=np.float64)

    if An.ndim == 1:
        length = np.linalg.norm(An)
        return An / length if length else An.copy()

    row_lengths = np.linalg.norm(An, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it untouched and avoids 0/0.
    row_lengths[row_lengths == 0] = 1.0
    return An / row_lengths

In [10]:
# NOTE(review): this import sits apart from the notebook's main import cell; consider moving it there.
from sklearn.decomposition import PCA

# NOTE(review): 308 is hard-coded; the recorded output reports about 85 % explained variance for it — confirm that was the target.
pca = PCA(n_components=308)
pca.fit(A)
Ak = pca.transform(A)
# explained_variance_ratio_ summed over the kept components, shown as a percentage.
print('<> Total variance: ', str(np.sum(np.array(pca.explained_variance_ratio_)) * 100) + ' %')
# An is the projected matrix after norm_vectors(); the query cell passes it on as the search space.
An = norm_vectors(Ak)

<> Total variance:  85.00195613516554 %




---

**QUERY**

---



In [11]:
def find_k_closest(query, dataset, k=10):
    """Return the ``k`` rows of ``dataset`` most cosine-similar to ``query``.

    Each result is a ``(row_number, '', similarity)`` triple, best match first;
    the middle element is always an empty-string placeholder.
    """
    similarities = cosine_similarity(query.reshape(1, -1), dataset)[0]
    ranked = [(row, '', similarities[row]) for row in range(dataset.shape[0])]
    ranked.sort(key=lambda triple: triple[2], reverse=True)
    return ranked[:k]


def query(q, index, An):
  """Find documents similar to those that contain the term ``q``.

  For every document listed in ``index[q]`` the nearest rows of ``An`` are
  fetched with ``find_k_closest``; a row reached from several documents keeps
  its highest similarity.

  Parameters
  ----------
  q     : a single index term.
  index : inverted index, ``index[term] == [total, (doc_id, count), ...]``.
  An    : matrix whose row ``doc_id`` is that document's vector.

  Returns
  -------
  list of ``(row_number, best_similarity)`` pairs, one per distinct row;
  an empty list when no hit was collected.

  Raises
  ------
  KeyError if ``q`` is not a key of ``index``.
  """
  best = {}
  for doc_id, _ in index[q][1:]:
    for row, _placeholder, score in find_k_closest(An[doc_id, :], An):
      # Same filter as before (`p < 1 or p == 1.0`): NaN and values above 1 are skipped.
      # NOTE(review): a self-match that rounds to slightly above 1.0 is dropped by this — confirm that is intended.
      if not score <= 1:
        continue
      if row not in best or score > best[row]:
        best[row] = score

  # A dict keeps the maximum per row in one pass and yields [] when nothing
  # was collected, where unpacking zip(*[]) used to raise ValueError.
  return list(best.items())


def multi_word_queries(q, index, An, k=10):
  """Answer a multi-term query and format the matches as text.

  Each term in ``q`` is looked up with ``query``; terms missing from the index
  are skipped. Documents returned for every term come first in the selection;
  if there are fewer than ``k`` of them, the best-scoring documents returned
  for only some terms fill the remaining places. The selection is then
  ordered by score, highest first.

  Parameters
  ----------
  q     : iterable of preprocessed query terms.
  index : inverted index passed through to ``query``.
  An    : document matrix passed through to ``query``.
  k     : number of results to pad up to (default 10, the former fixed value).

  Returns
  -------
  A string with one ``'<document tokens>: <score>'`` line per result, or
  ``'No results found!'`` when no term is present in the index.

  Notes
  -----
  Document tokens are read from the notebook-level ``collection`` list.
  When a document is returned for several terms, the score shown is the one
  from the last such term.
  """
  scores = {}
  hits_per_term = []

  for term in q:
    try:
      hits = query(term, index, An)
    except KeyError:
      # The term is not in the index, so it contributes nothing.
      continue
    scores.update(hits)
    hits_per_term.append({doc_id for doc_id, _ in hits})

  if not hits_per_term:
    return 'No results found!'

  in_all_terms = set.intersection(*hits_per_term)
  in_some_terms = set.union(*hits_per_term) - in_all_terms

  def by_score(doc_id):
    return scores[doc_id]

  selected = list(in_all_terms)
  # Pad only while there are fewer than k results. The former `!= 10` test
  # stayed true once the intersection exceeded 10, so every remaining partial
  # match was appended.
  missing = k - len(selected)
  if missing > 0:
    selected += sorted(in_some_terms, key=by_score, reverse=True)[:missing]

  selected.sort(key=by_score, reverse=True)

  return '\n'.join(
    '{}: {}'.format(' '.join(collection[doc_id]), scores[doc_id])
    for doc_id in selected
  )

In [12]:
q = 'Tom Hanks'
# The raw query goes through the same preprocess() as the indexed claims.
# NOTE(review): `q` changes from a string to a token list here; separate names would read more clearly.
q = preprocess(q)

print('Text: similarity\n')
result = multi_word_queries(q, index, An)
print(result)

Text: similarity

sully tom hank movie: 0.9999999999999998
sarrainodu action movie: 0.6134535368556278
sully form medium: 0.5400816125954071
tom brady played cow: 0.4521744545335853
galaxy quest movie american: 0.44011518786919485
thor dark world movie: 0.43359157861363795
paul walker acted movie: 0.3953290533214157
red headed stranger movie: 0.38610999902722
movie called hunger game: 0.3508479562092131
joe manganiello ha appeared movie: 0.34808585471835585
