## Import Data

In [5]:
import csv
import json
import math
import os
import pickle
import random
import re
import sys
from collections import Counter
from string import digits

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from numpy import dot
from numpy.linalg import norm
from tqdm import tqdm_notebook as tqdm

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cengqiqi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cengqiqi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cengqiqi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# import claims
def load_dataset_json(path, instance_num=1e6):
    """
    Read a JSON Lines file (one JSON object per line) and return the parsed records.

    Parameters
    ----------
    path : str
        Location of the ``.jsonl`` file (the FEVER training set in this notebook).
    instance_num : int, float or None
        Maximum number of records to load; useful for debugging.
        ``None`` loads the whole file, a value <= 0 loads nothing.

    Returns
    -------
    list
        The decoded JSON objects, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    if instance_num is not None and instance_num <= 0:
        return data

    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as openfile:
        # Iterate the handle lazily: readlines() would pull the whole file into
        # memory even when only a handful of records is requested.
        for line in openfile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data.append(json.loads(line))
            if instance_num is not None and len(data) >= instance_num:
                break
    return data
# Candidate locations of the training file; only the macOS path is used below.
path_windows = "N:\\DesktopSettings\\Desktop\\DM_working\\dataset\\train.jsonl"
path_mac = "/Users/cengqiqi/Desktop/DM_working/dataset/train.jsonl"
dataset = load_dataset_json(path=path_mac, instance_num=20)

# Keep the first 10 verifiable claims among the loaded examples.
# Claim ids noted by the author:
# [75397, 150448, 214861, 156709, 129629, 33078, 6744, 226034, 40190, 76253].
claims_10 = [
    example for example in dataset if example['verifiable'] == 'VERIFIABLE'
][0:10]

In [3]:
# Restore the pickled inverted index (word -> {"doc<i>": count}) and the
# per-document length list. Pickle files must come from a trusted source.
with open('inverted_word_dictionary.txt', 'rb') as index_file:
    inverted_word_dictionary = pickle.load(index_file)

with open("doc_length_list.txt", "rb") as lengths_file:
    doc_length_list = pickle.load(lengths_file)

# Read the wiki id/text table; only the macOS path is used below.
path_windows = "N:\\DesktopSettings\\Desktop\\DM_working\\dataset\\wiki_id_text"
path_mac = "/Users/cengqiqi/Desktop/DM_working/dataset/wiki_id_text"
dataset_wikipage = pd.read_table(path_mac, header=None)
wikipage = dataset_wikipage

## TF-IDF for claims

In [4]:
# process claims

# numpy
# nltk tokenizer
# nltk stemmer

####### for lemmatize #################
def get_pos(a_single_word):
    """
    Map a part-of-speech tag to the matching WordNet POS constant.

    Despite the parameter name, the argument is a POS *tag*, not a word: the
    (commented-out) lemmatizer call in this notebook passes the tag element of
    the pairs produced by ``pos_tag``. Only the first letter of the tag matters.

    Requires ``wordnet`` (``from nltk.corpus import wordnet``) in module scope.

    Parameters
    ----------
    a_single_word : str or None
        POS tag such as ``'JJ'``, ``'VBD'``, ``'NNS'`` or ``'RB'``.

    Returns
    -------
    ``wordnet.ADJ`` / ``wordnet.VERB`` / ``wordnet.NOUN`` / ``wordnet.ADV`` for
    tags starting with ``J`` / ``V`` / ``N`` / ``R``; ``None`` for anything else,
    including an empty or missing tag.
    """
    if not a_single_word:
        return None

    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(a_single_word[0])

def claimProcess(words):
    """
    Turn a raw claim string into a list of normalised tokens.

    Steps: lower-case, regex tokenisation, English stop-word removal, then
    WordNet lemmatisation treating every token as a verb (``pos='v'``).

    Parameters
    ----------
    words : str
        The claim text.

    Returns
    -------
    list of str
        Processed tokens in their original order (duplicates are kept).
    """
    #######  lower case  ############################################################
    words = words.lower()

    ####### tokenize ###############################################################
    # NOTE(review): the text is already lower-cased, so the upper-case
    # abbreviation branch below can never match ("U.S.A." ends up as
    # 'u', 's', 'a'). Left unchanged on purpose: the tokens must stay in sync
    # with however the inverted index was built -- confirm before changing.
    pattern = r"""(?x)                  
                          (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
                          |\$?\d+(?:,\d+)*(?:\.\d+)?%? # 2,000 or 2.5
                          |\w+(?:[-']\w+)*      # words w/ optional internal hyphens/apostrophe  e.g. can't
                        """
    word_list = nltk.regexp_tokenize(words, pattern)

    ####### remove stop words + lemmatize ##########################################
    stopwordlist = set(stopwords.words('english'))
    # Build the lemmatizer once per call instead of once per token.
    # (POS-tag based lemmatisation via get_pos was tried and found too slow.)
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(w, pos='v')
        for w in word_list
        if w not in stopwordlist
    ]
    

In [6]:
def computeCosSimilarity(word_list, inverted_word_dictionary=None, doc_length_list=None):
    """
    Cosine similarity between one claim and every document, using TF-IDF
    weights restricted to the distinct words of the claim.

    Parameters
    ----------
    word_list : list of str
        Tokens of a single claim, e.g. ``claimProcess(claims_10[1]['claim'])``.
    inverted_word_dictionary : dict, optional
        Inverted index ``word -> {"doc<i>": count of word in document i}``.
        Defaults to the module-level ``inverted_word_dictionary``.
    doc_length_list : sequence of numbers, optional
        ``doc_length_list[i]`` is the total word count of document ``i``.
        Defaults to the module-level ``doc_length_list``.

    Returns
    -------
    list of float
        One score per document, in document order. A document scores 0.0 when
        it has zero length or shares no weighted word with the claim; an empty
        claim scores 0.0 against every document.

    Notes
    -----
    ``idf(word) = log(N / (df + 1))`` with ``N`` the number of documents and
    ``df`` the number of documents containing the word. Claim words that are
    missing from the index are treated as ``df = 0`` instead of raising.
    """
    # Resolve the defaults at call time rather than at definition time, so that
    # re-running the cell that loads the index is picked up without having to
    # re-run this definition. globals() is needed because the parameters shadow
    # the module-level names.
    if inverted_word_dictionary is None:
        inverted_word_dictionary = globals()["inverted_word_dictionary"]
    if doc_length_list is None:
        doc_length_list = globals()["doc_length_list"]

    n_docs = len(doc_length_list)
    count_claims = Counter(word_list)
    if n_docs == 0 or not count_claims:
        # Nothing to compare against / nothing to compare with.
        return [0.0] * n_docs

    # Fixed column order shared by every vector below.
    terms = list(count_claims)
    # Postings per claim word; a word missing from the index has no postings.
    postings = [inverted_word_dictionary.get(term, {}) for term in terms]

    # tf for the claim
    tf_total_d = len(word_list)
    tf_claims_array = np.array([count_claims[term] / tf_total_d for term in terms])

    # tf for the docs: one row per document, one column per claim word
    tf_doc_array = np.zeros((n_docs, len(terms)))
    for i, tf_total_c in enumerate(doc_length_list):
        if not tf_total_c:
            continue  # zero-length document: row stays all zeros
        doc_key = "doc" + str(i)  # built once per document, not once per word
        for col, posting in enumerate(postings):
            word_count_in_doc = posting.get(doc_key, 0)
            if word_count_in_doc:
                tf_doc_array[i, col] = word_count_in_doc / tf_total_c

    # idf, shared by the claim and the docs (computed once, after all words are known)
    idf_array = np.array([math.log(n_docs / (len(posting) + 1)) for posting in postings])

    # tf-idf
    tf_idf_claims_array = tf_claims_array * idf_array
    tf_doc_array *= idf_array  # in place: avoids a second (n_docs x n_terms) array
    tf_idf_docs_array = tf_doc_array

    # cosine similarity for all documents at once; rows with a zero denominator stay 0.0
    denominators = norm(tf_idf_claims_array) * norm(tf_idf_docs_array, axis=1)
    cos_sim = np.zeros(n_docs)
    np.divide(
        dot(tf_idf_docs_array, tf_idf_claims_array),
        denominators,
        out=cos_sim,
        where=denominators != 0,
    )
    return cos_sim.tolist()

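# # use the function for a single claim
# # (the inverted index and the doc lengths are taken from the default arguments;
# #  the wiki table must NOT be passed as the second argument)
# word_list = claimProcess(claims_10[5]['claim'])
# docs = wikipage
# cos_sim = computeCosSimilarity(word_list)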

In [7]:
# # get the top 5 doc id for a single claim
def getTop5(cos_sim, docs):
    """
    Return the positions of the five highest similarity scores, best first.

    cos_sim: sequence of scores, one per document.
    docs: accepted for call compatibility; it is not used.

    Fewer than five positions are returned when cos_sim has fewer than five
    entries.
    """
    best_first = np.argsort(cos_sim)[::-1]
    return best_first[:5]

# getTop5(cos_sim, wikipage)

In [9]:
# # find the five most similar documents for ten claim and write them to csv
# for i in range(0,len(claims_10)):
#     docs = wikipage
#     word_list = claimProcess(claims_10[i]['claim'])
#     cos_sim = computeCosSimilarity(word_list)
#     doc_id = getTop5(cos_sim, wikipage)

#     with open('q2.csv', "a", newline='') as config_csv:
#         writer = csv.writer(config_csv)
#         writer.writerow([f"The five most similar documents for Claim {claims_10[i]['id']} \n"])
#         for top_id in doc_id:
#             writer.writerow([str(top_id), str(docs[0][top_id])])
#         writer.writerow(f"\n")

In [24]:
# Find the five most similar documents for each of the ten claims and write
# them to q2.csv: one header row, then one row per claim.
#
# The file is opened once in write mode. The earlier version appended, so every
# re-run of this cell added another header and another set of rows to the file.
list_save = ['claim id', 'doc id_1', 'doc id_2', 'doc id_3', 'doc id_4', 'doc id_5']

with open('q2.csv', "w", newline='') as config_csv:
    writer = csv.writer(config_csv)
    writer.writerow(list_save)

    for claim in tqdm(claims_10):
        word_list = claimProcess(claim['claim'])
        cos_sim = computeCosSimilarity(word_list)
        doc_id = getTop5(cos_sim, wikipage)

        # Column 0 of the wiki table holds the document ids; doc_id contains
        # row positions, hence the positional .iloc lookup.
        writer.writerow([claim['id'], *wikipage[0].iloc[doc_id]])
        # Flush per claim so finished rows survive an interrupted run.
        config_csv.flush()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))