# import library

In [7]:
from nltk.stem.porter import PorterStemmer
import os
import math
import pandas as pd
import numpy as np
import pickle

# read data

In [10]:
# with open("science_X.pkl", "rb") as f:
#     science_x = pickle.load(f)
# with open("science_y.pkl", "rb") as f:
#     science_y = pickle.load(f)
with open("social_X.pkl", "rb") as f:
    social_x = pickle.load(f)
with open("social_y.pkl", "rb") as f:
    social_y = pickle.load(f)

In [12]:
social_x

[['這項',
  '制度',
  '分成',
  '『',
  '禮',
  '』',
  '和',
  '『',
  '樂',
  '』',
  '兩個',
  '部分',
  '前者',
  '主要',
  '從',
  '形式',
  '上',
  '規範人',
  '的',
  '行為',
  '舉止',
  '指定',
  '每種',
  '身分',
  '應',
  '履行',
  '的',
  '禮儀',
  '和',
  '義務',
  '最終',
  '形成',
  '等級',
  '制度',
  '後者則',
  '是',
  '經由',
  '制定',
  '典範的詩',
  '歌曲',
  '目和舉行',
  '集體',
  '奏樂',
  '等',
  '活動',
  '方式',
  '締造統',
  '一和諧',
  '的',
  '社會',
  '氛圍',
  '請問',
  '以上',
  '所',
  '描述',
  '的',
  '是',
  '何種',
  '制度',
  '封建制度',
  '禮樂',
  '制度',
  '推舉',
  '孝廉',
  '九品',
  '官人',
  '之',
  '法'],
 ['春秋',
  '時期',
  '的',
  '哲人',
  '孔子',
  '曾經',
  '在',
  '回答',
  '學生',
  '的',
  '問題',
  '時',
  '說',
  '克己',
  '復',
  '禮為',
  '仁',
  '一日',
  '克己',
  '復',
  '禮',
  '天下',
  '歸仁',
  '焉',
  '其',
  '大意',
  '是',
  '指',
  '只要',
  '能',
  '克制',
  '自己',
  '的',
  '私欲',
  '使',
  '言行',
  '舉止',
  '合乎',
  '禮節',
  '就是',
  '實踐',
  '仁道',
  '的',
  '善政由',
  '這點',
  '來',
  '看',
  '孔子',
  '其實',
  '是',
  '希望',
  '恢',
  '復',
  '周朝',
  '固有',
  '的',
  '何種',
  '制度',
  '禮樂',
  '制度',
  '郡',
 

# define data to token

In [38]:
def dataPreProcessing(path, stopWords):
    
    # read data
    f = open(path, "r")
    words = f.read()
    
    # tokens to be save(tokenization)
    tokens_byWordTokenize = []

    # string to be remembered
    word = ''

    # delete line breaks
    words = words.replace('\n', '')

    # loop that judge where the word should be cut
    for letter in words:
        if letter in " .,!?'":
            if word != '':
                tokens_byWordTokenize.append(word)
                word = ''
        else:
            word += letter
    
    # define punclist
    puncList = "'" + '"#$&()%:;-*/`_@{}0123456789'
    
    for i in range(0, len(tokens_byWordTokenize)):
        for puncWord in puncList:
            tokens_byWordTokenize[i] = tokens_byWordTokenize[i].replace(puncWord, '')
            
    # delete empty token
    tokens_byWordTokenize = filter(lambda a: a != '', tokens_byWordTokenize)
        
    # convert all letters to lowercase
    tokens_byLower = list(map(lambda x:x.lower(), tokens_byWordTokenize))
    
    # using PorterStemmer() supplied by nltk 
    stemmer = PorterStemmer()
    
    # convert all letters by stemmer
    tokens_stemmed = list(map(lambda x:stemmer.stem(x), tokens_byLower))
    
    # reserve tokens those aren't not stopwords
    tokens_withoutStopwords = [token for token in tokens_stemmed if token not in stopWords]
    
    return tokens_withoutStopwords

# read data

In [39]:
dataDir = "data"
articles = []
txtList = []
for txtDoc in os.listdir(dataDir):
    path = dataDir + "/" + txtDoc
    txtList.append(txtDoc)
    articles.append(dataPreProcessing(path, stopWords))

# calculate df and record dictionary

In [40]:
# initialize dictionary
dictionary = {}

# for each article
for i in range(len(articles)):
    
    # initialize term list for article
    termList = articles[i]
    
    # initiate bool dictionary
    boolDictionary = dictionary.copy()
    for term, val in boolDictionary.items():
        if val:
            boolDictionary[term] = False
    
    # each term
    for term in termList:
        if term not in dictionary:
            dictionary[term] = 1
            boolDictionary[term] = True
        else:
            if boolDictionary[term] == False:
                dictionary[term] += 1
                boolDictionary[term] = True

# save dictionary.txt

In [41]:
# save to dictionary.txt
f1 = open('dictionary.txt', 'w')
f1.write("{:<15} {:<15} {:<15}\n".format("t_index", "term", "df"))
indexDictionary = {}
count = 0
for k in sorted(dictionary.keys()):
    count += 1
    f1.write("{:<15} {:<15} {:<15}\n".format(count, k, dictionary[k]))
    indexDictionary[k] = count
    
f1.close()

# count tf

In [42]:
dictList = []

for i in range(len(articles)):
    
    artDictionary = {}
    termList = articles[i]
    for term in termList:
        if term not in artDictionary:
            artDictionary[term] = 1
        else:
            artDictionary[term] += 1
    
    dictList.append(artDictionary)

# export vector file

In [43]:
number = len(dictList)
for i in range(number):
    
    tfidfDictionary = {}
    tfDictionary = dictList[i]
    for k in tfDictionary.keys():
        tf = tfDictionary[k]
        df = dictionary[k]
        idf = math.log10(number/df)
        tfidfDictionary[k] = tf*idf
    
    v = float(np.linalg.norm(list(tfDictionary.values())))
    for k in tfDictionary.keys():
        tfidfDictionary[k] = round(tfidfDictionary[k]/v, 3)
    
    name = "output/" + txtList[i]
    f = open(name, 'w')
    f.write("{:<15}\n".format(len(tfidfDictionary)))
    f.write("{:<15} {:<15}\n".format("t_index", "tf-idf"))
    for k in sorted(tfidfDictionary.keys()):
        f.write("{:<15} {:<15}\n".format(indexDictionary[k], tfidfDictionary[k]))
    f.close()

# define cosine function

In [44]:
def cosine(Docx, Docy):
    doc1 = pd.read_table(Docx, header = None, sep='\s+', skiprows=[0,1])
    doc2 = pd.read_table(Docy, header = None, sep='\s+', skiprows=[0,1])
    v1 = np.array(doc1[1])
    v2 = np.array(doc2[1])
    y1 = float(np.linalg.norm(v1, keepdims=True))
    y2 = float(np.linalg.norm(v2, keepdims=True))
    dic1 = {}
    dic2 = {}
    for i in range(0, len(doc1)):
        dic1[str(doc1[0][i])] = doc1[1][i]
    for i in range(0, len(doc2)):
        dic2[str(doc2[0][i])] = doc2[1][i]
    summ = 0
    for k in dic1.keys():
        if k in dic2.keys():
            summ += dic1[k]*dic2[k]
    summ = summ / (y1*y2)
    
    return round(summ, 3)

# cosine similarity between document 1 and 2

In [45]:
Doc1 = "output/1.txt"
Doc2 = "output/2.txt"
print(cosine(Doc1, Doc2))

0.2
