## 資料檢索與文字探勘 R12725049-pa2

In [1]:
from platform import python_version

print(python_version())

3.11.4


### Construct a dictionary based on the terms extracted from the given documents.


In [2]:
import os
import math

# 資料夾路徑
folder_path = r'./data'

# 存儲文檔內容的列表
document_contents = []

# 列出資料夾中的所有文件
for filename in os.listdir(folder_path):
    # 文件路徑
    file_path = os.path.join(folder_path, filename)
    
    if os.path.isfile(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            document_contents.append(file.read())

In [3]:
print(document_contents[0])

the white house is also keeping a close watch on yugoslavia, where 
opposition forces are about to step up the pressure on president slobodan 
milosevic. but will it work? nbc's jim maceda is in belgrade tonight. 
serbia on the eve of a general strike. this two-hour roadblock is 
just a taste of what will come tomorrow, says the opposition, when 
a nationwide work stoppage is to begin. the purpose, to force out 
the man they say stole the presidential election, slobodan milosevic. 
the people don't accept to be victim and to be hostage of one person, 
milosevic. time is working against him now. but will the pressure 
work? four years ago, hundreds of thousands of serbs marched against 
milosevic calling for a general strike as well. the protest lasted 
three months, but milosevic survived. that's the real question. does 
the opposition have the power to confront the regime with its combined 
wealth to get rid of it? a case in point. these coal mine, the very 
heart of serbia's economy,

In [4]:
def tokenize_text(text):
    # empty List 用以儲存Tokens
    tokens = []
    
    # empty String 用以儲存單字
    current_token = ""
    
    separators = [' ', '.', ',', '!', '?', ';', ':', '_', '\"', '(', ')', '\'', '`','@','$','%','&','*','{','/']
    
    # 追蹤每個字母
    for char in text:
        # 如果字母是空格或標點符號，並且current string is not empty，則將其添加到tokens List
        if char.isspace()  or char in separators:
            if current_token:
                tokens.append(current_token)
                current_token = ""
        else:
            # 如果字母不是空格或標點符號，則將其添加到current token
            current_token += char
    
    # 將最後一個word添加到tokens列表中
    if current_token:
        tokens.append(current_token)
    
    return tokens

In [5]:
# 讀取stopwords
stopwords_file = open("NLTK's list of english stopwords.txt", "r")
stopwords = stopwords_file.read()
stopwords_list = stopwords.splitlines()
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [6]:
# import package
from nltk.stem.porter import PorterStemmer
import nltk

# stemmer
ps=PorterStemmer()

In [7]:
# Empty List for storing processed document
processed_texts = []

for document_content in document_contents:
    result = []
    # lower casting
    document_content = document_content.lower()
    # Tokenized
    tokenized_content = tokenize_text(document_content)
    # Stopwords removal
    filtered_tokens = [token for token in tokenized_content if token not in stopwords_list]
    # Stemming
    for t in filtered_tokens:  
        result.append(ps.stem(t))
    # 存入List
    processed_texts.append(result)

# 以文件一做測試
print(processed_texts[0])

['white', 'hous', 'also', 'keep', 'close', 'watch', 'yugoslavia', 'opposit', 'forc', 'step', 'pressur', 'presid', 'slobodan', 'milosev', 'work', 'nbc', 'jim', 'maceda', 'belgrad', 'tonight', 'serbia', 'eve', 'gener', 'strike', 'two-hour', 'roadblock', 'tast', 'come', 'tomorrow', 'say', 'opposit', 'nationwid', 'work', 'stoppag', 'begin', 'purpos', 'forc', 'man', 'say', 'stole', 'presidenti', 'elect', 'slobodan', 'milosev', 'peopl', 'accept', 'victim', 'hostag', 'one', 'person', 'milosev', 'time', 'work', 'pressur', 'work', 'four', 'year', 'ago', 'hundr', 'thousand', 'serb', 'march', 'milosev', 'call', 'gener', 'strike', 'well', 'protest', 'last', 'three', 'month', 'milosev', 'surviv', 'real', 'question', 'opposit', 'power', 'confront', 'regim', 'combin', 'wealth', 'get', 'rid', 'case', 'point', 'coal', 'mine', 'heart', 'serbia', 'economi', 'strike', 'hand', 'worker', 'guard', 'equip', 'week', 'electr', 'reserv', 'engin', 'worri', 'blackout', 're', 'make', 'gestur', 'tri', 'make', 'regim

In [8]:
# Empty list 用以儲存每個單字的index,term,frequency
term_info_list = []

# Empty dict for storing term_document_frequency
term_document_frequency = {}

# Loop start
for words in processed_texts:
    # 避免重複計算
    unique_words = set(words)
    
    # 更新Frequency
    for word in unique_words:
        if word in term_document_frequency:
            term_document_frequency[word] += 1
        else:
            term_document_frequency[word] = 1

# 按照字母順序排列字典
sorted_terms = sorted(term_document_frequency.items(), key=lambda x: x[0])

# 紀錄 index並存入 term_info_list 中
for index, (term, frequency) in enumerate(sorted_terms, start=1):
    term_info = {'index': index, 'term': term, 'frequency': frequency}
    term_info_list.append(term_info)

In [15]:
# 儲存字典為txt
output_file = "dictionary.txt"

with open(output_file, "w", encoding="utf-8") as file:
    # Header
    file.write("t_index   term                     df\n")
    
    for term_info in term_info_list:
        line = "{:<8}  {:<25}  {}\n".format(term_info['index'], term_info['term'], term_info['frequency'])
        file.write(line)

print(f"字典已保存到 {output_file}")

字典已保存到 dictionary.txt


### Transfer each document into a tf-idf unit vector.

In [10]:
# Empty List for storing each document
document_term_frequencies = []

# Loop start
for words in processed_texts:
    # Empty dict for storing document_term_frequency
    document_term_frequency = {}
    
    # 更新frequency
    for word in words:
        if word in document_term_frequency:
            document_term_frequency[word] += 1
        else:
            document_term_frequency[word] = 1
    
    # 存入個文件的 List
    document_term_frequencies.append(document_term_frequency)

# 計算文件總數
total_documents = len(processed_texts)

tfidf_vectors = []

for document_term_frequency in document_term_frequencies:
    tfidf_vector = []
            
    for word, term_frequency in document_term_frequency.items():
        # 計算 TF
        tf = term_frequency / sum(document_term_frequency.values())
        
        # 計算 IDF
        idf = math.log10(total_documents / term_document_frequency[word])
        
        # 計算 TF-IDF
        tfidf = tf * idf
        
        # 獲取 term_index
        term_index = next((item['index'] for item in term_info_list if item['term'] == word), None)
        
        # Add term_index and TF-IDF 值到向量
        tfidf_info = {'index': term_index, 'tf-idf': tfidf, 'tf' : tf, 'idf' : idf}
        tfidf_vector.append(tfidf_info)
    
    tfidf_vectors.append(tfidf_vector)

In [11]:
# 以隨機文件做測試
print(tfidf_vectors[597])

[{'index': 14237, 'tf-idf': 0.0009284568811649149, 'tf': 0.002638522427440633, 'idf': 0.35188515796150277}, {'index': 10994, 'tf-idf': 0.038085491179681734, 'tf': 0.044854881266490766, 'idf': 0.8490824210058456}, {'index': 3164, 'tf-idf': 0.0037924911025017803, 'tf': 0.002638522427440633, 'idf': 1.4373541278481747}, {'index': 9759, 'tf-idf': 0.014946330646736007, 'tf': 0.010554089709762533, 'idf': 1.4161648287782367}, {'index': 9526, 'tf-idf': 0.006760667188539511, 'tf': 0.002638522427440633, 'idf': 2.5622928644564746}, {'index': 11971, 'tf-idf': 0.009870889944679816, 'tf': 0.0079155672823219, 'idf': 1.2470224296778833}, {'index': 10152, 'tf-idf': 0.0056367391350506425, 'tf': 0.002638522427440633, 'idf': 2.1363241321841935}, {'index': 12706, 'tf-idf': 0.0026229317688376773, 'tf': 0.002638522427440633, 'idf': 0.9940911403894798}, {'index': 3889, 'tf-idf': 0.006760667188539511, 'tf': 0.002638522427440633, 'idf': 2.5622928644564746}, {'index': 4599, 'tf-idf': 0.005080397801765964, 'tf': 0

In [12]:
output_folder = "output"
os.makedirs(output_folder, exist_ok=True)

# 將每個文件的 tf-idf 儲存
for document_index, words in enumerate(processed_texts, start=1):
    # 文件名為 Docid.txt
    document_filename = f"{document_index}.txt"
    
    with open(os.path.join(output_folder, document_filename), "w", encoding="utf-8") as file:
        # Header
        file.write("t_index   tf-idf\n")
    
        for tfidf_info in tfidf_vectors[document_index - 1]:
            line = "{:<8}  {}\n".format(tfidf_info['index'], tfidf_info['tf-idf'])
            file.write(line)

### Write a function cosine(Docx, Docy) which loads the tf-idf vectors of documents x and y and returns their cosine similarity.

In [13]:
def cosine(vector_x, vector_y):
    # 計算向量內積
    dot_product = sum(x['tf-idf'] * y['tf-idf'] for x, y in zip(vector_x, vector_y))
    
    # 計算向量大小
    magnitude_x = math.sqrt(sum(x['tf-idf']**2 for x in vector_x))
    magnitude_y = math.sqrt(sum(y['tf-idf']**2 for y in vector_y))
    
    # Cosine similarity
    similarity = dot_product / (magnitude_x * magnitude_y)
    
    return similarity

def unit_vector(vector):
    # 計算向量大小
    magnitude = math.sqrt(sum(x['tf-idf']**2 for x in vector))
    
    # 計算單位向量
    unit_vector = [{'tf-idf': x['tf-idf'] / magnitude} for x in vector]
    
    return unit_vector

In [18]:
# 計算兩文件各自的單位向量
unit_vector_x = unit_vector(tfidf_vectors[154])
unit_vector_y = unit_vector(tfidf_vectors[58])

# 計算cosine similarity
similarity = cosine(unit_vector_x, unit_vector_y)

print(f"文件 x 和文件 y 的cosine similarity: {similarity}")

文件 x 和文件 y 的cosine similarity: 0.248133157335619
