In [1]:
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Read a data file
def read_file(filename):
    """Return the full text of a file stored in the ``Data`` folder.

    The folder is resolved relative to the current working directory
    (``<cwd>/Data``).

    Args:
        filename: Name of the file inside the ``Data`` folder.

    Returns:
        str: The file contents, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Join path components with os.path.join so the separator matches the
    # host OS; a hard-coded "\\Data" suffix only resolves on Windows.
    file_store = os.path.join(os.getcwd(), "Data")
    path = os.path.join(file_store, filename)
    with open(path, 'r', encoding="utf-8") as f:
        return f.read()

# Wrap a matrix in a labelled DataFrame
def create_dataframe(matrix, tokens):
    """Wrap a 2-D matrix in a DataFrame with ``doc_N`` row labels.

    Args:
        matrix: Row-iterable 2-D data; each row becomes one DataFrame row.
        tokens: Column labels, one per matrix column.

    Returns:
        pandas.DataFrame: ``matrix`` indexed by ``doc_1``, ``doc_2``, ...
        with ``tokens`` as the column labels.
    """
    # Number the rows from 1 so labels read doc_1, doc_2, ...
    row_labels = []
    for position, _row in enumerate(matrix, start=1):
        row_labels.append(f'doc_{position}')
    return pd.DataFrame(data=matrix, index=row_labels, columns=tokens)
    
# Preprocess the text
def Text_Preprocessing(doc):
    """Normalize raw English text into a space-separated string of lemmas.

    Steps, in order: lowercase, delete punctuation, replace digit runs with
    a space, tokenize, drop English stopwords, lemmatize each remaining token.

    Args:
        doc (str): Raw document text.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    text_pre = doc.lower()
    # NOTE(review): punctuation is deleted rather than replaced by a space,
    # so the words on either side fuse ("age-verification" -> "ageverification").
    text_pre = re.sub(r'[^\w\s]', '', text_pre)
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    text_pre = re.sub(r"\d+", " ", text_pre)

    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text_pre)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [3]:
# Load the raw corpus; each file holds text crawled from the URL noted beside it.
doc_1, doc_2, doc_3 = (
    read_file("doc1.txt"),  # https://www.nytimes.com/2023/03/31/technology/chatgpt-italy-ban.html
    read_file("doc2.txt"),  # https://www.abc.net.au/news/2023-04-01/chatgpt-ai-chatbot-blocked-itay-over-privacy-concerns/102175640
    read_file("doc3.txt"),  # https://en.wikipedia.org/wiki/Manchester_United_F.C.
)

# Run every document through the shared preprocessing routine.
doc_1, doc_2, doc_3 = map(Text_Preprocessing, (doc_1, doc_2, doc_3))

# Corpus in document order; the last expression is displayed as the cell output.
data = [doc_1, doc_2, doc_3]
data

['chatgpt banned italy privacy concern action italy data protection agency first known instance chatbots blocked government order italy data protection authority said openai unlawfully collected personal data user ageverification system place prevent minor exposed illicit materialcreditsusan wright new york time artificial intelligence tool chatgpt temporarily banned italy friday first known instance chatbot blocked government order italy data protection authority said openai california company make chatgpt unlawfully collected personal data user ageverification system place prevent minor exposed illicit material italy first government ban chatgpt result privacy concern china north korea russia iran service unavailable openai decided make accessible italy decision sign policy challenge emerging developer cuttingedge ai release chatgpt program dazzled user ability draft essay engage humanlike conversation perform complex task like writing computer code raised alarm spread misinformation

In [4]:
# Build the bag-of-words document-term matrix.
# CountVectorizer produces raw term counts, not TF-IDF weights.
CountVect = CountVectorizer()
vector_matrix = CountVect.fit_transform(raw_documents=data)

# (number of documents, vocabulary size)
vector_matrix.shape

(3, 594)

In [5]:
# Document-term count table: one row per document, one column per vocabulary token.
tokens = CountVect.get_feature_names_out()
doc_term_df = create_dataframe(vector_matrix.toarray(), tokens)
# Only a cell's last expression is rendered automatically, so show this
# table explicitly instead of building it and discarding the result.
display(doc_term_df)

# Compute pairwise cosine similarity between the document count vectors.
cosine_similarity_matrix = cosine_similarity(vector_matrix)
# Reuse the row labels as column labels so the table stays square and
# correctly labelled for any number of documents, not just three.
doc_labels = list(doc_term_df.index)
Similarity = create_dataframe(cosine_similarity_matrix, doc_labels)

Similarity

Unnamed: 0,doc_1,doc_2,doc_3
doc_1,1.0,0.666078,0.046005
doc_2,0.666078,1.0,0.039957
doc_3,0.046005,0.039957,1.0
