In [13]:
sentence ={
    'Python is a high-level, general-purpose programming language.',
    'Its design philosophy emphasizes code readability with the use of significant indentation.',
    'Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small- and large-scale projects.',
    'Python is dynamically-typed and garbage-collected.',
    'It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.',
    'It is often described as a "batteries included" language due to its comprehensive standard library.',
    'Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.',
    'Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support.',
    'Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions.',
    'Python 2 was discontinued with version 2.7.18 in 2020.',
    'Python consistently ranks as one of the most popular programming languages',
}

In [14]:
from nltk import FreqDist
import numpy as np
import re

def buildDict(docs):
    doc_tokens = []     # python list
    for doc in docs:
        delim = re.compile(r'[\s,.]+')
        tokens = delim.split(doc.lower()) 
        if tokens[-1] == '' :   tokens = tokens[:-1] 
        doc_tokens.append(tokens)

    vocab = FreqDist(np.hstack(doc_tokens))
    vocab = vocab.most_common()
    word_to_id = {word[0] : id for id, word in enumerate(vocab)}
    id_to_word = {id : word[0] for id, word in enumerate(vocab)}
    return doc_tokens, vocab, word_to_id, id_to_word

In [15]:
from collections import Counter
import math
import numpy as np

def TFIDF(doc_tokens, id_to_word):
    tf_vectors = []
    idf = {}

    #TF 구하기
    for doc in doc_tokens:
        vec = [0.0 for _ in range((len(id_to_word)))]
        word_count = Counter(doc)
        for key, value in word_count.items():
            vec[word_to_id[key]] = value
            #vec[word_to_id[key]] = 1+ math.log2(value) #tf계산
        tf_vectors.append(vec)
    
    #IDF 구하기
    for id, _ in id_to_word.items():
        idf[id] = 0.0
        for doc in tf_vectors:
            if doc[id] > 0:
                idf[id] += 1
    N = len(tf_vectors)            
    idf = {id : np.log((N+1)/(val+1))+1 for id, val in idf.items()}

    #TF-IDF 구하기
    idf_list = [val for _, val in idf.items()]
    tfidf = np.array([np.multiply(tf, idf_list) for tf in tf_vectors])

    return tf_vectors, idf, tfidf

In [17]:
def cosine_similarity(tfidf):
    # 코사인 유사도 계산
    cos_sim = np.zeros((len(tfidf), len(tfidf)))
    for i in range(len(tfidf)):
        for j in range(len(tfidf)):
            if i == j:
                cos_sim[i][j] = 1.0
            else:
                vec1 = tfidf[i]
                vec2 = tfidf[j]
                dot_product = np.dot(vec1, vec2)
                norm1 = np.linalg.norm(vec1)
                norm2 = np.linalg.norm(vec2)
                if norm1 == 0 or norm2 == 0:
                    cos_sim[i][j] = 0.0
                else:
                    cos_sim[i][j] = dot_product / (norm1 * norm2)
    return cos_sim

In [18]:
doc_tokens, vocab, word_to_id, id_to_word = buildDict(sentence)
tf_vectors, idf, tfidf = TFIDF(doc_tokens, id_to_word)
cos_sim = cosine_similarity(tfidf)


In [19]:
np.set_printoptions(precision=4, suppress=True)
print("Cosine Similarity Matrix:")
print(cos_sim)

Cosine Similarity Matrix:
[[1.     0.1323 0.1072 0.062  0.     0.1065 0.0917 0.     0.     0.0492
  0.0498]
 [0.1323 1.     0.1827 0.1996 0.1541 0.0551 0.1203 0.0403 0.     0.1982
  0.0288]
 [0.1072 0.1827 1.     0.0987 0.1889 0.0853 0.2035 0.0892 0.0701 0.1712
  0.2149]
 [0.062  0.1996 0.0987 1.     0.108  0.1033 0.0497 0.0463 0.     0.0759
  0.1293]
 [0.     0.1541 0.1889 0.108  1.     0.     0.0234 0.1577 0.0476 0.0715
  0.1476]
 [0.1065 0.0551 0.0853 0.1033 0.     1.     0.     0.     0.1    0.1148
  0.0829]
 [0.0917 0.1203 0.2035 0.0497 0.0234 0.     1.     0.0279 0.1398 0.0458
  0.0555]
 [0.     0.0403 0.0892 0.0463 0.1577 0.     0.0279 1.     0.0568 0.
  0.2003]
 [0.     0.     0.0701 0.     0.0476 0.1    0.1398 0.0568 1.     0.0523
  0.    ]
 [0.0492 0.1982 0.1712 0.0759 0.0715 0.1148 0.0458 0.     0.0523 1.
  0.0305]
 [0.0498 0.0288 0.2149 0.1293 0.1476 0.0829 0.0555 0.2003 0.     0.0305
  1.    ]]
