In [1]:
import numpy as np
import pandas as pd
import torch
import nltk

In [102]:
class Tfidf:
    def __init__(self, alpha=1):
        self.alpha = 1

    def fit(self, X, y=None):
        self.corpus = X
        self.n_docs = len(self.corpus)
        self.all_tokens = [token for tokens in self.corpus for token in tokens]
        self.words = set(self.all_tokens)

        self.word_to_idx = {}
        for i, word in enumerate(self.words):
            if word not in self.word_to_idx:
                self.word_to_idx[word] = i

        self.word_doc_freq = {}
        for doc in self.corpus:
            for word in set(doc):
                if word not in self.word_doc_freq:
                    self.word_doc_freq[word] = 1
                else:
                    self.word_doc_freq[word] += 1

        self.n_words = len(self.words)

        self.lookuptable = np.zeros((self.n_words, self.n_docs))

        for j, doc in enumerate(self.corpus):
            for word in set(doc):
                i = self.word_to_idx[word]
                self.lookuptable[i][j] = self.tf(word, doc) * (np.log((self.n_docs + 1) / (self.word_doc_freq[word] + 1)) + 1)

    def tf(self, word, doc):
        doc_dict = {}
        doc_len = len(doc)

        for token in doc:
            if token not in doc_dict:
                doc_dict[token] = 1
            else:
                doc_dict[token] += 1

        if word in doc_dict:
            return doc_dict[word] / doc_len
        else:
            return None

    def get_vector(self, x):
        idx = self.word_to_idx[x]
        return self.lookuptable[idx, :]

In [103]:
docs = [
    "Tom likes basketball!",
    "Tom reads the books",
    "Apple is a common fruit",
    "A fox jumps over a box",
    "He knows Tom well and he also plays basketball"
]

In [104]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [105]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [106]:
import re
import string

In [107]:
from nltk.tokenize import word_tokenize

In [108]:
stop_words = nltk.corpus.stopwords.words("english")
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [109]:
lemmatizer = nltk.stem.WordNetLemmatizer()


In [110]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [111]:
def preprocess_line(line):
    return [lemmatizer.lemmatize(token) for token in word_tokenize(line.strip().lower()) if token not in string.punctuation and token not in stop_words]

In [112]:
preprocessed_docs = [preprocess_line(line) for line in docs]
preprocessed_docs

[['tom', 'like', 'basketball'],
 ['tom', 'read', 'book'],
 ['apple', 'common', 'fruit'],
 ['fox', 'jump', 'box'],
 ['know', 'tom', 'well', 'also', 'play', 'basketball']]

In [113]:
tfidf_vectorizer = Tfidf()

In [114]:
tfidf_vectorizer.fit(preprocessed_docs)

In [115]:
tfidf_vectorizer.lookuptable

array([[0.        , 0.        , 0.        , 0.69953743, 0.        ],
       [0.46848837, 0.46848837, 0.        , 0.        , 0.23424418],
       [0.        , 0.        , 0.        , 0.69953743, 0.        ],
       [0.        , 0.69953743, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.69953743, 0.        , 0.        ],
       [0.        , 0.        , 0.69953743, 0.        , 0.        ],
       [0.56438239, 0.        , 0.        , 0.        , 0.2821912 ],
       [0.        , 0.        , 0.        , 0.        , 0.34976871],
       [0.        , 0.        , 0.        , 0.        , 0.34976871],
       [0.        , 0.        , 0.        , 0.        , 0.34976871],
       [0.        , 0.        , 0.69953743, 0.        , 0.        ],
       [0.        , 0.69953743, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.69953743, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.34976871],
       [0.69953743, 0.        , 0.

In [130]:
tfidf_vectorizer.lookuptable.shape

(15, 5)

In [118]:
tfidf_vectorizer.word_to_idx

{'jump': 0,
 'tom': 1,
 'box': 2,
 'book': 3,
 'apple': 4,
 'fruit': 5,
 'basketball': 6,
 'well': 7,
 'play': 8,
 'also': 9,
 'common': 10,
 'read': 11,
 'fox': 12,
 'know': 13,
 'like': 14}

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [122]:
tfidf_sklearn = TfidfVectorizer()

In [125]:
preprocessed_lines = [' '.join(line) for line in preprocessed_docs]
preprocessed_lines

['tom like basketball',
 'tom read book',
 'apple common fruit',
 'fox jump box',
 'know tom well also play basketball']

In [127]:
tfidf_sklearn_vectors = tfidf_sklearn.fit_transform(preprocessed_lines)
tfidf_sklearn_vectors

<5x15 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [135]:
tfidf_sklearn_vectors.toarray().T

array([[0.        , 0.        , 0.        , 0.        , 0.4428322 ],
       [0.        , 0.        , 0.57735027, 0.        , 0.        ],
       [0.55681615, 0.        , 0.        , 0.        , 0.35727423],
       [0.        , 0.63907044, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.        ],
       [0.        , 0.        , 0.57735027, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.        ],
       [0.        , 0.        , 0.57735027, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.4428322 ],
       [0.69015927, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.4428322 ],
       [0.        , 0.63907044, 0.        , 0.        , 0.        ],
       [0.4622077 , 0.42799292, 0.        , 0.        , 0.29656989],
       [0.        , 0.        , 0.

In [131]:
tfidf_sklearn_vectors.toarray().shape

(5, 15)

In [134]:
tfidf_sklearn.get_feature_names_out()

array(['also', 'apple', 'basketball', 'book', 'box', 'common', 'fox',
       'fruit', 'jump', 'know', 'like', 'play', 'read', 'tom', 'well'],
      dtype=object)