In [223]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import re
import inflect
import string
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [224]:
# Shared helper objects, created once and reused by the functions below.
# inflect engine: convert_number_to_text calls p.number_to_words on digit tokens.
p = inflect.engine()
# Porter stemmer: stem_words calls stemmer.stem on every token.
stemmer = PorterStemmer()
# WordNet lemmatizer: lemmatize_words calls lemmatizer.lemmatize on every token.
lemmatizer = WordNetLemmatizer()

In [225]:
def convert_number_to_text(text):
    """Spell out every whitespace-delimited token that consists only of digits.

    The text is split on whitespace; tokens for which ``str.isdigit()`` is
    true are passed to ``p.number_to_words`` and all other tokens (including
    digits with attached punctuation such as ``"12,"``) are kept as they are.
    The tokens are then re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The rebuilt string with digit-only tokens replaced by words.
    """
    return ' '.join(
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    )

In [226]:
# Demo: the standalone digit tokens '3' and '12' are spelled out; the rest of
# the sentence is returned unchanged.
input_text = 'There are 3 balls in this bag, and 12 in the other one.'
convert_number_to_text(input_text)

'There are three balls in this bag, and twelve in the other one.'

In [227]:
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    removed. Whitespace that surrounded a removed character is kept, so runs
    of spaces may remain inside the result; only leading and trailing
    whitespace is stripped.

    Args:
        text: Input string.

    Returns:
        The text without punctuation characters, stripped at both ends.
    """
    punctuation_chars = set(string.punctuation)
    kept_chars = [char for char in text if char not in punctuation_chars]
    return ''.join(kept_chars).strip()

In [228]:
# Demo: punctuation is stripped, but the spaces around the removed ' !! ' stay,
# leaving a double space in the recorded result.
input_text = 'Hey, did you know that the summer break is coming? Amazing right !! It\'s only 5 more days !!'
remove_punctuation(input_text)

'Hey did you know that the summer break is coming Amazing right  Its only 5 more days'

In [229]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` without arguments ignores it.

    Args:
        text: Input string.

    Returns:
        The whitespace-separated pieces of ``text`` joined by single spaces.
    """
    pieces = text.split()
    return ' '.join(pieces)

In [230]:
# Demo: the leading spaces are dropped and the inner run of spaces collapses
# to a single space.
input_text = '   We don\'t need   the given questions'
remove_whitespace(input_text)

"We don't need the given questions"

In [231]:
def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    The stop-word check is case-insensitive: each token is lower-cased before
    it is looked up, so capitalised stop words (for example a sentence-initial
    "The") are removed too. Tokens that survive keep their original casing.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.

    Returns:
        A list of the tokens that are not English stop words, in their
        original order.
    """
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    # Compare the lower-cased token: the stop-word list is lower-case, so a
    # plain membership test would let "The", "This", "And", ... through.
    return [word for word in word_tokens if word.lower() not in stop_words]

In [232]:
# Demo: punctuation is removed first, then the remaining text is tokenized and
# filtered for stop words.
input_text = 'Alan Walker is a sample sentence and we are going to remove the stopwords from this.'
remove_stopwords(remove_punctuation(input_text))

['Alan', 'Walker', 'sample', 'sentence', 'going', 'remove', 'stopwords']

In [233]:
def stem_words(text):
    """Tokenize ``text`` and return the stem of every token.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.

    Returns:
        A list with one ``stemmer.stem`` result per token, in token order.
    """
    stems = []
    for token in word_tokenize(text):
        stems.append(stemmer.stem(token))
    return stems

In [234]:
# Demo: stemming yields truncated stems rather than dictionary words (the
# recorded result contains 'scienc', 'scientif' and 'mani') and lower-cases
# 'Data'.
input_text = 'Data science uses scientific methods algorithms and many types of processes'
stem_words(input_text)

['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'type',
 'of',
 'process']

In [235]:
def lemmatize_words(text, pos='v'):
    """Tokenize ``text`` and lemmatize every token with a single POS code.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.
        pos: WordNet part-of-speech code handed to ``lemmatizer.lemmatize``
            for every token. Defaults to ``'v'`` (verb), which is what this
            function always used before the parameter existed; pass e.g.
            ``'n'`` to lemmatize the tokens as nouns instead.

    Returns:
        A list with one lemma per token, in token order.
    """
    word_tokens = word_tokenize(text)
    # The same POS code is applied to all tokens; no per-token tagging is done.
    lemmas = [lemmatizer.lemmatize(word, pos=pos) for word in word_tokens]
    return lemmas

In [236]:
# Demo: every token is lemmatized as a verb, so 'uses' becomes 'use' while the
# plural nouns 'methods' and 'algorithms' come back unchanged in the recorded
# result.
input_text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize_words(input_text)

['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']

In [237]:
def pos_tagging(text):
    """Tokenize ``text`` and tag each token with its part of speech.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.

    Returns:
        Whatever ``pos_tag`` returns for the token list; the recorded demo
        output shows a list of ``(token, tag)`` tuples.
    """
    return pos_tag(word_tokenize(text))

In [238]:
# Demo: each token is paired with a part-of-speech tag; note that the recorded
# result tags 'algorithms' as 'RB'.
input_text = 'data science uses scientific methods algorithms and many types of processes'
pos_tagging(input_text)

[('data', 'NNS'),
 ('science', 'NN'),
 ('uses', 'VBZ'),
 ('scientific', 'JJ'),
 ('methods', 'NNS'),
 ('algorithms', 'RB'),
 ('and', 'CC'),
 ('many', 'JJ'),
 ('types', 'NNS'),
 ('of', 'IN'),
 ('processes', 'NNS')]

In [239]:
# Toy corpus used for the hand-computed TF-IDF below.
corpus = [
    'John has some cats',
    'Cats eat fishs',
    'I eat a big fish',
]

# One token list per document: lower-cased, stop words removed, then lemmatized.
normalization = []
for raw_document in corpus:
    lowered = raw_document.lower()
    print(lowered)
    kept_tokens = remove_stopwords(lowered)
    # lemmatize_words expects a string, so the kept tokens are re-joined first.
    normalization.append(lemmatize_words(' '.join(kept_tokens)))
normalization

john has some cats
cats eat fishs
i eat a big fish


[['john', 'cat'], ['cat', 'eat', 'fish'], ['eat', 'big', 'fish']]

In [240]:
# Vocabulary: every distinct token across the normalized documents, sorted.
words = sorted({token for doc_tokens in normalization for token in doc_tokens})
print(words)

['big', 'cat', 'eat', 'fish', 'john']


In [241]:
# Term frequency: occurrences of each vocabulary term in a document divided by
# that document's token count. Built term-by-term, then transposed so that
# rows are documents and columns follow the order of `words`.
tf = []

for term in words:
    term_row = []
    for doc_tokens in normalization:
        if doc_tokens:
            term_row.append(doc_tokens.count(term) / len(doc_tokens))
        else:
            # A document with no tokens left after normalization has no
            # terms; use 0.0 instead of dividing by a length of zero.
            term_row.append(0.0)
    tf.append(term_row)

tf = np.array(tf).T
print(tf)

[[0.         0.5        0.         0.         0.5       ]
 [0.         0.33333333 0.33333333 0.33333333 0.        ]
 [0.33333333 0.         0.33333333 0.33333333 0.        ]]


In [242]:
# Document frequency: for each vocabulary term, the number of normalized
# documents that contain it at least once (same order as `words`).
sentences = [
    sum(1 for doc_tokens in normalization if term in doc_tokens)
    for term in words
]

sentences

[1, 2, 2, 2, 1]

In [243]:
# Smoothed inverse document frequency, one value per vocabulary term:
# ln((N + 1) / (df + 1)) + 1, with N documents and document frequency df.
idf = [
    math.log((len(normalization) + 1) / (doc_freq + 1)) + 1
    for doc_freq in sentences
]
idf

[1.6931471805599454,
 1.2876820724517808,
 1.2876820724517808,
 1.2876820724517808,
 1.6931471805599454]

In [244]:
# TF-IDF: scale each document's term frequencies by the matching IDF value.
# Kept as a list of lists because the next cell rescales the rows in place.
tf_idf = [
    [doc_tf[col] * idf[col] for col in range(len(idf))]
    for doc_tf in tf
]
tf_idf

[[0.0, 0.6438410362258904, 0.0, 0.0, 0.8465735902799727],
 [0.0, 0.42922735748392693, 0.42922735748392693, 0.42922735748392693, 0.0],
 [0.5643823935199818, 0.0, 0.42922735748392693, 0.42922735748392693, 0.0]]

In [245]:
# L2-normalize every document vector in place so that each row of `tf_idf`
# has Euclidean length 1.
for row in tf_idf:
    # Compute the row's length once instead of once per element.
    norm = math.sqrt(sum(weight ** 2 for weight in row))
    if norm == 0:
        # An all-zero row cannot be scaled; leave it as is rather than
        # dividing by zero.
        continue
    for col in range(len(row)):
        row[col] /= norm
print(words)
print(tf_idf)

['big', 'cat', 'eat', 'fish', 'john']
[[0.0, 0.6053485081062916, 0.0, 0.0, 0.7959605415681652], [0.0, 0.5773502691896257, 0.5773502691896257, 0.5773502691896257, 0.0], [0.680918560398684, 0.0, 0.5178561161676974, 0.5178561161676974, 0.0]]


In [246]:


# Cross-check against scikit-learn. The vectorizer is fitted on the raw
# `corpus` strings (not on `normalization`), using its own English stop-word
# list; the recorded feature names include 'cats' and 'fishs', so its
# vocabulary differs from the hand-built `words` list.
tfidfvectorizer = TfidfVectorizer(stop_words='english')

x = tfidfvectorizer.fit_transform(corpus)
print(tfidfvectorizer.get_feature_names_out())
print(x)

['big' 'cats' 'eat' 'fish' 'fishs' 'john']
  (0, 1)	0.6053485081062916
  (0, 5)	0.7959605415681652
  (1, 4)	0.680918560398684
  (1, 2)	0.5178561161676974
  (1, 1)	0.5178561161676974
  (2, 3)	0.6227660078332259
  (2, 0)	0.6227660078332259
  (2, 2)	0.4736296010332684
