In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet
import string
from sklearn.feature_extraction.text import TfidfVectorizer
#!python -m nltk.downloader averaged_perceptron_tagger

# Sample document
sample_document = "Real madrid is set to win the UCL for the season.Benzema might win Balon dor. Salah might be the runner up."

# Tokenization
tokens = word_tokenize(sample_document)

# POS tagging
pos_tags = nltk.pos_tag(tokens)

# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]

# Stemming
porter = PorterStemmer()
stemmed_tokens = [porter.stem(word) for word in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# Calculate Term Frequency (TF)
tf = FreqDist(lemmatized_tokens)

# Calculate Inverse Document Frequency (IDF)
corpus = [sample_document]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
idf = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

print("Tokenization:", tokens)
print("\nPOS Tagging:", pos_tags)
print("\nStop words removal:", filtered_tokens)
print("\nStemming:", stemmed_tokens)
print("\nLemmatization:", lemmatized_tokens)
print("\nTF:", tf)
print("\nIDF:", idf)


Tokenization: ['Real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'UCL', 'for', 'the', 'season.Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'be', 'the', 'runner', 'up', '.']

POS Tagging: [('Real', 'JJ'), ('madrid', 'NN'), ('is', 'VBZ'), ('set', 'VBN'), ('to', 'TO'), ('win', 'VB'), ('the', 'DT'), ('UCL', 'NNP'), ('for', 'IN'), ('the', 'DT'), ('season.Benzema', 'NN'), ('might', 'MD'), ('win', 'VB'), ('Balon', 'NNP'), ('dor', 'NN'), ('.', '.'), ('Salah', 'NNP'), ('might', 'MD'), ('be', 'VB'), ('the', 'DT'), ('runner', 'NN'), ('up', 'RP'), ('.', '.')]

Stop words removal: ['Real', 'madrid', 'set', 'win', 'UCL', 'might', 'win', 'Balon', 'dor', 'Salah', 'might', 'runner']

Stemming: ['real', 'madrid', 'set', 'win', 'ucl', 'might', 'win', 'balon', 'dor', 'salah', 'might', 'runner']

Lemmatization: ['Real', 'madrid', 'set', 'win', 'UCL', 'might', 'win', 'Balon', 'dor', 'Salah', 'might', 'runner']

TF: <FreqDist with 10 samples and 12 outcomes>

IDF: {'balon': 1.0, 'be': 1.

In [16]:
tokens

['Real',
 'madrid',
 'is',
 'set',
 'to',
 'win',
 'the',
 'UCL',
 'for',
 'the',
 'season.Benzema',
 'might',
 'win',
 'Balon',
 'dor',
 '.',
 'Salah',
 'might',
 'be',
 'the',
 'runner',
 'up',
 '.']

In [17]:
pos_tags

[('Real', 'JJ'),
 ('madrid', 'NN'),
 ('is', 'VBZ'),
 ('set', 'VBN'),
 ('to', 'TO'),
 ('win', 'VB'),
 ('the', 'DT'),
 ('UCL', 'NNP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('season.Benzema', 'NN'),
 ('might', 'MD'),
 ('win', 'VB'),
 ('Balon', 'NNP'),
 ('dor', 'NN'),
 ('.', '.'),
 ('Salah', 'NNP'),
 ('might', 'MD'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('runner', 'NN'),
 ('up', 'RP'),
 ('.', '.')]

In [19]:
stemmed_tokens

['real',
 'madrid',
 'set',
 'win',
 'ucl',
 'might',
 'win',
 'balon',
 'dor',
 'salah',
 'might',
 'runner']

In [20]:
lemmatized_tokens

['Real',
 'madrid',
 'set',
 'win',
 'UCL',
 'might',
 'win',
 'Balon',
 'dor',
 'Salah',
 'might',
 'runner']

In [21]:
tf

FreqDist({'win': 2, 'might': 2, 'Real': 1, 'madrid': 1, 'set': 1, 'UCL': 1, 'Balon': 1, 'dor': 1, 'Salah': 1, 'runner': 1})

In [22]:
idf

{'balon': 1.0,
 'be': 1.0,
 'benzema': 1.0,
 'dor': 1.0,
 'for': 1.0,
 'is': 1.0,
 'madrid': 1.0,
 'might': 1.0,
 'real': 1.0,
 'runner': 1.0,
 'salah': 1.0,
 'season': 1.0,
 'set': 1.0,
 'the': 1.0,
 'to': 1.0,
 'ucl': 1.0,
 'up': 1.0,
 'win': 1.0}