1. Load the dataset into a Pandas DataFrame and extract the text and label columns.

In [102]:
# First column contains review, second column contains the label(Positive, negative, neutral).

print("Load the given CSV file containing text and label columns into a Pandas DataFrame.")

import pandas as pd

# https://www.kaggle.com/code/akanksha10/sentiment-analysis-dataset/input
df = pd.read_csv('test.csv', encoding='latin1')
 
df = df.drop(columns=[col for col in df.columns if col not in ['text', 'sentiment']])

df = df.dropna()
df = df.drop(df.index[100:])
df = df.rename(columns={'sentiment':'label'})

Load the given CSV file containing text and label columns into a Pandas DataFrame.


2. Perform tokenization on all documents and store the tokens corresponding to each document.

In [103]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab', quiet=True)

df['tokens'] = df['text'].apply(lambda x: word_tokenize(str(x).lower()))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.isalpha()])

print(f"Tokenization completed for {len(df)} documents")
print(f"Original text: {df['text'].iloc[0]}")
print(f"Tokens: {df['tokens'].iloc[0]}")

Tokenization completed for 100 documents
Original text: Last session of the day  http://twitpic.com/67ezh
Tokens: ['last', 'session', 'of', 'the', 'day', 'http']


3. Apply case folding by converting all tokens to lowercase.

In [104]:
for i in range(3):
    print(f"Example - Document {i+1}:")
    print(f"Original text: {df['text'].iloc[i]}")
    print(f"Tokens: {df['tokens'].iloc[i]}\n")
print("NLTK Tokenization ensures that the tokens are already lowercase")

Example - Document 1:
Original text: Last session of the day  http://twitpic.com/67ezh
Tokens: ['last', 'session', 'of', 'the', 'day', 'http']

Example - Document 2:
Original text:  Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China:  (SH)  (BJ).
Tokens: ['shanghai', 'is', 'also', 'really', 'exciting', 'precisely', 'skyscrapers', 'galore', 'good', 'tweeps', 'in', 'china', 'sh', 'bj']

Example - Document 3:
Original text: Recession hit Veronique Branquinho, she has to quit her company, such a shame!
Tokens: ['recession', 'hit', 'veronique', 'branquinho', 'she', 'has', 'to', 'quit', 'her', 'company', 'such', 'a', 'shame']

NLTK Tokenization ensures that the tokens are already lowercase


4. Remove stop-words from the tokenized documents.

In [105]:
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords', quiet=True)

# Load English stop-words
stop_words = set(stopwords.words('english'))

df['tokens_without_stopwords'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
print(df.head())
print(f"Stop-word removal completed.")

                                                text     label  \
0  Last session of the day  http://twitpic.com/67ezh   neutral   
1   Shanghai is also really exciting (precisely -...  positive   
2  Recession hit Veronique Branquinho, she has to...  negative   
3                                        happy bday!  positive   
4             http://twitpic.com/4w75p - I like it!!  positive   

                                              tokens  \
0                [last, session, of, the, day, http]   
1  [shanghai, is, also, really, exciting, precise...   
2  [recession, hit, veronique, branquinho, she, h...   
3                                      [happy, bday]   
4                                [http, i, like, it]   

                            tokens_without_stopwords  
0                         [last, session, day, http]  
1  [shanghai, also, really, exciting, precisely, ...  
2  [recession, hit, veronique, branquinho, quit, ...  
3                                      [happy,

5. Apply stemming to the filtered tokens and store the stemmed tokens separately.
6. Create a new column containing the final preprocessed text after stemming by joining the stemmed tokens.

In [106]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

df['stemmed_tokens'] = df['tokens_without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
print(df.head())

df['stemmed_preprocessed_text'] = df['stemmed_tokens'].apply(lambda x: ' '.join(x))
print(df[['text', 'stemmed_preprocessed_text']].head())

                                                text     label  \
0  Last session of the day  http://twitpic.com/67ezh   neutral   
1   Shanghai is also really exciting (precisely -...  positive   
2  Recession hit Veronique Branquinho, she has to...  negative   
3                                        happy bday!  positive   
4             http://twitpic.com/4w75p - I like it!!  positive   

                                              tokens  \
0                [last, session, of, the, day, http]   
1  [shanghai, is, also, really, exciting, precise...   
2  [recession, hit, veronique, branquinho, she, h...   
3                                      [happy, bday]   
4                                [http, i, like, it]   

                            tokens_without_stopwords  \
0                         [last, session, day, http]   
1  [shanghai, also, really, exciting, precisely, ...   
2  [recession, hit, veronique, branquinho, quit, ...   
3                                      [ha

7. Construct the Bag-of-Words vocabulary using the stemmed text.

In [107]:
token_population_stemmed = [stoken for stokens in df['stemmed_tokens'] for stoken in stokens]

# Create unique vocabulary
vocab = list(set(token_population_stemmed))

print(f"\nVocabulary size: {len(vocab)}")
print(f"Sample BoW tokens: {vocab[:30]}")


Vocabulary size: 435
Sample BoW tokens: ['fun', 'papaya', 'today', 'dumbfac', 'breaki', 'everyon', 'bought', 'world', 'know', 'china', 'registr', 'noon', 'man', 'figur', 'beep', 'loo', 'spend', 'sweet', 'return', 'though', 'branquinho', 'explod', 'yesterday', 'eat', 'alway', 'sweden', 'would', 'rain', 'heard', 'tonight']


8. Determine the size of the stemming-based vocabulary and list the top 10 most frequent words.

In [108]:
from collections import Counter
token_counts = Counter(token_population_stemmed)
print(f"\nVocabulary size: {len(set(token_population_stemmed))}")
print("\nTop 10 most common tokens in Bag of Words:")
for token, freq in token_counts.most_common(10):
    print(f"Token: {token}, Frequency: {freq}")


Vocabulary size: 435

Top 10 most common tokens in Bag of Words:
Token: day, Frequency: 10
Token: go, Frequency: 9
Token: happi, Frequency: 7
Token: like, Frequency: 6
Token: im, Frequency: 6
Token: time, Frequency: 6
Token: get, Frequency: 6
Token: need, Frequency: 6
Token: watch, Frequency: 5
Token: look, Frequency: 5


9. Represent each document using Bag-of-Words frequency vectors and generate the document–term matrix (DTM) for the stemmed text.

In [109]:
from collections import Counter
import numpy as np

# Create unique vocabulary
vocab = list(set(token_population_stemmed))
vocab_dict = {word: idx for idx, word in enumerate(vocab)}

# Vectorize each document based on its own tokens
def vectorize_document(tokens):
    vector = np.zeros(len(vocab))
    token_freq = Counter(tokens)
    for token, count in token_freq.items():
        if token in vocab_dict:
            vector[vocab_dict[token]] = count
    return vector

# Apply vectorization to all documents using stemmed tokens
dtm = np.array([vectorize_document(tokens) for tokens in df['stemmed_tokens']])

print(f"DTM shape: {dtm.shape}")
print(f"(Number of documents: {dtm.shape[0]}, Vocabulary size: {dtm.shape[1]})")
print(f"\nFirst document vector (first 20 dimensions):\n{dtm[0][:20]}")

DTM shape: (100, 435)
(Number of documents: 100, Vocabulary size: 435)

First document vector (first 20 dimensions):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


10. Apply lemmatization to the filtered tokens and store the lemmatized tokens separately.

In [110]:
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

lemmatizer = WordNetLemmatizer()

df['lemmatized_tokens'] = df['tokens_without_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df.head())
print(f"Lemmatization completed.")

                                                text     label  \
0  Last session of the day  http://twitpic.com/67ezh   neutral   
1   Shanghai is also really exciting (precisely -...  positive   
2  Recession hit Veronique Branquinho, she has to...  negative   
3                                        happy bday!  positive   
4             http://twitpic.com/4w75p - I like it!!  positive   

                                              tokens  \
0                [last, session, of, the, day, http]   
1  [shanghai, is, also, really, exciting, precise...   
2  [recession, hit, veronique, branquinho, she, h...   
3                                      [happy, bday]   
4                                [http, i, like, it]   

                            tokens_without_stopwords  \
0                         [last, session, day, http]   
1  [shanghai, also, really, exciting, precisely, ...   
2  [recession, hit, veronique, branquinho, quit, ...   
3                                      [ha

11. Create a new column containing the final preprocessed text after lemmatization by joining the lemmatized tokens.

In [111]:
df['lemmatized_preprocessed_text'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))
print(df[['text', 'lemmatized_preprocessed_text']].head())

                                                text  \
0  Last session of the day  http://twitpic.com/67ezh   
1   Shanghai is also really exciting (precisely -...   
2  Recession hit Veronique Branquinho, she has to...   
3                                        happy bday!   
4             http://twitpic.com/4w75p - I like it!!   

                        lemmatized_preprocessed_text  
0                              last session day http  
1  shanghai also really exciting precisely skyscr...  
2  recession hit veronique branquinho quit compan...  
3                                         happy bday  
4                                          http like  


12. Construct the Bag-of-Words vocabulary using the lemmatized text.

In [112]:
token_population_lemmatized = [ltoken for ltokens in df['lemmatized_tokens'] for ltoken in ltokens]

# Create unique vocabulary
vocab_lemma = list(set(token_population_lemmatized))

print(f"\nLemmatization-based Vocabulary size: {len(vocab_lemma)}")
print(f"Sample BoW tokens: {vocab_lemma[:30]}")


Lemmatization-based Vocabulary size: 453
Sample BoW tokens: ['fun', 'papaya', 'today', 'watched', 'bought', 'sorry', 'world', 'know', 'every', 'china', 'noon', 'man', 'beep', 'loo', 'spend', 'sweet', 'return', 'though', 'branquinho', 'yesterday', 'eat', 'sweden', 'would', 'rain', 'heard', 'corky', 'tonight', 'left', 'gum', 'x']


13. Determine the size of the lemmatization-based vocabulary and list the top 10 most frequent words.

In [113]:
from collections import Counter

token_counts_lemma = Counter(token_population_lemmatized)
print(f"\nLemmatization-based Vocabulary size: {len(set(token_population_lemmatized))}")
print("\nTop 10 most common lemmatized tokens:")
for token, freq in token_counts_lemma.most_common(10):
    print(f"Token: {token}, Frequency: {freq}")


Lemmatization-based Vocabulary size: 453

Top 10 most common lemmatized tokens:
Token: day, Frequency: 10
Token: happy, Frequency: 7
Token: like, Frequency: 6
Token: im, Frequency: 6
Token: time, Frequency: 6
Token: go, Frequency: 6
Token: u, Frequency: 5
Token: need, Frequency: 5
Token: know, Frequency: 5
Token: http, Frequency: 4


14. Represent each document using Bag-of-Words frequency vectors and generate the document–term matrix (DTM) for the lemmatized text.

In [114]:
from collections import Counter
import numpy as np

# Create unique vocabulary
vocab_lemma = list(set(token_population_lemmatized))
vocab_dict_lemma = {word: idx for idx, word in enumerate(vocab_lemma)}

# Vectorize each document
def vectorize_document_lemma(tokens):
    vector = np.zeros(len(vocab_lemma))
    token_freq = Counter(tokens)
    for token, count in token_freq.items():
        if token in vocab_dict_lemma:
            vector[vocab_dict_lemma[token]] = count
    return vector

# Create DTM for lemmatized tokens
dtm_lemma = np.array([vectorize_document_lemma(tokens) for tokens in df['lemmatized_tokens']])

print(f"DTM shape: {dtm_lemma.shape}")
print(f"(Number of documents: {dtm_lemma.shape[0]}, Vocabulary size: {dtm_lemma.shape[1]})")
print(f"\nFirst document vector (first 20 dimensions):\n{dtm_lemma[0][:20]}")

DTM shape: (100, 453)
(Number of documents: 100, Vocabulary size: 453)

First document vector (first 20 dimensions):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


15. Select any one document and display its BoW vector from:
- the stemming-based DTM
- the lemmatization-based DTM

In [115]:
print("\nDocument 1 BoW vector (Stemming-based DTM):")
print(dtm[5])
print("\nDocument 1 BoW vector (Lemmatization-based DTM):")
print(dtm_lemma[5])


Document 1 BoW vector (Stemming-based DTM):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.

16. Identify words that appear in most documents and words that appear in very few documents, and discuss their significance in text classification.

In [116]:
print("10 most common tokens in the stemming-based BoW:")
from collections import Counter
token_counts_stem = Counter(token_population_stemmed)
for token, freq in token_counts_stem.most_common(10):
    print(f"Token: {token}, Frequency: {freq}")

print()

print("10 most common tokens in the lemmatization-based BoW:")
token_counts_lemma = Counter(token_population_lemmatized)
for token, freq in token_counts_lemma.most_common(10):
    print(f"Token: {token}, Frequency: {freq}")

print()

print("10 least common tokens in the stemming-based BoW:")
for token, freq in token_counts_stem.most_common()[:-11:-1]:
    print(f"Token: {token}, Frequency: {freq}")

print()

print("10 least common tokens in the lemmatization-based BoW:")
for token, freq in token_counts_lemma.most_common()[:-11:-1]:
    print(f"Token: {token}, Frequency: {freq}")



10 most common tokens in the stemming-based BoW:
Token: day, Frequency: 10
Token: go, Frequency: 9
Token: happi, Frequency: 7
Token: like, Frequency: 6
Token: im, Frequency: 6
Token: time, Frequency: 6
Token: get, Frequency: 6
Token: need, Frequency: 6
Token: watch, Frequency: 5
Token: look, Frequency: 5

10 most common tokens in the lemmatization-based BoW:
Token: day, Frequency: 10
Token: happy, Frequency: 7
Token: like, Frequency: 6
Token: im, Frequency: 6
Token: time, Frequency: 6
Token: go, Frequency: 6
Token: u, Frequency: 5
Token: need, Frequency: 5
Token: know, Frequency: 5
Token: http, Frequency: 4

10 least common tokens in the stemming-based BoW:
Token: class, Frequency: 1
Token: haaaw, Frequency: 1
Token: deadlin, Frequency: 1
Token: uk, Frequency: 1
Token: noon, Frequency: 1
Token: meet, Frequency: 1
Token: afternoon, Frequency: 1
Token: return, Frequency: 1
Token: decid, Frequency: 1
Token: tri, Frequency: 1

10 least common tokens in the lemmatization-based BoW:
Token: c

These words show that both (stemming-based and lemmatization-based methods) capture meaning well. The very common and very rare words are the same across both the text representations. Most of the words are neutral and do not contribute to sentiment classification.