<a href="https://colab.research.google.com/github/torquerxf/campusx-nlp-follow/blob/main/assignment_lecture4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use the following dataset - https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 93.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [2]:
import pandas as pd

df = pd.read_csv(path + '/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import re
import string
from textblob import TextBlob
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.stem import PorterStemmer

chat_words_dict = {
    "u": "you",
    "brb": "be right back",
    "tbh": "to be honest",
    "idk": "I don’t know",
    "omw": "on my way",
    "afk": "away from keyboard",
    "np": "no problem",
    "thx": "thanks",
    "btw": "by the way",
    "imo": "in my opinion",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "rofl": "rolling on the floor laughing",
    "icymi": "in case you missed it"
}

def remove_html_tags(text):
  clean_text = re.sub(r'<[^>]+>', '', text)
  return clean_text

def remove_punctuations(text):
  clean_text = text.translate(str.maketrans('', '', string.punctuation))
  return clean_text

def chat_word_treatment(text):
  words = text.split()
  new_words = []
  for word in words:
    if word in chat_words_dict:
      new_words.append(chat_words_dict[word])
    else:
      new_words.append(word)
  replaced_text = ' '.join(new_words)
  return replaced_text

def check_spelling(text):
  corrected_text = TextBlob(text).correct()
  return corrected_text

stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
  words = text.split()
  filtered_words = [word for word in words if word.lower() not in stop_words]
  return " ".join(filtered_words)

def tokenization(text):
  doc = nlp(text)
  return [token.text for token in doc]

stemmer = PorterStemmer()
def stemming(text):
  return [stemmer.stem(word) for word in text]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Problem 1

# Apply all the preprocessing techniques that you think are necessary

reviews = df['review']

# lower case
reviews = reviews.str.lower()
# remove html tags
reviews = reviews.apply(remove_html_tags)
# remove punctuations
reviews = reviews.apply(remove_punctuations)
# spelling correction
# reviews = reviews.apply(check_spelling)
# stop words removal
reviews = reviews.apply(remove_stopwords)
# tokenization using spacy
reviews = reviews.apply(tokenization)
# stemming (porter)
reviews = reviews.apply(stemming)

reviews.head()

Unnamed: 0,review
0,"[one, review, mention, watch, 1, oz, episod, y..."
1,"[wonder, littl, product, film, techniqu, unass..."
2,"[thought, wonder, way, spend, time, hot, summe..."
3,"[basic, there, s, famili, littl, boy, jake, th..."
4,"[petter, mattei, love, time, money, visual, st..."


In [12]:
# Problem 2

# Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

def make_corpus(data):
  corpus = []
  for doc in data:
    for word in doc:
      corpus.append(word)
  return corpus

corpus = reviews.apply(lambda x: len(x)).sum()
print("Total number of words in the entire corpus:", corpus)

vocab = set(make_corpus(reviews))
print("Total number of unique words(vocabulary) using just python:", len(vocab))

Total number of words in the entire corpus: 6120805
Total number of unique words(vocabulary) using just python: 182334


In [18]:
# Problem 3

# Apply One Hot Encoding
import numpy as np
from sklearn.preprocessing import OneHotEncoder

tokenized = reviews.head()
vocab = list(set(make_corpus(tokenized)))
print('Vocabulary:', vocab)
print('Vocabulary shape:', len(vocab))
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(np.array(vocab).reshape(-1,1))
encoded_sentence = [encoder.transform(np.array(words).reshape(-1, 1)) for words in tokenized]
print('Encoded sentence shape:', encoded_sentence[1].shape)
encoded_sentence[1]

Vocabulary: ['offer', 'comedi', 'control', 'seem', 'adrian', 'violenc', 'new', 'parent', 'film', 'give', 'remain', 'plot', 'experiment', 'brutal', 'chosen', 'career', 'york', 'impress', 'peopl', 'success', 'mention', 'previou', 'dawson', 'present', 'talent', 'terribl', 'perform', 'boy', 'situat', 'serial', 'level', 'may', 'sit', 'buscemi', 'injustic', 'killer', 'dream', 'nt', 'inmat', 'stylishli', 'mainstream', 'well', 'still', 'sophist', 'paint', '10', 'william', 'discern', 'away', 'direct', 'glass', 'maximum', 'mani', 'big', 'appeal', 'must', 'friend', 'look', 'great', 'place', 'proof', 'prada', 'fulfil', 'sex', 'addict', 'portrait', 'devil', 'divorc', 'oldtimebbc', 'episod', 'year', 'scarlet', 'grown', 'point', 'grenier', 'edit', 'surreal', 'arthur', 'latino', 'director', 'techniqu', 'never', 'review', 'solid', 'loneli', 'luxuri', 'shadi', 'think', 'middl', 'spirit', 'manag', 'slower', 'work', 'summer', 'section', 'punch', 'dare', 'theater', 'christian', 'soap', 'scuffl', 'agenda', 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# Problem 4

# Apply bag words and find the vocabulary also find the times each word has occured

from sklearn.feature_extraction.text import CountVectorizer

tokenized = reviews.head()

# Join the list of tokens back into a string for each document
tokenized_str = tokenized.apply(lambda x: ' '.join(x))

cv = CountVectorizer(lowercase=False)
bow = cv.fit_transform(tokenized_str)
print('Vocabulary:', len(cv.get_feature_names_out()))
print('Times each word has occured:')
print(bow.toarray())

Vocabulary: 391
Times each word has occured:
[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [1 0 0 ... 0 1 1]
 [0 0 1 ... 0 0 0]]


In [36]:
# Problem 5

# Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

from sklearn.feature_extraction.text import CountVectorizer

tokenized = reviews.head()

# Join the list of tokens back into a string for each document
tokenized_str = tokenized.apply(lambda x: ' '.join(x))

bigram_cv = CountVectorizer(ngram_range=(2,2))
bigram_bow = bigram_cv.fit_transform(tokenized_str)
print('Dimension of bigram vocab:', bigram_cv.get_feature_names_out().shape)

trigram_cv = CountVectorizer(ngram_range=(3,3))
trigram_bow = trigram_cv.fit_transform(tokenized_str)
print('Dimension of trigram vocab:', trigram_cv.get_feature_names_out().shape)

Dimension of bigram vocab: (523,)
Dimension of trigram vocab: (524,)


In [37]:
# Problem 6

# Apply tf-idf and find out the idf scores of words, also find out the vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer

tokenized = reviews.head()

# Join the list of tokens back into a string for each document
tokenized_str = tokenized.apply(lambda x: ' '.join(x))

tfidf = TfidfVectorizer()
tfidf_bow = tfidf.fit_transform(tokenized_str)
print('Vocabulary:', tfidf.vocabulary_)
print('IDF scores:', tfidf.idf_)

Vocabulary: {'one': 220, 'review': 267, 'mention': 199, 'watch': 364, 'oz': 225, 'episod': 94, 'you': 387, 'll': 177, 'hook': 143, 'right': 268, 'exactli': 98, 'happen': 138, 'meth': 201, 'first': 113, 'thing': 337, 'struck': 316, 'brutal': 31, 'unflinch': 354, 'scene': 276, 'violenc': 360, 'set': 286, 'word': 378, 'go': 126, 'trust': 350, 'show': 292, 'faint': 105, 'heart': 140, 'timid': 343, 'pull': 252, 'punch': 253, 'regard': 263, 'drug': 85, 'sex': 287, 'hardcor': 139, 'classic': 45, 'use': 356, 'wordit': 379, 'call': 33, 'nicknam': 216, 'given': 124, 'oswald': 224, 'maximum': 195, 'secur': 281, 'state': 312, 'penitentari': 230, 'focus': 115, 'mainli': 185, 'emerald': 89, 'citi': 43, 'experiment': 101, 'section': 280, 'prison': 248, 'cell': 38, 'glass': 125, 'front': 118, 'face': 103, 'inward': 155, 'privaci': 249, 'high': 141, 'agenda': 7, 'em': 88, 'home': 142, 'manyaryan': 191, 'muslim': 209, 'gangsta': 121, 'latino': 168, 'christian': 42, 'italian': 157, 'irish': 156, 'moreso'