# Download dataset from the below kaggle link

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
# import necessary libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# read the dataset
imdb = pd.read_csv('IMDB Dataset.csv')

In [4]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df = imdb.iloc[:5,:1].copy()
df_test = imdb.iloc[5:10,:1].copy()
df

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."


In [6]:
df_test

Unnamed: 0,review
5,"Probably my all-time favorite movie, a story o..."
6,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i..."
8,Encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...


# Text Preprocessing

### Stopwords Removal:

In [7]:
# Stopwords Removal
stop_words = set(stopwords.words('english'))
df['no_stopwords'] = df['review'].apply(lambda x: [word for word in word_tokenize(x) if word.lower() not in stop_words])
df

Unnamed: 0,review,no_stopwords
0,One of the other reviewers has mentioned that ...,"[One, reviewers, mentioned, watching, 1, Oz, e..."
1,A wonderful little production. <br /><br />The...,"[wonderful, little, production, ., <, br, /, >..."
2,I thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,"[Basically, 's, family, little, boy, (, Jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, ``, Love, Time, Money, ''..."


In [8]:
# converting back from list to string
df['no_stopwords'] = df['no_stopwords'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,review,no_stopwords
0,One of the other reviewers has mentioned that ...,One reviewers mentioned watching 1 Oz episode ...
1,A wonderful little production. <br /><br />The...,wonderful little production . < br / > < br / ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,Basically 's family little boy ( Jake ) thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love Time Money '' visuall...


### Tokenization:

In [9]:
# Tokenization
df['word_tokens'] = df['no_stopwords'].apply(word_tokenize)
df['sentence_tokens'] = df['no_stopwords'].apply(sent_tokenize)
df

Unnamed: 0,review,no_stopwords,word_tokens,sentence_tokens
0,One of the other reviewers has mentioned that ...,One reviewers mentioned watching 1 Oz episode ...,"[One, reviewers, mentioned, watching, 1, Oz, e...",[One reviewers mentioned watching 1 Oz episode...
1,A wonderful little production. <br /><br />The...,wonderful little production . < br / > < br / ...,"[wonderful, little, production, ., <, br, /, >...","[wonderful little production ., < br / > < br ..."
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",[thought wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,Basically 's family little boy ( Jake ) thinks...,"[Basically, 's, family, little, boy, (, Jake, ...",[Basically 's family little boy ( Jake ) think...
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love Time Money '' visuall...,"[Petter, Mattei, 's, ``, Love, Time, Money, ``...",[Petter Mattei 's `` Love Time Money '' visual...


### Stemming and Lemmatization:

In [10]:
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
df['stemmed'] = df['word_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
df['lemmatized'] = df['word_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
df

Unnamed: 0,review,no_stopwords,word_tokens,sentence_tokens,stemmed,lemmatized
0,One of the other reviewers has mentioned that ...,One reviewers mentioned watching 1 Oz episode ...,"[One, reviewers, mentioned, watching, 1, Oz, e...",[One reviewers mentioned watching 1 Oz episode...,"[one, review, mention, watch, 1, oz, episod, '...","[One, reviewer, mentioned, watching, 1, Oz, ep..."
1,A wonderful little production. <br /><br />The...,wonderful little production . < br / > < br / ...,"[wonderful, little, production, ., <, br, /, >...","[wonderful little production ., < br / > < br ...","[wonder, littl, product, ., <, br, /, >, <, br...","[wonderful, little, production, ., <, br, /, >..."
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",[thought wonderful way spend time hot summer w...,"[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,Basically 's family little boy ( Jake ) thinks...,"[Basically, 's, family, little, boy, (, Jake, ...",[Basically 's family little boy ( Jake ) think...,"[basic, 's, famili, littl, boy, (, jake, ), th...","[Basically, 's, family, little, boy, (, Jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love Time Money '' visuall...,"[Petter, Mattei, 's, ``, Love, Time, Money, ``...",[Petter Mattei 's `` Love Time Money '' visual...,"[petter, mattei, 's, ``, love, time, money, ``...","[Petter, Mattei, 's, ``, Love, Time, Money, ``..."


### POS Tagging:

In [11]:
# POS Tagging
df['pos_tags'] = df['word_tokens'].apply(pos_tag)
df

Unnamed: 0,review,no_stopwords,word_tokens,sentence_tokens,stemmed,lemmatized,pos_tags
0,One of the other reviewers has mentioned that ...,One reviewers mentioned watching 1 Oz episode ...,"[One, reviewers, mentioned, watching, 1, Oz, e...",[One reviewers mentioned watching 1 Oz episode...,"[one, review, mention, watch, 1, oz, episod, '...","[One, reviewer, mentioned, watching, 1, Oz, ep...","[(One, CD), (reviewers, NNS), (mentioned, VBD)..."
1,A wonderful little production. <br /><br />The...,wonderful little production . < br / > < br / ...,"[wonderful, little, production, ., <, br, /, >...","[wonderful little production ., < br / > < br ...","[wonder, littl, product, ., <, br, /, >, <, br...","[wonderful, little, production, ., <, br, /, >...","[(wonderful, JJ), (little, JJ), (production, N..."
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",[thought wonderful way spend time hot summer w...,"[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, time, hot, su...","[(thought, VBN), (wonderful, JJ), (way, NN), (..."
3,Basically there's a family where a little boy ...,Basically 's family little boy ( Jake ) thinks...,"[Basically, 's, family, little, boy, (, Jake, ...",[Basically 's family little boy ( Jake ) think...,"[basic, 's, famili, littl, boy, (, jake, ), th...","[Basically, 's, family, little, boy, (, Jake, ...","[(Basically, NNP), ('s, POS), (family, NN), (l..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love Time Money '' visuall...,"[Petter, Mattei, 's, ``, Love, Time, Money, ``...",[Petter Mattei 's `` Love Time Money '' visual...,"[petter, mattei, 's, ``, love, time, money, ``...","[Petter, Mattei, 's, ``, Love, Time, Money, ``...","[(Petter, NNP), (Mattei, NNP), ('s, POS), (``,..."


# Feature Engineering

### TF-IDF Vectorization:

In [12]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(df['no_stopwords'])
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,10,accustomed,acting,action,actors,addiction,adrian,agenda,agreements,air,...,word,work,world,worth,would,written,years,york,young,zombie
0,0.0,0.062527,0.0,0.0,0.0,0.0,0.0,0.062527,0.062527,0.0,...,0.125053,0.0,0.0,0.0,0.125053,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.100558,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.100558,0.0,0.100558,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.10804,0.0,0.0,0.0,0.10804,...,0.0,0.0,0.0,0.0,0.0,0.0,0.10804,0.0,0.10804,0.0
3,0.098464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196928
4,0.0,0.0,0.072891,0.072891,0.0,0.0,0.072891,0.0,0.0,0.0,...,0.0,0.072891,0.072891,0.0,0.0,0.0,0.0,0.072891,0.0,0.0


### One-Hot Encoding:

In [13]:
# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[['no_stopwords']])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
one_hot_df

Unnamed: 0,"no_stopwords_Basically 's family little boy ( Jake ) thinks 's zombie closet & parents fighting time. < br / > < br / > movie slower soap opera ... suddenly , Jake decides become Rambo kill zombie. < br / > < br / > OK , first 're going make film must Decide thriller drama ! drama movie watchable . Parents divorcing & arguing like real life . Jake closet totally ruins film ! expected see BOOGEYMAN similar movie , instead watched drama meaningless thriller spots. < br / > < br / > 3 10 well playing parents & descent dialogs . shots Jake : ignore .","no_stopwords_One reviewers mentioned watching 1 Oz episode 'll hooked . right , exactly happened me. < br / > < br / > first thing struck Oz brutality unflinching scenes violence , set right word GO . Trust , show faint hearted timid . show pulls punches regards drugs , sex violence . hardcore , classic use word. < br / > < br / > called OZ nickname given Oswald Maximum Security State Penitentary . focuses mainly Emerald City , experimental section prison cells glass fronts face inwards , privacy high agenda . Em City home many .. Aryans , Muslims , gangstas , Latinos , Christians , Italians , Irish .... scuffles , death stares , dodgy dealings shady agreements never far away. < br / > < br / > would say main appeal show due fact goes shows would n't dare . Forget pretty pictures painted mainstream audiences , forget charm , forget romance ... OZ n't mess around . first episode ever saw struck nasty surreal , could n't say ready , watched , developed taste Oz , got accustomed high levels graphic violence . violence , injustice ( crooked guards 'll sold nickel , inmates 'll kill order get away , well mannered , middle class inmates turned prison bitches due lack street skills prison experience ) Watching Oz , may become comfortable uncomfortable viewing .... thats get touch darker side .","no_stopwords_Petter Mattei 's `` Love Time Money '' visually stunning film watch . Mr. Mattei offers us vivid portrait human relations . movie seems telling us money , power success people different situations encounter . < br / > < br / > variation Arthur Schnitzler 's play theme , director transfers action present time New York different characters meet connect . one connected one way , another next person , one seems know previous point contact . Stylishly , film sophisticated luxurious look . taken see people live world live habitat. < br / > < br / > thing one gets souls picture different stages loneliness one inhabits . big city exactly best place human relations find sincere fulfillment , one discerns case people encounter. < br / > < br / > acting good Mr. Mattei 's direction . Steve Buscemi , Rosario Dawson , Carol Kane , Michael Imperioli , Adrian Grenier , rest talented cast , make characters come alive. < br / > < br / > wish Mr. Mattei good luck await anxiously next work .","no_stopwords_thought wonderful way spend time hot summer weekend , sitting air conditioned theater watching light-hearted comedy . plot simplistic , dialogue witty characters likable ( even well bread suspected serial killer ) . may disappointed realize Match Point 2 : Risk Addiction , thought proof Woody Allen still fully control style many us grown love. < br / > < br / > 'd laughed one Woody 's comedies years ( dare say decade ? ) . 've never impressed Scarlet Johanson , managed tone `` sexy '' image jumped right average , spirited young woman. < br / > < br / > may crown jewel career , wittier `` Devil Wears Prada '' interesting `` Superman '' great comedy go see friends .","no_stopwords_wonderful little production . < br / > < br / > filming technique unassuming- old-time-BBC fashion gives comforting , sometimes discomforting , sense realism entire piece . < br / > < br / > actors extremely well chosen- Michael Sheen `` got polari '' voices pat ! truly see seamless editing guided references Williams ' diary entries , well worth watching terrificly written performed piece . masterful production one great master 's comedy life . < br / > < br / > realism really comes home little things : fantasy guard , rather use traditional 'dream ' techniques remains solid disappears . plays knowledge senses , particularly scenes concerning Orton Halliwell sets ( particularly flat Halliwell 's murals decorating every surface ) terribly well done ."
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0


### Bag of Words:

In [14]:
# Bag of Words
count_vectorizer = CountVectorizer()
bow_vectors = count_vectorizer.fit_transform(df['no_stopwords'])
bow_df = pd.DataFrame(bow_vectors.toarray(), columns=count_vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,10,accustomed,acting,action,actors,addiction,adrian,agenda,agreements,air,...,word,work,world,worth,would,written,years,york,young,zombie
0,0,1,0,0,0,0,0,1,1,0,...,2,0,0,0,2,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,1,1,0,0,1,0,0,0,...,0,1,1,0,0,0,0,1,0,0


### Unigram, Bigram, n-gram:

In [15]:
# Unigram, Bigram, n-gram
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))

unigram_vectors = unigram_vectorizer.fit_transform(df['no_stopwords'])
bigram_vectors = bigram_vectorizer.fit_transform(df['no_stopwords'])
ngram_vectors = ngram_vectorizer.fit_transform(df['no_stopwords'])

unigram_df = pd.DataFrame(unigram_vectors.toarray(), columns=unigram_vectorizer.get_feature_names_out())
bigram_df = pd.DataFrame(bigram_vectors.toarray(), columns=bigram_vectorizer.get_feature_names_out())
ngram_df = pd.DataFrame(ngram_vectors.toarray(), columns=ngram_vectorizer.get_feature_names_out())

In [16]:
unigram_df

Unnamed: 0,10,accustomed,acting,action,actors,addiction,adrian,agenda,agreements,air,...,word,work,world,worth,would,written,years,york,young,zombie
0,0,1,0,0,0,0,0,1,1,0,...,2,0,0,0,2,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,1,1,0,0,1,0,0,0,...,0,1,1,0,0,0,0,1,0,0


In [17]:
bigram_df

Unnamed: 0,10 well,accustomed high,acting good,action present,actors extremely,addiction thought,adrian grenier,agenda em,agreements never,air conditioned,...,world live,worth watching,would dare,would say,written performed,years dare,york different,young woman,zombie br,zombie closet
0,0,1,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0,1,1,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [18]:
ngram_df

Unnamed: 0,10,10 well,10 well playing,accustomed,accustomed high,accustomed high levels,acting,acting good,acting good mr,action,...,york different,york different characters,young,young woman,young woman br,zombie,zombie br,zombie br br,zombie closet,zombie closet parents
0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2,1,1,1,1
4,0,0,0,0,0,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0


In [19]:
def preprocessing_and_feature_engineering(df_test):
    # Stopwords Removal
    stop_words = set(stopwords.words('english'))
    df_test['no_stopwords'] = df_test['review'].apply(lambda x: [word for word in word_tokenize(x) if word.lower() not in stop_words])
    df_test['no_stopwords'] = df_test['no_stopwords'].apply(lambda x: ' '.join(x))
    
    # Tokenization
    df_test['word_tokens'] = df_test['no_stopwords'].apply(word_tokenize)
    df_test['sentence_tokens'] = df_test['no_stopwords'].apply(sent_tokenize)
    
    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    df_test['stemmed'] = df_test['word_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    df_test['lemmatized'] = df_test['word_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
    
    # POS Tagging
    df_test['pos_tags'] = df_test['word_tokens'].apply(pos_tag)
    
    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectors = tfidf_vectorizer.fit_transform(df_test['no_stopwords'])
    tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    
    # One-Hot Encoding
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df_test[['no_stopwords']])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
    
    # Bag of Words
    count_vectorizer = CountVectorizer()
    bow_vectors = count_vectorizer.fit_transform(df_test['no_stopwords'])
    bow_df = pd.DataFrame(bow_vectors.toarray(), columns=count_vectorizer.get_feature_names_out())
    
    # Unigram, Bigram, n-gram
    unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
    ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))
    
    unigram_vectors = unigram_vectorizer.fit_transform(df_test['no_stopwords'])
    bigram_vectors = bigram_vectorizer.fit_transform(df_test['no_stopwords'])
    ngram_vectors = ngram_vectorizer.fit_transform(df_test['no_stopwords'])
    
    unigram_df = pd.DataFrame(unigram_vectors.toarray(), columns=unigram_vectorizer.get_feature_names_out())
    bigram_df = pd.DataFrame(bigram_vectors.toarray(), columns=bigram_vectorizer.get_feature_names_out())
    ngram_df = pd.DataFrame(ngram_vectors.toarray(), columns=ngram_vectorizer.get_feature_names_out())
    
    return df_test, tfidf_df, one_hot_df, bow_df, unigram_df, bigram_df, ngram_df

In [20]:
df_test_res, tfidf_df_res, one_hot_df_res, bow_df_res, unigram_df_res, bigram_df_res, ngram_df_res = preprocessing_and_feature_engineering(df_test)

In [21]:
df_test_res

Unnamed: 0,review,no_stopwords,word_tokens,sentence_tokens,stemmed,lemmatized,pos_tags
5,"Probably my all-time favorite movie, a story o...","Probably all-time favorite movie , story selfl...","[Probably, all-time, favorite, movie, ,, story...","[Probably all-time favorite movie , story self...","[probabl, all-tim, favorit, movi, ,, stori, se...","[Probably, all-time, favorite, movie, ,, story...","[(Probably, RB), (all-time, JJ), (favorite, JJ..."
6,I sure would like to see a resurrection of a u...,sure would like see resurrection dated Seahunt...,"[sure, would, like, see, resurrection, dated, ...",[sure would like see resurrection dated Seahun...,"[sure, would, like, see, resurrect, date, seah...","[sure, would, like, see, resurrection, dated, ...","[(sure, RB), (would, MD), (like, VB), (see, VB..."
7,"This show was an amazing, fresh & innovative i...","show amazing , fresh & innovative idea 70 's f...","[show, amazing, ,, fresh, &, innovative, idea,...","[show amazing , fresh & innovative idea 70 's ...","[show, amaz, ,, fresh, &, innov, idea, 70, 's,...","[show, amazing, ,, fresh, &, innovative, idea,...","[(show, NN), (amazing, JJ), (,, ,), (fresh, JJ..."
8,Encouraged by the positive comments about this...,Encouraged positive comments film looking forw...,"[Encouraged, positive, comments, film, looking...",[Encouraged positive comments film looking for...,"[encourag, posit, comment, film, look, forward...","[Encouraged, positive, comment, film, looking,...","[(Encouraged, VBN), (positive, JJ), (comments,..."
9,If you like original gut wrenching laughter yo...,like original gut wrenching laughter like movi...,"[like, original, gut, wrenching, laughter, lik...",[like original gut wrenching laughter like mov...,"[like, origin, gut, wrench, laughter, like, mo...","[like, original, gut, wrenching, laughter, lik...","[(like, IN), (original, JJ), (gut, NN), (wrenc..."


In [22]:
tfidf_df_res

Unnamed: 0,10,15,1990,25,70,950,acting,adventure,air,aired,...,white,work,world,worst,would,wrenching,writing,years,you,young
0,0.0,0.136326,0.0,0.136326,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.109987,0.0,0.0,0.0,0.0,0.109987,0.0,0.0
1,0.094837,0.0,0.0,0.0,0.0,0.0,0.0,0.094837,0.0,0.0,...,0.094837,0.094837,0.076514,0.0,0.459083,0.0,0.0,0.0,0.094837,0.0
2,0.0,0.0,0.091744,0.0,0.091744,0.0,0.0,0.0,0.183487,0.091744,...,0.0,0.0,0.0,0.0,0.074018,0.0,0.091744,0.074018,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.11,0.11,0.0,0.0,0.0,...,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.226493,0.0,0.0,0.0,0.226493


In [23]:
one_hot_df_res

Unnamed: 0,"no_stopwords_Encouraged positive comments film looking forward watching film . Bad mistake . 've seen 950+ films truly one worst - 's awful almost every way : editing , pacing , storyline , 'acting , ' soundtrack ( film 's song - lame country tune - played less four times ) . film looks cheap nasty boring extreme . Rarely happy see end credits film . < br / > < br / > thing prevents giving 1-score Harvey Keitel - far best performance least seems making bit effort . One Keitel obsessives .","no_stopwords_Probably all-time favorite movie , story selflessness , sacrifice dedication noble cause , 's preachy boring . never gets old , despite seen 15 times last 25 years . Paul Lukas ' performance brings tears eyes , Bette Davis , one truly sympathetic roles , delight . kids , grandma says , like `` dressed-up midgets '' children , makes fun watch . mother 's slow awakening 's happening world roof believable startling . dozen thumbs , 'd `` '' movie .","no_stopwords_like original gut wrenching laughter like movie . young old love movie , hell even mom liked it. < br / > < br / > Great Camp ! ! !","no_stopwords_show amazing , fresh & innovative idea 70 's first aired . first 7 8 years brilliant , things dropped . 1990 , show really funny anymore , 's continued decline complete waste time today. < br / > < br / > 's truly disgraceful far show fallen . writing painfully bad , performances almost bad - mildly entertaining respite guest-hosts , show probably would n't still air . find hard believe creator hand-selected original cast also chose band hacks followed . one recognize brilliance see fit replace mediocrity ? felt must give 2 stars respect original cast made show huge success . , show awful . ca n't believe 's still air .","no_stopwords_sure would like see resurrection dated Seahunt series tech today would bring back kid excitement me.I grew black white TV Seahunt Gunsmoke hero 's every week.You vote comeback new sea hunt.We need change pace TV would work world water adventure.Oh way thank outlet like view many viewpoints TV many movies.So ole way believe 've got wan na say.Would nice read plus points sea hunt.If rhymes would 10 lines would let submit , leave doubt quit , must go lets ."
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0


In [24]:
bow_df_res

Unnamed: 0,10,15,1990,25,70,950,acting,adventure,air,aired,...,white,work,world,worst,would,wrenching,writing,years,you,young
0,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,1,0,0,...,1,1,1,0,6,0,0,0,1,0
2,0,0,1,0,1,0,0,0,2,1,...,0,0,0,0,1,0,1,1,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [25]:
unigram_df_res

Unnamed: 0,10,15,1990,25,70,950,acting,adventure,air,aired,...,white,work,world,worst,would,wrenching,writing,years,you,young
0,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,1,0,0,...,1,1,1,0,6,0,0,0,1,0
2,0,0,1,0,1,0,0,0,2,1,...,0,0,0,0,1,0,1,1,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [26]:
bigram_df_res

Unnamed: 0,10 lines,15 times,1990 show,25 years,70 first,950 films,acting soundtrack,adventure oh,air find,aired first,...,would like,would nice,would still,would work,wrenching laughter,writing painfully,years brilliant,years paul,you vote,young old
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,1,0,0,...,1,1,0,1,0,0,0,0,1,0
2,0,0,1,0,1,0,0,0,1,1,...,0,0,1,0,0,1,1,0,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [27]:
ngram_df_res

Unnamed: 0,10,10 lines,10 lines would,15,15 times,15 times last,1990,1990 show,1990 show really,25,...,years brilliant,years brilliant things,years paul,years paul lukas,you,you vote,you vote comeback,young,young old,young old love
0,0,0,0,1,1,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
2,0,0,0,0,0,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import pandas as pd
import numpy as np

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Sample text dataset (You can replace this with any text corpus)
text = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence. NLP involves analyzing, understanding, and generating languages that humans use naturally."

# 1. Stop Words Removal
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Stop Words Removal:\n", filtered_tokens)

# 2. Tokenization
# Word Tokenization
word_tokens = word_tokenize(text)
print("\nWord Tokenization:\n", word_tokens)

# Sentence Tokenization
sentence_tokens = sent_tokenize(text)
print("\nSentence Tokenization:\n", sentence_tokens)

# 3. Stemming and Lemmatization
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemming:\n", stemmed_tokens)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nLemmatization:\n", lemmatized_tokens)

# 4. POS Tagging
pos_tags = pos_tag(filtered_tokens)
print("\nPOS Tagging:\n", pos_tags)

# 5. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentence_tokens)
print("\nTF-IDF Vectorization:\n", tfidf_matrix.toarray())
print("\nTF-IDF Feature Names:\n", tfidf_vectorizer.get_feature_names_out())

# 6. One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_words = onehot_encoder.fit_transform(np.array(filtered_tokens).reshape(-1, 1))
print("\nOne-Hot Encoding:\n", encoded_words)

# 7. Bag of Words
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(sentence_tokens)
print("\nBag of Words:\n", bow_matrix.toarray())
print("\nBOW Feature Names:\n", count_vectorizer.get_feature_names_out())

# 8. Unigram, Bigram, n-gram
# Unigram
unigram_vectorizer = CountVectorizer(ngram_range=(1,1))
unigram_matrix = unigram_vectorizer.fit_transform(sentence_tokens)
print("\nUnigram Representation:\n", unigram_matrix.toarray())
print("\nUnigram Feature Names:\n", unigram_vectorizer.get_feature_names_out())

# Bigram
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
bigram_matrix = bigram_vectorizer.fit_transform(sentence_tokens)
print("\nBigram Representation:\n", bigram_matrix.toarray())
print("\nBigram Feature Names:\n", bigram_vectorizer.get_feature_names_out())

# n-gram (3-gram)
ngram_vectorizer = CountVectorizer(ngram_range=(3,3))
ngram_matrix = ngram_vectorizer.fit_transform(sentence_tokens)
print("\nn-gram (3-gram) Representation:\n", ngram_matrix.toarray())
print("\nn-gram (3-gram) Feature Names:\n", ngram_vectorizer.get_feature_names_out())


Stop Words Removal:
 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '.', 'NLP', 'involves', 'analyzing', ',', 'understanding', ',', 'generating', 'languages', 'humans', 'use', 'naturally', '.']

Word Tokenization:
 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.', 'NLP', 'involves', 'analyzing', ',', 'understanding', ',', 'and', 'generating', 'languages', 'that', 'humans', 'use', 'naturally', '.']

Sentence Tokenization:
 ['Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.', 'NLP involves analyzing, understanding, and generating languages that humans use naturally.']

Stemming:
 ['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'artifici', 'intellig', '.', 'nlp', 'involv', 'analyz', ',', 'understand', ',', 'gener', 'languag', 'human', 'use', 'natur', '.']

Lemmatization:
 ['Natural', 'Language

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Load the IMDB dataset (you may need to change the path if necessary)
df = pd.read_csv('IMDB Dataset.csv')

# Display the first few rows of the dataset
print(df.head())

# Select the text column ('review') and label column ('sentiment')
texts = df['review']
labels = df['sentiment']

# Preprocess the text data
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Stop words removal
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to the text data
texts = texts.apply(preprocess_text)

# Example: Using the first review for demonstration
text = texts[0]

# 1. Stop Words Removal (already done in preprocess_text)
tokens = word_tokenize(text)
print("Stop Words Removal:\n", tokens)

# 2. Tokenization
# Word Tokenization
word_tokens = word_tokenize(text)
print("\nWord Tokenization:\n", word_tokens)

# Sentence Tokenization
sentence_tokens = sent_tokenize(text)
print("\nSentence Tokenization:\n", sentence_tokens)

# 3. Stemming and Lemmatization (already done in preprocess_text)
# Stemming and Lemmatization are combined in preprocess_text, so the tokens are already stemmed and lemmatized.

# 4. POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tagging:\n", pos_tags)

# 5. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentence_tokens)
print("\nTF-IDF Vectorization:\n", tfidf_matrix.toarray())
print("\nTF-IDF Feature Names:\n", tfidf_vectorizer.get_feature_names_out())

# 6. One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_words = onehot_encoder.fit_transform(np.array(tokens).reshape(-1, 1))
print("\nOne-Hot Encoding:\n", encoded_words)

# 7. Bag of Words
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(sentence_tokens)
print("\nBag of Words:\n", bow_matrix.toarray())
print("\nBOW Feature Names:\n", count_vectorizer.get_feature_names_out())

# 8. Unigram, Bigram, n-gram
# Unigram
unigram_vectorizer = CountVectorizer(ngram_range=(1,1))
unigram_matrix = unigram_vectorizer.fit_transform(sentence_tokens)
print("\nUnigram Representation:\n", unigram_matrix.toarray())
print("\nUnigram Feature Names:\n", unigram_vectorizer.get_feature_names_out())

# Bigram
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
bigram_matrix = bigram_vectorizer.fit_transform(sentence_tokens)
print("\nBigram Representation:\n", bigram_matrix.toarray())
print("\nBigram Feature Names:\n", bigram_vectorizer.get_feature_names_out())

# n-gram (3-gram)
ngram_vectorizer = CountVectorizer(ngram_range=(3,3))
ngram_matrix = ngram_vectorizer.fit_transform(sentence_tokens)
print("\nn-gram (3-gram) Representation:\n", ngram_matrix.toarray())
print("\nn-gram (3-gram) Feature Names:\n", ngram_vectorizer.get_feature_names_out())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
