In [1]:
import re
import string
from time import time 

import torch
import torch.nn as nn
from tqdm import tqdm
from torchsummary import summary

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Shared text-cleaning resources used by the rest of the notebook.
stemmer = PorterStemmer()
stopwords_english = stopwords.words('english')
punct = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### ***Taking data input***

In [5]:
# Location of the IMDB reviews CSV. This is an absolute path on the author's
# machine; point it at your own copy of the file before running the notebook.
IMDB_CSV_PATH = "C:\\Users\\Ayush\\Desktop\\Let_us_start_once_again\\NLP\\1_IMDB REVIEW ANALYSIS\\imdb_complete.csv"

data = pd.read_csv(IMDB_CSV_PATH)
# Preview the first ten rows.
data.head(10)

Unnamed: 0,text,polarity
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1
5,"memory ""The Last Hunt"" stuck since saw 1956 13...",1
6,"Shakespeare fan, appreciate Ken Branagh done b...",0
7,privilege watching Scarface big screen beautif...,1
8,real classic. shipload sailors trying get town...,1
9,Serials short subjects originally shown theate...,1


In [7]:
def preprocessing(string, stopwords, stemmer):
    '''Turn one raw text string into a list of cleaned, stemmed tokens.

    The whole pipeline runs in a single pass:
      1. lowercase the text,
      2. split it on whitespace and on ',' / '.' characters (a comma or
         period that touches a digit on either side is NOT a split point,
         so tokens such as "3.5" or "1,000" stay in one piece),
      3. drop tokens that are stopwords,
      4. stem the remaining tokens,
      5. discard empty tokens.

    Parameters
    ----------
    string : str
        The text to preprocess. (The name shadows the `string` module
        inside this function only.)
    stopwords : iterable of str
        Lowercase words to remove before stemming.
    stemmer : object
        Any object exposing a ``stem(word) -> str`` method.

    Returns
    -------
    list of str
        The stemmed, non-empty tokens in their original order.
    '''
    # Hash-based lookup: membership is tested once per token, and a linear
    # scan of a stop-word list for every token is needlessly slow.
    stopword_set = frozenset(stopwords)

    # Raw string so that \s and \d reach the regex engine unchanged instead
    # of being treated as (invalid) Python string escapes.
    tokens = re.split(r'\s|(?<!\d)[,.](?!\d)', string.lower())

    preprocessed_array = []
    for token in tokens:
        # Adjacent delimiters (e.g. ", ") make re.split emit empty strings.
        if token == '' or token in stopword_set:
            continue
        stemmed = stemmer.stem(token)
        if stemmed != '':
            preprocessed_array.append(stemmed)

    return preprocessed_array
#########################################
# Run the preprocessing pipeline on every review and keep the resulting token
# lists in a new 'preprocessed' column next to the raw text.
data['preprocessed'] = data['text'].apply(
    preprocessing, args=(stopwords_english, stemmer)
)
data.head(10)

Unnamed: 0,text,polarity,preprocessed
0,"first think another Disney movie, might good, ...",1,"[first, think, anoth, disney, movi, might, goo..."
1,"Put aside Dr. House repeat missed, Desperate H...",0,"[put, asid, dr, hous, repeat, miss, desper, ho..."
2,"big fan Stephen King's work, film made even gr...",1,"[big, fan, stephen, king', work, film, made, e..."
3,watched horrid thing TV. Needless say one movi...,0,"[watch, horrid, thing, tv, needless, say, one,..."
4,truly enjoyed film. acting terrific plot. Jeff...,1,"[truli, enjoy, film, act, terrif, plot, jeff, ..."
5,"memory ""The Last Hunt"" stuck since saw 1956 13...",1,"[memori, ""the, last, hunt"", stuck, sinc, saw, ..."
6,"Shakespeare fan, appreciate Ken Branagh done b...",0,"[shakespear, fan, appreci, ken, branagh, done,..."
7,privilege watching Scarface big screen beautif...,1,"[privileg, watch, scarfac, big, screen, beauti..."
8,real classic. shipload sailors trying get town...,1,"[real, classic, shipload, sailor, tri, get, to..."
9,Serials short subjects originally shown theate...,1,"[serial, short, subject, origin, shown, theate..."


### ***Now we have to form the vocabulary***

In [17]:
# Now that the splits are done, we first have to build the vocabulary of words,
# then set a frequency threshold under which words are left out of the vocabulary,
# and replace those words in our preprocessed sets with <unk>.
def build_vocab(preprocessed_data, min_freq=2):
    '''Count token occurrences and derive a frequency-filtered vocabulary.

    Parameters
    ----------
    preprocessed_data : iterable of iterables of str
        One token sequence per document.
    min_freq : int, default 2
        Frequency threshold. The comparison is strict: a token is kept in the
        filtered vocabulary only when its count is greater than ``min_freq``,
        so a token seen exactly ``min_freq`` times is left out.

    Returns
    -------
    tuple of (dict, dict)
        ``vocab`` maps every token to its total count; ``closed_vocab`` holds
        the subset of those entries whose count exceeds ``min_freq``.
    '''
    vocab = {}
    for tokens in preprocessed_data:
        for token in tokens:
            vocab[token] = vocab.get(token, 0) + 1

    # Keep only the tokens that occur often enough.
    closed_vocab = {
        token: occurrences
        for token, occurrences in vocab.items()
        if occurrences > min_freq
    }

    return vocab, closed_vocab

In [20]:
# Full-length slices of the polarity labels and of the token lists.
polarity = data['polarity'][:]
preprocessed_data = data['preprocessed'].iloc[:]

# build_vocab compares counts with a strict '>', so min_freq=1 leaves only the
# tokens that occur more than once in closed_vocab.
vocab, closed_vocab = build_vocab(preprocessed_data, min_freq=1)
print(len(vocab), len(closed_vocab))

83804 34287


### ***Now we have to preprocess our data to tackle unknown words***