<a href="https://colab.research.google.com/github/verma-satyam/ML-Coding-Ninjas/blob/main/13.%20NLP/2_Movie_Review_Project_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing and exploring data

In [57]:
import nltk

In [58]:
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [59]:
movie_reviews.categories()

['neg', 'pos']

In [60]:
movie_reviews.fileids() # file names
len(movie_reviews.fileids())

2000

In [61]:
movie_reviews.fileids('neg') # only negative reviews

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [62]:
movie_reviews.words(movie_reviews.fileids()[5])
# list of all words present in 5th file

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

## Building Dataset from text

In [63]:
documents = []
for category in movie_reviews.categories():
  for fileid in movie_reviews.fileids(category):
    documents.append((movie_reviews.words(fileid),category))

In [64]:
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [65]:
# shuffling the data
import random
random.shuffle(documents)
documents[0:5]

[(['ingredients', ':', 'neophyte', 'lawyer', ',', ...], 'pos'),
 (['when', 'i', 'ponder', 'childhood', 'memories', ...], 'neg'),
 (['capsule', ':', 'combine', 'one', 'quart', 'of', ...], 'neg'),
 (['there', "'", 's', 'no', 'reason', 'to', 'doubt', ...], 'neg'),
 (['as', 'a', 'revolutionary', 'war', 'hero', 'in', ...], 'pos')]

## DATA Cleaning

 - Stopword Removal
 - Lemmatization

Importing Stopwords and punctuation.

In [66]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
import string
punc = list(string.punctuation)
stop.update(punc)

Import and converting POS

In [68]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [69]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
  if tag.startswith("J"):
    return wordnet.ADJ
  elif tag.startswith("V"):
    return wordnet.VERB
  elif tag.startswith("N"):
    return wordnet.NOUN
  elif tag.startswith("R"):
    return wordnet.ADV
  else:
    return wordnet.NOUN

Importing Lemma

In [70]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaning Dataset

In [71]:
def clean_review(words):
  output_words = []
  for w in words:
    if w.lower() not in stop:
      pos = pos_tag([w]) #bcz it is expecting an array of words.
      clean_word = lemma.lemmatize(w,pos= get_simple_pos(pos[0][1]))
      output_words.append(clean_word.lower())
  return output_words

In [72]:
documents = [(clean_review(document),category)for document,category in documents]

In [73]:
documents[0]

(['ingredient',
  'neophyte',
  'lawyer',
  'legal',
  'situation',
  'corrupt',
  'insurance',
  'company',
  'synopsis',
  'rudy',
  'baylor',
  'matt',
  'damon',
  'ethical',
  'kid',
  'fresh',
  'law',
  'school',
  'must',
  'juggle',
  'three',
  'legal',
  'situation',
  'time',
  'rudy',
  'girlfriend',
  'attack',
  'violent',
  'husband',
  'rudy',
  'elderly',
  'landlady',
  'want',
  'arrange',
  'child',
  'exclude',
  'family',
  'rudy',
  'friend',
  'leukemia',
  'sue',
  'corrupt',
  'insurance',
  'company',
  'pay',
  'bone',
  'marrow',
  'transplant',
  'rudy',
  'new',
  'lawyer',
  'thoroughly',
  'outgunned',
  'luckily',
  'aid',
  'sleazy',
  'ambulance',
  'chaser',
  'danny',
  'devito',
  'fail',
  'bar',
  'exam',
  'six',
  'time',
  'well',
  'kindly',
  'preside',
  'judge',
  'danny',
  'glover',
  'jon',
  'voight',
  'play',
  'leo',
  'f',
  'drummond',
  'intimidate',
  'arrogant',
  'leader',
  'powerful',
  'insurance',
  'company',
  'team',


### Creating Features

In [74]:
# train_test_split, allready random documents so esse hi select kr lege
training_doc = documents[0:1500]
testing_doc = documents[1500:]

In [75]:
all_words = []
for doc in training_doc:
  all_words += doc[0]

In [76]:
freq = nltk.FreqDist(all_words)
freq.most_common(15)

[('film', 8431),
 ('movie', 5135),
 ('one', 4527),
 ('make', 3235),
 ('like', 2988),
 ('character', 2983),
 ('get', 2741),
 ('see', 2347),
 ('go', 2251),
 ('time', 2196),
 ('well', 2091),
 ('scene', 2026),
 ('even', 1985),
 ('good', 1806),
 ('story', 1700)]

In [77]:
features = [i[0] for i in freq.most_common(3000)] #finding top 3000 words
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'bad',
 'come',
 '--',
 'life',
 'give',
 'two',
 'way',
 'seem',
 'look',
 'know',
 'first',
 'end',
 'work',
 'year',
 'thing',
 'plot',
 'say',
 'play',
 'show',
 'really',
 'little',
 'people',
 'man',
 'star',
 'could',
 'love',
 'great',
 'try',
 'never',
 'best',
 'new',
 'performance',
 'director',
 'big',
 'action',
 'many',
 'actor',
 'want',
 'find',
 'u',
 'watch',
 'role',
 'think',
 'act',
 'another',
 'world',
 'audience',
 'something',
 'turn',
 'back',
 'day',
 'still',
 'old',
 'however',
 'set',
 'use',
 'around',
 'every',
 'guy',
 'real',
 'though',
 'begin',
 'part',
 'feel',
 'comedy',
 'run',
 'cast',
 'point',
 'write',
 'enough',
 'interest',
 'last',
 'almost',
 'script',
 'young',
 'may',
 'name',
 'long',
 'effect',
 'right',
 'funny',
 'actually',
 'nothing',
 'woman',
 'fact',
 'fri

In [78]:
def get_features_dict(words):
  current_features = {}
  wordsSet = set(words)
  for w in features:
    current_features[w] = w in wordsSet
  return current_features

In [79]:
training_data = [(get_features_dict(doc), cat) for doc,cat in training_doc]
testing_data = [(get_features_dict(doc), cat) for doc,cat in testing_doc]

In [80]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': False,
  'go': False,
  'time': True,
  'well': True,
  'scene': False,
  'even': False,
  'good': True,
  'story': False,
  'take': False,
  'would': False,
  'much': False,
  'also': True,
  'bad': False,
  'come': True,
  '--': False,
  'life': False,
  'give': False,
  'two': True,
  'way': True,
  'seem': False,
  'look': False,
  'know': False,
  'first': True,
  'end': True,
  'work': False,
  'year': False,
  'thing': True,
  'plot': False,
  'say': False,
  'play': True,
  'show': False,
  'really': False,
  'little': True,
  'people': False,
  'man': False,
  'star': False,
  'could': False,
  'love': False,
  'great': False,
  'try': True,
  'never': False,
  'best': False,
  'new': True,
  'performance': False,
  'director': True,
  'big': True,
  'action': False,
  'many': False,
  'actor': False,
  'want': True,
  'find': False,
  'u': False,
  'wat

## Classifier

In [81]:
from nltk import NaiveBayesClassifier
clf = NaiveBayesClassifier.train(training_data)

In [82]:
nltk.classify.accuracy(clf,testing_data)

0.79

In [83]:
clf.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     10.9 : 1.0
              schumacher = True              neg : pos    =     10.2 : 1.0
             beautifully = True              pos : neg    =      8.0 : 1.0
            breathtaking = True              pos : neg    =      7.7 : 1.0
                 garbage = True              neg : pos    =      7.2 : 1.0
                    lame = True              neg : pos    =      6.8 : 1.0
                   inept = True              neg : pos    =      6.7 : 1.0
                 refresh = True              pos : neg    =      6.6 : 1.0
                 freddie = True              neg : pos    =      6.5 : 1.0
                    jude = True              pos : neg    =      6.4 : 1.0
                     sat = True              neg : pos    =      6.4 : 1.0
                   awful = True              neg : pos    =      6.3 : 1.0
                   anger = True              pos : neg    =      6.2 : 1.0

### To use SKLearn Classifier on NLTK dataset

In [85]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
svc = SVC()
clf_sklearn = SklearnClassifier(svc)

In [86]:
clf_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))>

In [87]:
nltk.classify.accuracy(clf_sklearn,testing_data)

0.836

In [88]:
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
rfc = RandomForestClassifier()
clf_sklearn1 = SklearnClassifier(rfc)
clf_sklearn1.train(training_data)
nltk.classify.accuracy(clf_sklearn1,testing_data)

0.81