## Importing Movie Reviews dataset

In [1]:
from nltk.corpus import movie_reviews

## Analysing the dataset

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
## 2000 reviews out of which 1000 are positive and 1000 negative
len(movie_reviews.fileids())

2000

In [75]:
# movie_reviews.fileids()

In [76]:
# movie_reviews.fileids("pos")

In [77]:
# movie_reviews.fileids("neg")

In [7]:
movie_reviews.words(movie_reviews.fileids("neg")[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

## Creating Document

In [8]:
## Creating a documents list in which each item has all the tokenised words from a text document and their corresponding categories
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids():
        documents.append((movie_reviews.words(fileid), category))
        
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

## Cleaning Data 

In [9]:
from nltk.corpus import wordnet

## Function to change the pos_tag to simpler values which can be passed to lemmatize
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
from nltk import pos_tag

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [11]:
wordnet.ADJ

'a'

In [12]:
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [78]:
## Creating stopwords
from nltk.corpus import stopwords
import string

stop = set(stopwords.words("english"))
punctuation = list(string.punctuation)
stop.update(punctuation)

In [83]:
## Function to clean the words by removing stopwords, taking care of cases and lemmatizing the word to give root words only

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [15]:
documents = [(clean_review(words), category) for words, category in documents]

## Shuffling and Splitting Training and Testing Documents

In [16]:
## Shuffling the document since earlier first 1000 where negative and next 1000 where of positive category.
import random
random.shuffle(documents)

In [17]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

## Building Feature Set 

NLTK classifier requires data in format:
[ ( { f1: #val, f2: #val }, category ), 
  (...), (...), ...]
  
A list of tuples for each document where each tuple contains a dictionary of features with corresponding values and the category of the document.

In [18]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [19]:
import nltk 

In [79]:
## Frequency distribution of each words
freq = nltk.FreqDist(all_words)
# freq

In [21]:
freq.most_common(15)

[('film', 8239),
 ('movie', 5311),
 ('one', 4518),
 ('make', 3251),
 ('character', 2949),
 ('like', 2832),
 ('get', 2719),
 ('go', 2312),
 ('see', 2253),
 ('time', 2205),
 ('well', 2158),
 ('scene', 2013),
 ('even', 1859),
 ('good', 1830),
 ('story', 1771)]

In [22]:
## Top 3000 common words
common = freq.most_common(3000)

In [80]:
## Features list
features = [i[0] for i in common]
# features

In [24]:
## Function to create the dictionary for each document with boolean values for each features
def get_features_dict(words):
    ## Creating a feaure dictionary for each document 
    current_features = {}
    words_set = set(words)
    
    for w in features:
        current_features[w] = w in words_set
    
    return current_features 

In [81]:
# get_features_dict(training_documents[0][0])

In [26]:
training_data = [(get_features_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_features_dict(doc), category) for doc, category in testing_documents]

## Classification using NLTK Naive Bayes

In [28]:
from nltk import NaiveBayesClassifier

In [29]:
classifier = NaiveBayesClassifier.train(training_data)

In [30]:
nltk.classify.accuracy(classifier, testing_data)

0.332

In [31]:
classifier.show_most_informative_features(15)

Most Informative Features
                 quentin = True              neg : pos    =      3.7 : 1.0
                    weir = True              neg : pos    =      3.6 : 1.0
                   reese = True              pos : neg    =      3.5 : 1.0
                 branagh = True              pos : neg    =      3.2 : 1.0
                    dora = True              neg : pos    =      3.1 : 1.0
                    echo = True              neg : pos    =      3.1 : 1.0
                 gorilla = True              pos : neg    =      2.9 : 1.0
                  ripley = True              pos : neg    =      2.9 : 1.0
                    todd = True              neg : pos    =      2.7 : 1.0
                  tarzan = True              pos : neg    =      2.7 : 1.0
                 amistad = True              pos : neg    =      2.5 : 1.0
               tarantino = True              neg : pos    =      2.5 : 1.0
                  ordell = True              neg : pos    =      2.4 : 1.0

## Using Sklearn classifier with NLTK

Sklearn and NLTK both requires data to be in different formats: 
* NLTK classifiers require data in form of array of tuples, where each tuple has dictionary of features and category. 
* Sklearn classifiers require data in X Y format, X being a 2D array and Y being output. 

Thus,  NLTK provides a dummy classifier which enables us to use any sklearn classifier with data formed in nltk required format.

* SVC with NLTK

In [33]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier  

In [34]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [35]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [40]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.312

* Random Forest with NLTK

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [39]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [41]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.312

## Count Vectorizer

It keeps data in Sklearn required format. 

* **Tokenisation**: It convert a collection of text documents to a matrix of token counts.
* **Feature Extraction** : It picks up the best features.
* **Sparse Matrix Creation**: Converts each document into the frequency array, based upon the feautures we have chosen.

Important Parameters:
1. **max_features** - build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
2. **ngram_range** : tuple (min_n, max_n) The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
3. **analyzer** : string, {‘word’, ‘char’, ‘char_wb’} or callable. Whether the feature should be made of word or character n-grams. 
4. **max_df** : float in range [0.0, 1.0] or int, default=1.0.
    When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold 

5. **min_df** : float in range [0.0, 1.0] or int, default=1.
    When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


Note: 
1. There is an option in count vectorizer(stop_words) which takes list of stop words and can do the work for us.
2. It creates data with frequency counts earlier using nltk we were creating only boolean values for each features.
3. Number of N-grams for a sentence with X words will be (X - N + 1) 
4. Use of N-Grams in your feature space may not necessarily yield any significant improvement.

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
## dummy training data set 
training_set = {"the sky is blue sky", "the sun is bright"}

## create a count vectorizer object with max features to be 3
count_vec = CountVectorizer(max_features=3)

## On fit_transform on training data, Count Vectorizer actually picks the most frequent max_features(in this case 3) 
## number of words and then convert each training row into count/frequency for these features 
a = count_vec.fit_transform(training_set)

## It returns a sparse matrix i.e. where most of matrix contains 0
a

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [50]:
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [51]:
## Displays the features picked by the countVectorizer
count_vec.get_feature_names()

['is', 'sky', 'the']

In [53]:
a = ["he", "is"]
" ".join(a)

'he is'

In [55]:
## Converting our document into x and y
categories = [category for document,category in documents]
text_document = [" ".join(document) for document,category in documents]

In [58]:
## Train-Test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(text_document, categories, random_state = 0)

In [71]:
count_vec = CountVectorizer(max_features=2000, ngram_range=(1,2))
## Fit transform the training data
x_train_features = count_vec.fit_transform(x_train)

## Only transform the testing data according to the features which was fit using x_train
x_test_features = count_vec.transform(x_test)

In [82]:
# count_vec.get_feature_names()

In [67]:
## Applying sklearn classifiers
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [68]:
svc.score(x_test_features, y_test)

0.32700000000000001

## TF-IDF

** Term Frequency** : Number of times term t occurs in document d

$$ TF(t) = \frac {\text{ Number of times term t appears in a document}} {\text {Total number of terms in the document}} $$
** Inverse Document Frequency**:

$$ DF(t) = \frac {\text{Number   of documents with term t in it}} {\text{Total Number of Documents}} $$

$$IDF(t) = {log_e( \frac{\text {Total number of documents}} {\text{Number of documents with term 't' in it}})}$$


Note:
1. Higher value of IDF for term t indicates term t is rare in all document collection. 
2. Sklearn has TF-IDF vectorizer in **sklearn.feature_extraction.text.TfidfVectorizer**.
3. We use product (TF * IDF) as data rather than the frequency counts.

