In [1]:
## Text Classification using NLP

In [2]:
# Importing libraries
import os
import pathlib
import numpy as numpy
import nltk
import pickle
import re
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Chaitanya
[nltk_data]     V\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Importing dataset (Original Source of Dataset is "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz")
reviews = load_files("txt_sentoken/") # load files function should be used only for small datasets.
x,y = reviews.data,reviews.target # Since neg folder is first in text_sentoken, it's class is 0 and for pos its class is 1.

In [4]:
# Storing as pickle files
with open('x.pickle','wb') as f:
    pickle.dump(x,f)

with open('y.pickle','wb') as f:
    pickle.dump(y,f)

In [5]:
# Unpickling the dataset
with open('x.pickle','rb') as f:
    x = pickle.load(f)

with open('y.pickle','rb') as f:
    y = pickle.load(f)
    
# print(x[1:2],y[1:2])

In [6]:
# Creating the corpus by pre-processing the dataset
corpus = []

for i in range(len(x)):
    review = re.sub(r'\\n','',str(x[i])) #Replacing all \n (new-line tags which are created during unpickling) with a single space
    review = re.sub(r'\W',' ',review) # Replacing special characters with a single space.
    review = review.lower() # Lowercasing if any uppercase letters
    review = re.sub(r'\s+[a-z]\s+',' ',review) # Replacing single alphabets surrounded by single space on both sides with a single space
    review = re.sub(r'^b\s+',' ',review) # Replacing single alphabets following a single space at start of the sentence with a single space
    review = re.sub(r'\s+b$',' ',review) # Replacing single alphabets followed a single space at end of the sentence with a single space
    review = re.sub(r'\s+',' ',review) # Replacing mutliple consecutive spaces with a single space
    corpus.append(review)

# print(corpus[1:2])

In [7]:
# Binary Bag of Words Model

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words=stopwords.words("english")) 
# max_features=2000 means we are selecting top 2000 words as our desired features
# min_df = 3 (implies 3 sentences out of all sentences in corpuse) implies if a word is in less than 3 sentences of our dataset it will be excluded immediately
# max_df = 0.6(implies 60% of whole corpus) implies that if any word is seen more than 60% of time in the whole corpus it will be excluded as it might be a common word
# stop_words = stopwords.words("english") implies removal of english based stopwords from the whole corpus
X = vectorizer.fit_transform(corpus).toarray()

In [8]:
# TF-IDF Model using already existing Binary Bag of Words Model

from sklearn.feature_extraction.text import TfidfTransformer
transformer =  TfidfTransformer()
X = transformer.fit_transform(X).toarray()

In [9]:
# Splitting whole corpus into two parts
# Train dataset and Test dataset
# Which means we will train our model on train dataset first.
# And then validate its performance, accuracy using test dataset.

from sklearn.model_selection import train_test_split
data_train,data_test,label_train,label_test=train_test_split(X,y,test_size=0.2,random_state=0)
# X = Our processed,organized data.
# Y = Labels
# test_size=0.2 means test dataset is 20% of total corpus => train dataset is 80% of total corpus
# random_state=0 implies we are not choosing randomly. We can change it to 1 to make multiple test datasets and train datasets depending on your understanding.
# Training data is stored in data_train
# Testing data is stored in data_test
# Training Labels are stored in label_train
# Testing Labels are stored in label_test

In [10]:
# Using Logistic Regression Algorithm (Binary Classification) to fit our model
# Logistic Regression is being used to find optimal weights for each document in corpus to normalize label values betwwen 0 or 1.
# Equation of Logistic Regression is:
# 
# ln(y/y-1) = a+bx₁+cx₂+……..+dx₂₀₀₀
# 

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(data_train,label_train) # Inputting train data and labels to Logistic Regression Classifier
# Trained our classifier on train dataset



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Now, evaluating our classifier accuracy on test dataset
label_pred = classifier.predict(data_test)

from sklearn.metrics import confusion_matrix
cm =  confusion_matrix(label_test,label_pred)
# Checking between actual labels and predicted labels on test dataset
# This gives us a confusion matrix as follows:

#-- -------------------------------------
# ░ |        0        |        1        |
#----------------------------------------
# 0 | True Negatives  | False Negatives |
#---|------------------------------------
# 1 | False Positives | True Positives  |
#----------------------------------------

# Where True Negatives means both actual and model predicted labels as negative (in this case negative = 0) 
# Where True Positives means both actual and model predicted labels as positive (in this case positive = 1)
# Above both are correct predictions and we need more of them.

# Below both are incorrect predictions and we need only few or almost none of them.
# Where False Negatives means actual label are positives, but our model predicted them as negatives.
# Where False Positives means actual label are negatives, but our model predicted them as positives.

In [12]:
# So, let's see how our model fared.
print(cm)

# It shows the result matrix as

#----------------------------------------------------
# ░ |           0           |           1           |
#----------------------------------------------------
# 0 | 169 (True Negatives)  |  39 (False Negatives) |
#---|------------------------------------------------
# 1 |  22 (False Positives) | 170 (True Positives)  |
#----------------------------------------------------

accuracy = (cm[0][0] + cm[1][1])/4
# So, out of 400 labelled data, our model predicted labels for 169+170=339 data correctly.
# Which means our accuracy is 339/400 = 0.8475
# Our Prediction Accuracy is 84.75% as of now using non-randomness of 20% test, 80% train data with logistic regression model.
# It can be made better by tweaking the parameters of CountVectorizer, Tfidftransformer. 
# But, still as a starter this can be called good accuracy

[[169  39]
 [ 22 170]]
