In [73]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [44]:
import nltk

In [84]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from bs4 import BeautifulSoup

In [88]:
import warnings
warnings.filterwarnings('ignore')

In [46]:
lemmatizer_ = WordNetLemmatizer()
stop_words = set(w.rstrip() for w in open("stopwords.txt"))

Use BeautifulSoup to read XML files

In [47]:
positive_reviews = BeautifulSoup(open('/content/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

In [48]:
negative_reviews = BeautifulSoup(open('/content/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

To make sure that the data is equal for both the classes, get only the `len(negative_reviews)` number of samples from positive_reviews.

In [50]:
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [51]:
word_index_map = {}
idx = 0

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Defining a tokenizer using nltk tokenizers and lemmatizer.

Lemmatizer ⟶ Generates root word for a given word.

Lemmatization helps in decrease of vocab size, by replacing the words with same meaning by its root word

In [54]:
def tokenizer_with_nltk(s):
  s = s.lower()
  tokens = nltk.tokenize.word_tokenize(s)
  tokens = [tok for tok in tokens if len(tok)>2] # assuming anyword of 2 or less no. of letters is not useful or meaningful
  tokens = [lemmatizer_.lemmatize(t) for t in tokens]
  tokens = [t for t in tokens if t not in stop_words]
  return tokens

In [55]:
positive_tokenized = []
negative_tokenized = []

Get tokens and add to dictionary for both positive and negative reviews.

In [56]:
for review in positive_reviews:
  tokens = tokenizer_with_nltk(review.text)
  positive_tokenized.append(tokens)
  for token in tokens:
    if token not in word_index_map:
      word_index_map[token] = idx
      idx += 1

In [57]:
for review in negative_reviews:
  tokens = tokenizer_with_nltk(review.text)
  negative_tokenized.append(tokens)
  for token in tokens:
    if token not in word_index_map:
      word_index_map[token] = idx
      idx += 1

In [105]:
word_index_map['life']

99

We now need X as a vector of counts of each word.

For eg: columns are "yes", "good", "no", "bad"

`x = [2, 1, 6, 8]`

here x says, that in the text of that particular input has `yes` 2 times, `bad` 8 times and etc


Defining a function that return data vector for a given `input.text` and `label`.

In [67]:
def tokens_to_vector(tokens, label):
  x = np.zeros(len(word_index_map) + 1)
  for t in tokens:
    i = word_index_map[t]
    x[i] += 1 # increment the count of that word i.e index
  x = x/x.sum()
  x[-1] = label
  return x

In [68]:
X_positive = []
for each_input in positive_tokenized:
  data_input = tokens_to_vector(each_input, 1)
  X_positive.append(data_input)

In [69]:
X_negative = []
for each_input in negative_tokenized:
  data_input = tokens_to_vector(each_input, 0)
  X_negative.append(data_input)

In [70]:
df1 = pd.DataFrame(X_positive)
df2 = pd.DataFrame(X_negative)
df = pd.concat([df1, df2], axis=0)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:,-1:], shuffle = True, random_state = 42, train_size = 0.80)

In [91]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Classification rate: \n")
print("Train set: ", model.score(X_train, y_train))
print("Test set:  ", model.score(X_test, y_test))

Classification rate: 

Train set:  0.72
Test set:   0.6875


In [106]:
print(f"The Confusion Matrix for training dataset: \n {confusion_matrix(y_train, y_train_pred)}")

The Confusion Matrix for training dataset: 
 [[545 254]
 [194 607]]


In [107]:
print(f"The Confusion Matrix for training dataset: \n {confusion_matrix(y_test, y_test_pred)}")

The Confusion Matrix for training dataset: 
 [[131  70]
 [ 55 144]]


In [103]:
threshold = 0.5

print("Positive weights --> imply words in Positive reviews\n")

for word, index in word_index_map.items():
  weight = model.coef_[0][index]
  if weight > threshold:
    print(f"The weight of : '{word}' is {weight}")

print("\n\nNegative weights --> imply words in Negative reviews\n")
for word, index in word_index_map.items():
  weight = model.coef_[0][index]
  if weight < -1*threshold:
    print(f"The weight of : '{word}' is {weight}")

Positive weights --> imply words in Positive reviews

The weight of : 'this' is 0.7648052659411899
The weight of : 'ha' is 0.7760447890839357
The weight of : 'favorite' is 0.6069746678603795
The weight of : 'you' is 1.221296441746252
The weight of : 'life' is 0.5436339442008887
The weight of : 'recommend' is 0.7186541003964967
The weight of : 'read' is 0.892298340427654
The weight of : 'excellent' is 0.6632685280634041
The weight of : 'loved' is 0.5546260159886892
The weight of : 'highly' is 0.6214337162325914
The weight of : 'love' is 0.9582532214621934
The weight of : 'easy' is 0.792185574369176
The weight of : 'wonderful' is 0.5203425742469481


Negative weights --> imply words in Negative reviews

The weight of : 'wa' is -2.283723355450725
The weight of : 'n't' is -2.2655006322464653
The weight of : 'then' is -0.5593789508176457
The weight of : 'author' is -0.5332793738075947
The weight of : 'waste' is -0.6153200072919113
The weight of : 'instead' is -0.6148580992280883
The weight 