# DOMAIN: Digital content and entertainment industry

# PROJECT OBJECTIVE: 

**Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments**

In [85]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
import nltk

# Import and analyse the data set

In [86]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [87]:
print('Shape of X_train data set is:', X_train.shape, 'Shape of y_train data set is:', y_train.shape)
print('Shape of X_test data set is:', X_test.shape, 'Shape of y_test data set is:', y_test.shape)

Shape of X_train data set is: (25000,) Shape of y_train data set is: (25000,)
Shape of X_test data set is: (25000,) Shape of y_test data set is: (25000,)


# Perform relevant sequence adding on the data

In [88]:
review = np.concatenate((X_train, X_test), axis=0)
sentiment = np.concatenate((y_train, y_test), axis=0)

# Data Analysis

**Print shape of features and labels**

In [89]:
#Number of categories

np.unique(sentiment)

array([0, 1], dtype=int64)

In [90]:
print('Total number of reviews:', review.shape)
print('Total number of sentiments:', sentiment.shape)

Total number of reviews: (50000,)
Total number of sentiments: (50000,)


**Print value of any one feature and it's label**

In [91]:
#displaying the reviwe of 20th record 

print(review[20])

[1, 617, 11, 3875, 17, 2, 14, 966, 78, 20, 9, 38, 78, 15, 25, 413, 2, 5, 28, 8, 106, 12, 8, 4, 130, 43, 8, 67, 48, 12, 100, 79, 101, 433, 5, 12, 127, 4, 769, 9, 38, 727, 12, 186, 398, 34, 6, 312, 396, 2, 707, 4, 732, 26, 1235, 21, 2, 128, 74, 4, 2, 5, 4, 116, 9, 1639, 10, 10, 4, 2, 2, 186, 8, 28, 77, 2586, 39, 4, 4135, 2, 7, 2, 2, 50, 161, 306, 8, 30, 6, 686, 204, 326, 11, 4, 226, 20, 10, 10, 13, 258, 14, 20, 8, 30, 38, 78, 15, 13, 1498, 91, 7, 4, 96, 143, 10, 10, 9859, 9064, 144, 3261, 27, 419, 11, 902, 29, 540, 887, 4, 278]


In [92]:
#displaying the sentiment of 20th record 

print(sentiment[20])

0


In [93]:
#displaying the reviwe of 1050th record which has a sentiment of 1

print(review[1050])

[1, 13, 447, 14, 20, 12, 9, 1281, 8, 79, 6, 3132, 7, 1208, 2, 2571, 5, 14, 20, 9462, 4000, 139, 5, 1127, 5896, 5, 2, 122, 12, 13, 69, 57, 326, 474, 30, 38, 3889, 34, 12, 51, 35, 480, 168, 33, 89, 1536, 9518, 235, 12, 16, 1211, 8, 106, 179, 2035, 75, 32, 391, 4, 997, 5, 4, 7529, 150, 552, 7, 453, 21, 14, 9, 38, 38, 275, 51, 571, 54, 36, 216, 145, 5, 353, 8, 412, 6, 113, 36, 191, 12, 93, 72, 55, 1887, 7, 6, 1058, 604, 7, 349, 15, 26, 2, 187, 416, 11, 938, 24, 502, 8, 2198, 191, 1666, 191, 28, 119, 5625, 191, 855, 19, 1280, 926, 36, 235, 484, 972, 14, 9, 6, 666, 1521, 5, 31, 15, 218, 7470, 195, 1243, 2002, 1194, 263, 2169, 44, 2571, 9518, 75, 40, 98, 150, 21, 38, 51, 12, 152, 306, 8, 28, 93, 101, 1474, 8, 98, 45, 99, 522, 38, 12, 16, 6, 87, 22, 21, 13, 3785, 6, 176, 13, 28, 57, 85, 8044]


In [94]:
#displaying the sentiment of 1050th record which has a sentiment of 1

print(sentiment[1050])

1


# Decode the feature value to get original sentence

In [95]:
review_index = imdb.get_word_index()
review_words = {value:key for key, value in review_index.items()}

In [96]:
" ".join([review_words.get(i - 3, "$") for i in review[20]])

"$ shown in australia as $ this incredibly bad movie is so bad that you become $ and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school $ class the sets are pathetic but $ better than the $ and the acting is wooden br br the $ $ seems to have been stolen from the props $ of $ $ there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money"

**Vectorizing the data**

Review with less than 10000 will be filled 0 so that everything will be equal

In [97]:
def vectorizing(sequences, dimension = 10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

In [98]:
review = vectorizing(review)
sentiment = np.array(sentiment).astype("float32")

**Train and Test Data**

In [99]:
review_test = review[:10000]
sentiment_test = sentiment[:10000]

In [100]:
review_train = review[10000:]
sentiment_train = sentiment[10000:]

**Model Building**

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout

Model :- **Logistic Regression**

In [102]:
log_reg = LogisticRegression(solver='lbfgs', penalty='l2',max_iter=500,C=1,random_state=42)

In [103]:
log_reg.fit(review_train, sentiment_train)

LogisticRegression(C=1, max_iter=500, random_state=42)

In [104]:
predict_log_reg = log_reg.predict(review_test)

Model :- **Naive Bayes**

In [105]:
NB = MultinomialNB()
NB.fit(review_train, sentiment_train)

MultinomialNB()

In [106]:
predict_NB = NB.predict(review_test)

Model :- **Random Forest**

In [107]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5, n_estimators=100, oob_score=True)
rf.fit(review_train, sentiment_train)

RandomForestClassifier(max_depth=5, n_jobs=-1, oob_score=True, random_state=42)

In [108]:
predict_rf = rf.predict(review_test)

Model :- **Neural Network**

In [109]:
model = models.Sequential()

# Input - Layer
model.add(layers.Dense(50, activation = "relu", input_shape=(10000, )))

# Hidden - Layers
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))

model.add(layers.Dropout(0.1, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))

# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))

#Compile Model
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [110]:
NN_Model = model.fit(review_train, sentiment_train, epochs= 10, batch_size = 500, validation_data = (review_test, sentiment_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [111]:
from sklearn.metrics import accuracy_score, classification_report

In [112]:
print('Accuracy score for Logistic Regression - ', accuracy_score(sentiment_test, predict_log_reg))
print('Accuracy score for Naive Bayes - ', accuracy_score(sentiment_test, predict_NB))
print('Accuracy score for Random Forest - ', accuracy_score(sentiment_test, predict_rf))
print('Accuracy score for Neural Network - ', np.mean(NN_Model.history['val_accuracy']))

Accuracy score for Logistic Regression -  0.8716
Accuracy score for Naive Bayes -  0.8496
Accuracy score for Random Forest -  0.8192
Accuracy score for Neural Network -  0.8858500003814698


In [113]:
print('          Classification Report for Logistic Regression \n')
print(classification_report(sentiment_test, predict_log_reg, target_names=['Positive','Negative']))

          Classification Report for Logistic Regression 

              precision    recall  f1-score   support

    Positive       0.87      0.87      0.87      4947
    Negative       0.88      0.87      0.87      5053

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [114]:
print('          Classification Report for Naive Bayes \n')
print(classification_report(sentiment_test, predict_NB, target_names=['Positive','Negative']))

          Classification Report for Naive Bayes 

              precision    recall  f1-score   support

    Positive       0.84      0.86      0.85      4947
    Negative       0.86      0.84      0.85      5053

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [115]:
print('          Classification Report for Random Forest \n')
print(classification_report(sentiment_test, predict_rf, target_names=['Positive','Negative']))

          Classification Report for Random Forest 

              precision    recall  f1-score   support

    Positive       0.85      0.78      0.81      4947
    Negative       0.80      0.86      0.83      5053

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



In [116]:
predict_model = model.predict_classes(review_test)

In [117]:
predict_model = predict_model[:, 0]

In [119]:
print('          Classification Report for Neural Network \n')
print(classification_report(sentiment_test, predict_model, target_names=['Positive','Negative']))

          Classification Report for Neural Network 

              precision    recall  f1-score   support

    Positive       0.87      0.88      0.88      4947
    Negative       0.88      0.87      0.88      5053

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



**Inference**

- On the Accuracy perspective **Neural Network** have bee in the top with **88%** while **Random Forest** with least of **81%**
- On the preceision and recall ** Logistic Regression** and **Neural Network** both are at higher side
- On the F1-Score **Neural Network** tops the list

**Validating the Prediction with actual value**

In [121]:
print(predict_model[5])

0


In [122]:
print(sentiment_test[5])

0.0


In [123]:
print(predict_model[10])

1


In [124]:
print(sentiment_test[10])

1.0


# Conclusion

- IMDB data set was loaded with pre defined train and test data frame
- Data Analysis was performed after combining the test and train data set
- Orginal sentence was displayed 
- 4 different models **Logistic Regression**, **Naive Bayes**, **Random Forest** and **Neural Network** model were built
- Accuracy and classification report were created to finalise the best model
- **Neural Network** is the best model with highest accuracy in the test data
- Finally validated the **predicted sentiment** with **actual sentiment**