In [1]:
import string
import re
import os
import numpy as np
from os import listdir
from numpy import array
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Flatten,Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

2022-12-23 00:48:21.910818: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-23 00:48:22.086920: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-23 00:48:22.092473: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-23 00:48:22.092492: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:
# Load doc into memory
def load_doc(filename):
    #Open the file as read only
    file=open(filename,'r')
    #read all text
    text=file.read()
    #close the file
    file.close()
    return text

In [3]:
# turn a doc into clean tokens
def clean_doc(doc,vocab):
    # split into tokens by whitspace
    tokens = doc.split()
    # prepare regex for char filtering 
    re_punc = re.compile('[%s]'% re.escape(string.punctuation))
    # use the regex to remove the punctuation from the list 
    tokens = [re_punc.sub('',w) for w in tokens]
    # only select the word that is alphabets
    tokens = [word for word in tokens if word in vocab]
    tokens=' '.join(tokens)
    return tokens

In [4]:
#Load all docs in a dictionary
def process_docs(directory,vocab,is_train):
    documents=list()
    
    #Walk through all files in the folder
    for filename in os.listdir(directory):
        
        #skip any review in t he test set
        if is_train and filename.startswith('cv9'):
                continue
        if not is_train and not filename.startswith('cv9'):
            continue
        
        #Create the full path of the file to open 
        path =directory + '/' +filename
        
        #load the doc
        doc=load_doc(path)
        
        #clean doc
        tokens=clean_doc(doc,vocab)
        
        #Add to list
        documents.append(tokens)
        
    return documents
        

In [5]:
# Function to load and clean dataset
def load_clean_dataset(vocab,is_train):
    #load documents
    neg=process_docs('/home/dai/Desktop/NLP/lect 6/review_polarity/review_polarity/txt_sentoken/neg',vocab,is_train)
    pos=process_docs('/home/dai/Desktop/NLP/lect 6/review_polarity/review_polarity/txt_sentoken/pos',vocab,is_train)
    
    docs=neg+pos
    
    #prepare labels
    labels=[0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    
    return docs,labels

In [6]:
# Function to fit a tokenizer
#The tokenizer is object

def create_tokenizer(lines):
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [7]:
#Integer encode and pad documents
def encode_docs(tokenizer,max_length,docs):
    #integer encode
    encoded=tokenizer.texts_to_sequences(docs)
    #pad  sequences
    padded=pad_sequences(encoded,maxlen=max_length,padding='post')
    return padded

In [8]:
#Define the model
def define_model(vocab_size,max_length):
    model=Sequential()
    model.add(Embedding(vocab_size,100,input_length=max_length))
    model.add(Conv1D(filters=32,kernel_size=8,activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    
    #Compile network
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    #Summarize defined model
    model.summary()
    plot_model(model,to_file='model.png',show_shapes=True)
    return model

In [9]:
#Load the Vocabulary
vocab=load_doc('/home/dai/Desktop/NLP/vocab.txt')
vocab=set(vocab.split())

In [10]:
vocab

{'decapitation',
 'canceled',
 'congratulate',
 'franchises',
 'alterego',
 'kidnaped',
 'revels',
 'highflying',
 'darkened',
 'stretched',
 'hunted',
 'cutouts',
 'therapists',
 'gaz',
 'helpful',
 'wronged',
 'pointlessness',
 'fanny',
 'gear',
 'blended',
 'embarks',
 'emperor',
 'cockroaches',
 'stretches',
 'eisner',
 'ethic',
 'xfiles',
 'excelled',
 'influential',
 'announce',
 'aloises',
 'ending',
 'maneating',
 'impregnate',
 'antiterrorist',
 'marketing',
 'propelling',
 'redeeming',
 'sutherland',
 'raunchy',
 'jonnie',
 'excites',
 'democrats',
 'harbor',
 'cleans',
 'christmas',
 'inflicted',
 'conversely',
 'calming',
 'lonergan',
 'improv',
 'encompasses',
 'bowling',
 'charming',
 'sideshow',
 'tysons',
 'tinier',
 'diesel',
 'contrived',
 'johnnys',
 'spins',
 'solid',
 'katanga',
 'jeep',
 'eliciting',
 'deathdefying',
 'endangered',
 'yuri',
 'oppose',
 'wellrounded',
 'fraud',
 'viking',
 'jawdroppingly',
 'doorstep',
 'traumatised',
 'donethat',
 'celluloid',
 'e

In [11]:
#LOad all review
train_docs,ytrain=load_clean_dataset(vocab,True)
test_docs,ytest=load_clean_dataset(vocab,False)

In [12]:
train_docs

['edward burns tackles third picture looking back like previous two workingclass relationship picture however unlike previous work film dwells personal story female protagonist looking back stumbles making slow boring film without spark enlivened previous work claudia lauren holly small town waitress feeling stifled life shes turning point life feels shes going nowhere boyfriend michael jon bon jovi broke dead end job marry shed never get chance escape town enter charlie edward burns claudias old flame skipped town several years without explanations even claudia come back town see suddenly torn stay stable michael never escape hometown ignore instincts fall charlie part answer lies character mother blythe danner fell wrong man spent life pining claudias father return seems claudia make mistakes little past ninety minutes looking back rather short genre unfortunately seems much much longer storyline simple uninspired theres lack energy whole proceedings makes entire drama rather tedious

In [13]:
#Create the tokenizer
#This is Object
tokenizer=create_tokenizer(train_docs)

In [14]:
#Define vocabulary size
vocab_size=len(tokenizer.word_index)+1
print(f"vocabulary size:{vocab_size}")

vocabulary size:25768


In [15]:
#Calculate mazimum seqence length
max_length=max([len(s.split()) for s in train_docs])
print(f"Maximum Length = {max_length}")

Maximum Length = 1317


In [16]:
#Enoded data
#(i.e if word present 1 else 0)

Xtrain=encode_docs(tokenizer,max_length,train_docs)

Xtest=encode_docs(tokenizer,max_length,test_docs)


In [17]:
Xtrain.shape,len(ytrain)

((1800, 1317), 1800)

In [18]:
#Define the network
model=define_model(vocab_size,max_length)

2022-12-23 00:48:28.564087: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-23 00:48:28.564124: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-23 00:48:28.564151: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dai-Precision-7820-Tower): /proc/driver/nvidia/version does not exist
2022-12-23 00:48:28.564525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1317, 100)         2576800   
                                                                 
 conv1d (Conv1D)             (None, 1310, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 655, 32)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 20960)             0         
                                                                 
 dense (Dense)               (None, 10)                209610    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

In [19]:
model.fit(Xtrain,np.array(ytrain),epochs=10,batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f950059e850>

In [20]:
#Evaluate on train set
model.evaluate(Xtrain,np.array(ytrain))



[0.0001193885036627762, 1.0]

In [21]:
model.evaluate(Xtest,np.array(ytest))



[0.5057023763656616, 0.875]

### <b>classify the review as negative or positive<b>

In [22]:
def predict_sentiment(review):  # vocab , tokenizer , model
    # clean review 
    line = clean_doc(review , vocab)
    
    #Encode and padd review
    padded = encode_docs(tokenizer,max_length,[line])
    
    #predict sentiments
    yhat = model.predict(padded, verbose = 0)
    
    #retrieve predicted precentage  and label
    precent_pos = yhat[0,0]
    
    if round(precent_pos) == 0:
        
        return (1 - precent_pos), 'NEGATIVE'
    
    return precent_pos, 'POSITIVE'


### <b>Positive Review<b>

In [23]:
text = 'Everyone will enjoy this film. I love it ,recommended!'

In [24]:
precent , sentiment = predict_sentiment(text)

print(f"Review : {text}\nSentiment : {sentiment} ({round(precent * 100 , 2)}%)")

Review : Everyone will enjoy this film. I love it ,recommended!
Sentiment : POSITIVE (52.62%)


### <b>Negative Review<b>

In [25]:
text1 = 'This is a bad movie. Do not watch it. It sucks.'

In [26]:
precent , sentiment = predict_sentiment(text1)

print(f"Review : {text1}\nSentiment : {sentiment} ({round(precent * 100 , 2)}%)")

Review : This is a bad movie. Do not watch it. It sucks.
Sentiment : NEGATIVE (60.61%)
