In [61]:
from string import punctuation
from nltk.corpus import stopwords
from os import listdir
from pickle import dump
#load data
def load_document(file_name):
    """Read a text file and return its whole content as a single string.

    Args:
        file_name: Path of the file to read; it is opened in text mode ('r').

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(file_name,'r') as file:
        return file.read()
#preprocessing the data and creating tokens
def document_to_tokens(document):
    """Clean a raw document and return the surviving words as one string.

    The text is split on whitespace, punctuation characters are stripped
    from every word, and a word is kept only if it is purely alphabetic,
    is not an English stop word, and is longer than one character.

    Args:
        document: Raw text of one document.

    Returns:
        A single string with the kept words joined by one space
        (not a list, despite the function name).
    """
    # Translation table that deletes every character in string.punctuation.
    table=str.maketrans('','',punctuation)
    # A set makes each membership test constant-time instead of scanning
    # the whole stop-word collection once per token.
    # NOTE(review): the comparison is case-sensitive and the text is not
    # lower-cased here - confirm that capitalised stop words cannot occur.
    stop_words=set(stopwords.words('english'))
    stripped=(w.translate(table) for w in document.split())
    kept=[w for w in stripped
          if w.isalpha() and w not in stop_words and len(w)>1]
    return ' '.join(kept)
#load all the documents from the hard disk
def handle_documents(directory,is_train):
    """Load and preprocess every file in `directory` that belongs to one split.

    Files whose name starts with 'cv9' form the test split; every other
    file forms the training split.

    Args:
        directory: Folder whose entries are listed with listdir.
        is_train: Truthy to collect the training files, falsy to collect
            the 'cv9' test files.

    Returns:
        A list with one preprocessed token string per selected file, in
        the order listdir returned the names.
    """
    documents=[]
    for file_name in listdir(directory):
        held_out=file_name.startswith('cv9')
        # A file is wanted exactly when its held-out status differs from
        # the training flag (train -> not cv9, test -> cv9).
        if held_out != bool(is_train):
            raw_text=load_document(directory + '/' + file_name)
            documents.append(document_to_tokens(raw_text))
    return documents

def save_data(dataset,filename):
    """Pickle `dataset` into `filename` and print a confirmation line.

    Args:
        dataset: Any picklable object.
        filename: Destination path; it is opened in binary write mode.
    """
    # The context manager flushes and closes the file before the
    # confirmation is printed; the original left the handle open.
    with open(filename,'wb') as out_file:
        dump(dataset,out_file)
    print('%s is saved' % filename)



In [62]:
#root folder of the review corpus; edit this single line to relocate the data
DATA_DIR='C:\\Users\\yaici\\OneDrive\\Bureau\\CNN for Text Classification\\txt_sentoken'
NEG_DIR=DATA_DIR + '\\neg'
POS_DIR=DATA_DIR + '\\pos'

#load the training files (every file whose name does not start with 'cv9')
neg_docs_train=handle_documents(NEG_DIR,True)
pos_docs_train=handle_documents(POS_DIR,True)

#load the testing files (only the files whose name starts with 'cv9')
neg_docs_test=handle_documents(NEG_DIR,False)
pos_docs_test=handle_documents(POS_DIR,False)




In [63]:
#negative documents come first, so their labels (0) come first as well
trainx= neg_docs_train + pos_docs_train
#label counts are taken from the data instead of a hard-coded 900, so the
#labels always line up with the documents actually loaded
trainy=[0]*len(neg_docs_train) + [1]*len(pos_docs_train)

save_data([trainx,trainy],'train.pk1')

testx= neg_docs_test + pos_docs_test
testy=[0]*len(neg_docs_test) + [1]*len(pos_docs_test)

save_data([testx,testy],'test.pk1')



train.pk1 is saved
test.pk1 is saved


In [64]:
from pickle import load
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# define a function to load the data 
def load_data(file_name):
    """Unpickle and return the object stored in `file_name`.

    Args:
        file_name: Path of a pickle file; it is opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code - only load files
    # that this notebook (or another trusted source) produced.
    # The context manager closes the handle; the original never did.
    with open(file_name,'rb') as in_file:
        return load(in_file)

def build_tokenizer(words):
    """Create a Tokenizer and fit it on the given texts.

    Args:
        words: The texts handed unchanged to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted=Tokenizer()
    fitted.fit_on_texts(words)
    return fitted

#calculate the maximum size
def max_length(docs):
    """Return the largest whitespace-separated word count among `docs`.

    Args:
        docs: Iterable of strings.

    Returns:
        The word count of the longest document, or 0 when `docs` is
        empty (the original raised ValueError in that case).
    """
    # A generator avoids building a throw-away list; default=0 covers
    # the empty-collection case.
    return max((len(doc.split()) for doc in docs),default=0)

#encode the words
def encoding(Tokenizer,words,length):
    """Convert texts into fixed-length integer sequences.

    Args:
        Tokenizer: Object providing texts_to_sequences. Inside this
            function the name shadows the imported Tokenizer class; it is
            kept because it is part of the signature.
        words: The texts to encode.
        length: Value passed to pad_sequences as maxlen.

    Returns:
        The result of pad_sequences with padding='post'.
    """
    sequences=Tokenizer.texts_to_sequences(words)
    return pad_sequences(sequences,maxlen=length,padding='post')

In [65]:
#reload the pickled training split
trainx,trainy=load_data('train.pk1')

#fit the vocabulary on the training texts
tokenizer=build_tokenizer(trainx)

#maximum document size, measured in whitespace-separated words
length=max_length(trainx)

#one more than the number of indexed words
#(presumably index 0 is kept free for padding - confirm against the Keras docs)
vocab_size=1 + len(tokenizer.word_index)

print('maximum document size = %d'% length)
print('the number of vocabulary = %d'% vocab_size)

#integer-encode and pad every training document to `length`
train_data=encoding(tokenizer,trainx,length)
print(train_data.shape)

maximum document size = 1380
the number of vocabulary = 44277
(1800, 1380)


Build CNN Model

In [66]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.utils import plot_model
# define the model
def _build_channel(vocab_size,length,kernel_size):
    """Build one channel and return its (input tensor, flattened output)."""
    channel_input=Input(shape=(length,))
    embedded=Embedding(vocab_size,100)(channel_input)
    conv=Conv1D(filters=32,kernel_size=kernel_size,activation='relu')(embedded)
    dropped=Dropout(0.5)(conv)
    pooled=MaxPooling1D(pool_size=2)(dropped)
    return channel_input,Flatten()(pooled)


def build_model(vocab_size,length,kernel_sizes=(4,4,4)):
    """Build the multi-channel 1-D CNN for binary classification.

    Every channel is Input -> Embedding(vocab_size, 100) ->
    Conv1D(32 filters, relu) -> Dropout(0.5) -> MaxPooling1D(2) -> Flatten.
    The flattened channels are concatenated and passed through
    Dense(10, relu) and Dense(1, sigmoid).

    Args:
        vocab_size: Input dimension of each Embedding layer.
        length: Number of token ids in each input sequence.
        kernel_sizes: Conv1D kernel size per channel; one channel is built
            for each entry. The default reproduces the original three
            channels with kernel size 4.

    Returns:
        An uncompiled Model that takes one input per channel (in the order
        of `kernel_sizes`) and yields a single sigmoid output.

    Raises:
        ValueError: If `kernel_sizes` is empty.
    """
    # NOTE(review): with the default, all channels share the same kernel
    # size, so they differ only in their learned weights. Passing distinct
    # sizes (e.g. kernel_sizes=(4,6,8)) gives each channel its own n-gram
    # width - confirm which behaviour is intended.
    kernel_sizes=tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError('kernel_sizes must contain at least one kernel size')

    inputs=[]
    flats=[]
    for kernel_size in kernel_sizes:
        channel_input,channel_flat=_build_channel(vocab_size,length,kernel_size)
        inputs.append(channel_input)
        flats.append(channel_flat)

    #merging the channels (a single channel needs no merge)
    channels=concatenate(flats) if len(flats)>1 else flats[0]

    #pass the merged features through the classifier head
    dense=Dense(10,activation='relu')(channels)
    outputs=Dense(1,activation='sigmoid')(dense)
    return Model(inputs=inputs,outputs=outputs)

    

Configure, plot, and train the model

In [67]:
import keras
import pydot
import pydotplus
from pydotplus import graphviz
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.utils import plot_model
#build the three-input network and configure it for binary classification
model=build_model(vocab_size,length)
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

#write a diagram of the architecture to disk
plot_model(model,to_file='my_mode.jpg',show_shapes=True)
#print(model.summary())

#training the model: the same padded matrix is fed to every input channel
train_inputs=[train_data,train_data,train_data]
train_labels=array(trainy)
model.fit(train_inputs,train_labels,epochs=10,batch_size=16)
model.save('my_mode.h5')
 

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 168ms/step - accuracy: 0.5185 - loss: 0.7024
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 213ms/step - accuracy: 0.7359 - loss: 0.5761
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 205ms/step - accuracy: 0.9864 - loss: 0.0643
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 187ms/step - accuracy: 1.0000 - loss: 0.0055
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 196ms/step - accuracy: 1.0000 - loss: 0.0020
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 189ms/step - accuracy: 1.0000 - loss: 0.0011
Epoch 7/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 188ms/step - accuracy: 1.0000 - loss: 8.1571e-04
Epoch 8/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 186ms/step - accuracy: 1.0000 - loss: 6.1745e-04
Epoch 9/



Evaluate the model on the test set

In [68]:
from keras.models import load_model
#load the model

#reload the network from disk so this cell evaluates exactly what was saved
my_model=load_model('my_mode.h5')
#plain assignment: the original `==` only built and discarded a comparison,
#so testx/testy silently kept whatever an earlier cell had left in the kernel
testx,testy=load_data('test.pk1')
test_data=encoding(tokenizer,testx,length)

#evaluate the reloaded model (one copy of the test matrix per input channel)
loss,accuracy=my_model.evaluate([test_data,test_data,test_data],array(testy),verbose=0)
print('test accuracy %f' % (accuracy*100))




test accuracy 86.000001
