In [None]:
from string import punctuation
from os import listdir
from pickle import dump
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords


# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'rb')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

def clean_doc(raw_review):
    review_text = BeautifulSoup(raw_review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    return( " ".join( meaningful_words ))

# save a dataset to file
def save_dataset(dataset, filename):
	dump(dataset, open(filename, 'wb'))
	print('Saved: %s' % filename)

file_train = pd.read_csv('train.tsv', header=0, delimiter="\t", quoting=3)
doc = list()
for i in file_train["Sentence"]:
    doc.append(clean_doc(i))
xtrain = doc 
y = file_train["Sentiment"]
ytrain = []
NumberofSize = file_train["Sentiment"].size
for i in list(range(0,NumberofSize)):
        if file_train["Sentiment"][i] == 'negative' :
            ytrain.append(0)
        if file_train["Sentiment"][i] == 'neutral' :
            ytrain.append(1)
        if file_train["Sentiment"][i] == 'positive' :
            ytrain.append(2)
ytrain = np.array(ytrain)
from keras.utils.np_utils import to_categorical
ytrain = to_categorical(ytrain,3)
save_dataset([xtrain,ytrain], 'train.pkl')
#print(xtrain[0])



file_test = pd.read_csv('test.tsv', header=0, delimiter="\t", quoting=3)
doc = list()
for i in file_test["Sentence"]:
    doc.append(clean_doc(i))
xtest = doc 
for i in range(0,file_testr["Sentiment"].size):
    ytest.append(0)
ytest = np.array(ytest)
save_dataset([xtest,ytest], 'test.pkl')


In [8]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

# load a clean dataset
def load_dataset(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# calculate the maximum document length
def max_length(lines):
	return max([len(s.split()) for s in lines])

# encode a list of lines
def encode_text(tokenizer, lines, length):
	# integer encode
	encoded = tokenizer.texts_to_sequences(lines)
	# pad encoded sequences
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

# define the model
def define_model(length, vocab_size):
	# channel 1
	inputs1 = Input(shape=(length,))
	embedding1 = Embedding(vocab_size, 100)(inputs1)
	conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
	drop1 = Dropout(0.5)(conv1)
	pool1 = MaxPooling1D(pool_size=2)(drop1)
	flat1 = Flatten()(pool1)
	# channel 2
	inputs2 = Input(shape=(length,))
	embedding2 = Embedding(vocab_size, 100)(inputs2)
	conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
	drop2 = Dropout(0.5)(conv2)
	pool2 = MaxPooling1D(pool_size=2)(drop2)
	flat2 = Flatten()(pool2)
	# channel 3
	inputs3 = Input(shape=(length,))
	embedding3 = Embedding(vocab_size, 100)(inputs3)
	conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
	drop3 = Dropout(0.5)(conv3)
	pool3 = MaxPooling1D(pool_size=2)(drop3)
	flat3 = Flatten()(pool3)
	# merge
	merged = concatenate([flat1, flat2, flat3])
	# interpretation
	dense1 = Dense(10, activation='relu')(merged)
	outputs = Dense(3, activation='sigmoid')(dense1)
	model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
	# compile
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	# summarize
	print(model.summary())
	plot_model(model, show_shapes=True, to_file='multichannel.png')
	return model

# load training dataset
trainLines, trainLabels = load_dataset('train.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

# define model
model = define_model(length, vocab_size)
# fit model
model.fit([trainX,trainX,trainX],array(trainLabels), validation_split = 0.2 ,epochs=10, batch_size=16)
# save the model
model.save('model.h5')

Max document length: 42
Vocabulary size: 23256
(10026, 42)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 42)           0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 42)           0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 42)           0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 42, 100)      2325600     input_10[0][0]                   
__________________________________________________

In [51]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

# load a clean dataset
def load_dataset(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# calculate the maximum document length
def max_length(lines):
	return max([len(s.split()) for s in lines])

# encode a list of lines
def encode_text(tokenizer, lines, length):
	# integer encode
	encoded = tokenizer.texts_to_sequences(lines)
	# pad encoded sequences
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

# load datasets
trainLines, trainLabels = load_dataset('train.pkl')
testLines,_ = load_dataset('test.pkl')

# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
print(type(trainLines[0]))
print(type(testLines[0]))
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)


# load the model
model = load_model('model.h5')

# evaluate model on training dataset
loss, acc = model.evaluate([trainX,trainX,trainX], array(trainLabels), verbose=0)
print('Train Accuracy: %f' % (acc*100))

# evaluate model on test dataset dataset
result = model.predict([testX,testX,testX])
temp = np.argmax(result, axis=1)
tran_result = []
print(result)
#print(shape(result))
for i in temp:
    #print(result[i])
    if i == 0 :
        tran_result.append('negative')
        continue
    if i == 1 :
        tran_result.append('neutral')
        continue
    if i == 2:
        tran_result.append('positive')
        continue
    #print(result[i])
output = pd.DataFrame( data={"id":file_test["ID2"],"polarity":tran_result} )
output.to_csv( "final.csv", index=False, quoting=3 )

Max document length: 42
Vocabulary size: 23256
<class 'str'>
<class 'str'>
(10026, 42) (4850, 42)
Train Accuracy: 91.751446
[[5.9514374e-01 7.7429116e-03 1.5357733e-03]
 [1.4647841e-04 2.7095377e-03 4.8537850e-03]
 [1.6975552e-02 2.8869808e-01 1.4538169e-03]
 ...
 [7.2340190e-01 3.4855008e-03 2.4922192e-03]
 [2.2023916e-05 8.4182024e-03 2.1578130e-01]
 [1.2159646e-03 1.4361161e-03 2.1285225e-02]]
