In [35]:
import pandas as pd
import numpy as np
import nltk
import keras
from keras import Model, regularizers
from keras.layers import Embedding, Dense, Flatten, Dropout, Input
from keras.preprocessing.sequence import pad_sequences

# Column names are passed explicitly, so pandas treats every CSV row as data.
COLUMN_NAMES = ['Class', 'Title', 'Description']
training_data = pd.read_csv('train.csv', names = COLUMN_NAMES)
test_data = pd.read_csv('test.csv', names = COLUMN_NAMES)
# Join title and description with a space: plain concatenation would fuse the
# last word of the title with the first word of the description into one token.
training_data['Text_data'] = training_data.Title + ' ' + training_data.Description
test_data['Text_data'] = test_data.Title + ' ' + test_data.Description

def create_vocab(text_data):
    """Collect the distinct lowercase alphabetic tokens found in ``text_data``.

    Every string is tokenized with ``nltk.word_tokenize``. Tokens that are not
    purely alphabetic (``str.isalpha``) are discarded and the remaining ones
    are lowercased.

    Args:
        text_data: Iterable of strings to scan.

    Returns:
        A list of unique tokens, ordered by first appearance.
    """
    Vocabulary = []
    # Companion set: O(1) membership checks instead of scanning the list for
    # every token, while the list keeps the first-seen ordering.
    seen = set()
    for text_datum in text_data:
        for token in nltk.word_tokenize(text_datum):
            if not token.isalpha():
                continue
            token = token.lower()
            if token not in seen:
                seen.add(token)
                Vocabulary.append(token)
    return Vocabulary

def create_vocab_indices(vocabulary):
    """Map every vocabulary word to an integer id.

    Ids 0 and 1 are taken by the special entries ``'ZERO_PAD'`` and ``'UNK'``;
    the words of ``vocabulary`` receive consecutive ids starting at 2, in
    iteration order.

    Args:
        vocabulary: Iterable of words.

    Returns:
        dict from word (or special entry) to its integer id.
    """
    token_to_id = {'ZERO_PAD': 0, 'UNK': 1}
    token_to_id.update(
        (word, position) for position, word in enumerate(vocabulary, start=2)
    )
    return token_to_id

def vectorize(text_data, vocab_indices):
    """Turn each text into the list of vocabulary ids of its tokens.

    Texts are tokenized with ``nltk.word_tokenize``; only purely alphabetic
    tokens are kept, lowercased, and looked up in ``vocab_indices``. Tokens
    missing from the mapping are encoded as 1.

    Args:
        text_data: Iterable of strings.
        vocab_indices: dict from token to integer id.

    Returns:
        A list with one list of ids per input text.
    """
    fallback_id = 1  # id used for tokens absent from vocab_indices
    return [
        [
            vocab_indices.get(word.lower(), fallback_id)
            for word in nltk.word_tokenize(document)
            if word.isalpha()
        ]
        for document in text_data
    ]

class Data():
    """Vocabulary plus padded, indexed train/test arrays for the classifier.

    Attributes set by the constructor:
        training_data, test_data: source frames (need ``Text_data`` and ``Class``).
        training_raw_text, test_raw_text: lists of the ``Text_data`` strings.
        num_classes: width of the one-hot label vectors.
        max_len: length every id sequence is padded/truncated to.
        vocab, vocab_indices, vocab_size: vocabulary built from the training
            text only, its word -> id mapping, and the mapping's size
            (which includes the two special entries).
        training_x, test_x: padded id sequences.
        training_t, test_t: one-hot labels.
    """
    def __init__(self, train_frame = None, test_frame = None, max_len = 30, num_classes = 4):
        """Build all arrays.

        Args:
            train_frame: Training frame; defaults to the notebook-level
                ``training_data`` so that ``Data()`` keeps working.
            test_frame: Test frame; defaults to the notebook-level ``test_data``.
            max_len: Sequence length passed to ``pad_sequences``.
            num_classes: Number of label classes.
        """
        self.training_data = training_data if train_frame is None else train_frame
        self.test_data = test_data if test_frame is None else test_frame
        self.training_raw_text = list(self.training_data.Text_data.values)
        self.test_raw_text = list(self.test_data.Text_data.values)
        self.num_classes = num_classes
        self.max_len = max_len

        # The vocabulary comes from the training text only; test tokens that
        # are not in it are mapped to the unknown id by vectorize().
        self.vocab = create_vocab(self.training_raw_text)
        self.vocab_indices = create_vocab_indices(self.vocab)
        self.vocab_size = len(self.vocab_indices)

        self.training_x = self._encode_text(self.training_raw_text)
        self.training_t = self._encode_labels(self.training_data.Class.values)
        self.test_x = self._encode_text(self.test_raw_text)
        self.test_t = self._encode_labels(self.test_data.Class.values)

    def _encode_text(self, raw_text):
        """Index ``raw_text`` and pad/truncate every sequence to ``max_len`` with 0."""
        return pad_sequences(vectorize(raw_text, self.vocab_indices), maxlen = self.max_len, value = 0)

    def _encode_labels(self, class_values):
        """One-hot encode the class column after shifting it down by one."""
        # num_classes is passed explicitly so both splits get the same label
        # width even if one of them lacks the highest class.
        return keras.utils.to_categorical(np.array(class_values)-1, num_classes = self.num_classes)
        

In [33]:
# Build the vocabulary and the padded train/test arrays once; later cells read them from `data`.
data = Data()

In [43]:
def Computational_Graph(inputs, embedding_size = 50, l2_weight = 0.01, dropout_rate = 0.2):
    """Wire the classifier layers onto ``inputs`` and return the output tensor.

    Architecture: Embedding -> Flatten -> Dense(64, relu) -> Dropout ->
    Dense(32, relu) -> Dropout -> Dense(softmax). The vocabulary size and the
    number of output classes are read from the notebook-level ``data`` object.

    Args:
        inputs: Keras input tensor of token ids.
        embedding_size: Dimension of the learned word embeddings.
        l2_weight: L2 penalty applied to both hidden Dense kernels.
        dropout_rate: Dropout rate applied after each hidden Dense layer.

    Returns:
        The softmax output tensor with ``data.num_classes`` units.
    """
    vocab_size = data.vocab_size
    # Take the sequence length from the input tensor instead of repeating the
    # literal, so the embedding always agrees with the Input layer.
    sequence_length = keras.backend.int_shape(inputs)[1]
    x = Embedding(input_dim = vocab_size,
        output_dim = embedding_size, input_length = sequence_length)(inputs)
    x = Flatten()(x)
    for hidden_units in (64, 32):
        x = Dense(units = hidden_units, activation = 'relu', kernel_regularizer = regularizers.l2(l2_weight))(x)
        x = Dropout(dropout_rate)(x)
    # Output width follows the label encoding rather than a hard-coded 4.
    predictions = Dense(units = data.num_classes, activation = 'softmax')(x)
    return predictions
# Functional model: a sequence of 30 token ids in, class probabilities out.
inputs = Input(shape = (30,))
# `outputs` is the documented keyword; the singular `output` is a legacy
# spelling that newer Keras releases reject.
model = Model(inputs = inputs, outputs = Computational_Graph(inputs))

model.compile(optimizer='Adamax',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'])

# Hold out the last 15% of the training arrays for validation.
model.fit(data.training_x, data.training_t,
    verbose = True, epochs = 3,
    validation_split = 0.15)


  


Train on 102000 samples, validate on 18000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23c80029eb8>

In [45]:
# One more epoch on the same model, this time without a validation split.
model.fit(data.training_x, data.training_t, 
    verbose = True, epochs = 1)

Epoch 1/1


<keras.callbacks.History at 0x23cd3288898>

In [46]:
# The model was compiled with metrics = ['accuracy'], so evaluate() returns
# [loss, accuracy]; index 1 is the accuracy, not an error rate.
score = model.evaluate(data.test_x, data.test_t, verbose = True)
print('Test Accuracy: ', score[1])


Test Error:  0.8835526315789474
