# Import Libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM,Embedding,Dense,Conv1D,Dropout,MaxPooling1D,Flatten
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

Using TensorFlow backend.


# Preprocessing

### Initialize Dataset

In [2]:
train_text = fetch_20newsgroups(subset='train').data
train_label = fetch_20newsgroups(subset='train').target
test_text = fetch_20newsgroups(subset='test').data
test_label = fetch_20newsgroups(subset='test').target

### Initialize Constatns

In [3]:
max_words_to_keep = 5000
maxlen_text = 200
token_vec_size = 128
output_dim = np.unique(train_label).__len__()

### Tokenize and pad each newsgroups posts 

In [4]:
tokenizer = Tokenizer(num_words=max_words_to_keep,filters='!"#$%&()*+,\'-./:;<=>?@[\\]^_`{|}~\t\n\"',
                        lower=True,
                        split=" ",
                        char_level=False)
tokenizer.fit_on_texts(train_text)

sequences = tokenizer.texts_to_sequences(train_text)
train_X = pad_sequences(sequences=sequences,maxlen=maxlen_text)

sequences = tokenizer.texts_to_sequences(test_text)
test_X = pad_sequences(sequences=sequences,maxlen=maxlen_text)

### Handle the Label using LabelEncoder

In [5]:
encoder = LabelEncoder()

encoder.fit(train_label)

encoded_train_Y = encoder.transform(train_label)
train_Y = np_utils.to_categorical(encoded_train_Y,num_classes=output_dim)

encoded_train_Y = encoder.transform(test_label)
test_Y = np_utils.to_categorical(encoded_train_Y,num_classes=output_dim)

### Lets print the shapes of data and labels

In [6]:
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)

(11314, 200)
(11314, 20)
(7532, 200)
(7532, 20)


# Spatial Modeling

### Define a CNN model

In [7]:
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=max_words_to_keep, output_dim=token_vec_size, input_length=maxlen_text))
cnn_model.add(Dropout(0.2))
cnn_model.add(Conv1D(64, 5, activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=4))
cnn_model.add(Flatten())
cnn_model.add(Dense(units=output_dim,activation='softmax'))

### Compile the CNN model

In [8]:
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train the CNN model

In [9]:
cnn_model.fit(train_X, train_Y, batch_size=1000,validation_split=0.3, epochs=20)

Train on 7919 samples, validate on 3395 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8e15ba4518>

### Evaluate the prediction on the Test Data

In [10]:
res_class_cnn = cnn_model.predict_classes(test_X)
print(res_class_cnn)



In [11]:
cnn_loss, cnn_accuracy = cnn_model.evaluate(test_X,test_Y,verbose=2)
print(cnn_loss)
print(cnn_accuracy)

1.24981001622
0.605549654806


# Temporal Modeling

### Define an LSTM model

In [12]:
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words_to_keep, output_dim=token_vec_size, input_length=maxlen_text))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=token_vec_size, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(units=output_dim, activation='softmax'))

### Compile the LSTM model

In [13]:
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train the LSTM model

In [14]:
lstm_model.fit(train_X, train_Y, batch_size=1000,validation_split=0.3, epochs=20)

Train on 7919 samples, validate on 3395 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8dd0adee80>

### Evaluate the prediction on the test data

In [15]:
res_class_lstm = lstm_model.predict_classes(test_X)
print(res_class_lstm)

[ 7  6  0 ..., 11  6 15]


In [16]:
loss_lstm, accuracy_lstm = lstm_model.evaluate(test_X,test_Y,verbose=2)
print(loss_lstm)
print(accuracy_lstm)

1.79106456804
0.412904938927


# Spatio-Temporal Model

### Define a hybrid of CNN and LSTM

In [17]:
hypbrid_model = Sequential()
hypbrid_model.add(Embedding(input_dim=max_words_to_keep, output_dim=token_vec_size, input_length=maxlen_text))
hypbrid_model.add(Dropout(0.2))
hypbrid_model.add(Conv1D(64, 5, activation='relu'))
hypbrid_model.add(MaxPooling1D(pool_size=4))
hypbrid_model.add(LSTM(128))
hypbrid_model.add(Dense(units=output_dim, activation='softmax'))

### Compile the Hybrid model

In [18]:
hypbrid_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train the Hybrid model

In [19]:
hypbrid_model.fit(train_X, train_Y, batch_size=1000,validation_split=0.3, epochs=20)

Train on 7919 samples, validate on 3395 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8db1224c88>

### Evaluate the prediction on the Test Data

In [20]:
res_class_hybrid = hypbrid_model.predict_classes(test_X)
print(res_class_hybrid)



In [21]:
loss_hybrid, accuracy_hybrid = hypbrid_model.evaluate(test_X,test_Y,verbose=2)
print(loss_hybrid)
print(accuracy_hybrid)

1.53089001023
0.573685608136


# Spatio-Temporal Modeling (with multiple tags as output)

### Define,Compile and Train the Model (use sigmoid instead softmax at the last layer)

In [22]:
hybrid_model_mul_tag = Sequential()
hybrid_model_mul_tag.add(Embedding(input_dim=max_words_to_keep, output_dim=token_vec_size, input_length=maxlen_text))
hybrid_model_mul_tag.add(Dropout(0.2))
hybrid_model_mul_tag.add(Conv1D(64, 5, activation='relu'))
hybrid_model_mul_tag.add(MaxPooling1D(pool_size=4))
hybrid_model_mul_tag.add(LSTM(128))
hybrid_model_mul_tag.add(Dense(units=output_dim, activation='sigmoid'))

hybrid_model_mul_tag.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

hybrid_model_mul_tag.fit(train_X, train_Y, batch_size=1000,validation_split=0.3, epochs=20)

Train on 7919 samples, validate on 3395 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8db050fe10>

### Select the tags for which the prob is greater than 0.2 and use the encoder to map it back to the original tags

In [26]:
threshold_tag_prob = 0.2
res_class_prob_hybrid_model_mul_tag = hybrid_model_mul_tag.predict(test_X) > threshold_tag_prob
res_class_hybrid_model_mul_tag = np.array([encoder.inverse_transform(res_class_prob_hybrid_model_mul_tag[i]) for i in np.ndindex(res_class_prob_hybrid_model_mul_tag.shape[:1])])
print(res_class_hybrid_model_mul_tag)

[array([12]) array([1, 3, 4, 5]) array([ 0, 15, 17, 19]) ...,
 array([], dtype=int64) array([2]) array([ 0, 15, 19])]
