In [17]:
from multiprocessing import cpu_count
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.model_selection import train_test_split

os.environ['OMP_NUM_THREADS'] = str(cpu_count())
notebook_dir = !pwd
# Some issues with jupyter, so had to add these
REPO_PATH = notebook_dir[0].rsplit("/notebooks", 1)[0]
sys.path.append(REPO_PATH)
# Import Geiger modules
from geiger.utils import load_coling_data, load_word_vectors
from geiger import coling, transform, models, evaluate
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
# Settings used by the cells further down
n_classes = 3          # handed to models.build_pooled_gru
max_features = 30000   # handed to transform.KerasTransformer
maxlen = 100           # handed to both the transformer and the model builder
embed_size = 300       # handed to generate_embedding_matrix and the model builder
batch_size = 16        # handed to model.fit
epochs = 2             # handed to model.fit

# Read the train/dev inputs and their class tags from <REPO_PATH>/datasets
x_train, x_dev, y_train, y_dev = load_coling_data(
    os.path.join(REPO_PATH, "datasets")
)

In [40]:
# One-hot encode the class tags of both splits with coling.one_hot_encode.
# The encoded arrays are rebound to the same names, so the cell guards on
# dimensionality: the raw tags form a flat (1-D) sequence, whereas the encoded
# result is 2-D. Without the guard, re-running this cell would feed one-hot
# rows back into coling.one_hot_encode.
if np.ndim(y_train) == 1:
    y_train = np.asarray([coling.one_hot_encode(class_tag) for class_tag in y_train])
if np.ndim(y_dev) == 1:
    y_dev = np.asarray([coling.one_hot_encode(class_tag) for class_tag in y_dev])

# Every input must have exactly one label row; fail loudly instead of relying
# on someone comparing the printed numbers by eye.
assert len(y_train) == len(x_train), "train labels and inputs differ in length"
assert len(y_dev) == len(x_dev), "dev labels and inputs differ in length"

print(y_train.shape)
print(y_dev.shape)
print(len(x_train))
print(len(x_dev))

(11999, 3)
(3001, 3)
11999
3001


In [None]:
# Load the embedding lookup. This takes a while, so the load is skipped when
# `embed_lookup` already exists in the kernel; re-executing the cell (or the
# whole notebook) is then cheap. Run `del embed_lookup` first to force a reload.
if "embed_lookup" not in globals():
    embed_lookup = load_word_vectors(os.path.join(REPO_PATH, "resources/wiki-news-300d-1M-subword.vec"))

In [41]:
# Initialize the transformer
transformer = transform.KerasTransformer(list(x_train) + list(x_dev), max_features, maxlen)
# Generate embedding Matrix
embed_matrix = transformer.generate_embedding_matrix(embed_lookup, embed_size)

Word kejriwal is out of vocabulary.
Word anuj is out of vocabulary.
Word hazare is out of vocabulary.
Word modiji is out of vocabulary.
Word azaan is out of vocabulary.
Word nahid is out of vocabulary.
Word jio is out of vocabulary.
Word champcash is out of vocabulary.
Word afreen is out of vocabulary.
Word bhakts is out of vocabulary.
Word 💢 is out of vocabulary.
Word indianexpress is out of vocabulary.
Word 🏡 is out of vocabulary.
Word jnu is out of vocabulary.
Word ambani is out of vocabulary.
Word 🏨 is out of vocabulary.
Word evm is out of vocabulary.
Word 😂😂😂 is out of vocabulary.
Word 😂😂 is out of vocabulary.
Word afrin is out of vocabulary.
Word ninda is out of vocabulary.
Word 4569825 is out of vocabulary.
Word kangana is out of vocabulary.
Word erdoğan is out of vocabulary.
Word tv18 is out of vocabulary.
Word kejri is out of vocabulary.
Word 😂😂😂😂 is out of vocabulary.
Word bhakt is out of vocabulary.
Word bahubali is out of vocabulary.
Word tayyab is out of vocabulary.
Word m

Word padhi is out of vocabulary.
Word 😃😃😃 is out of vocabulary.
Word dnyanesh is out of vocabulary.
Word kanvinde is out of vocabulary.
Word aajtak is out of vocabulary.
Word sochte is out of vocabulary.
Word manappuram is out of vocabulary.
Word gaurang is out of vocabulary.
Word 😬 is out of vocabulary.
Word kgp is out of vocabulary.
Word socide is out of vocabulary.
Word 😡😡 is out of vocabulary.
Word sabko is out of vocabulary.
Word muhoorat is out of vocabulary.
Word zinta is out of vocabulary.
Word 🎁 is out of vocabulary.
Word 💛 is out of vocabulary.
Word ❥ is out of vocabulary.
Word fenku is out of vocabulary.
Word cmg is out of vocabulary.
Word emraan is out of vocabulary.
Word curroption is out of vocabulary.
Word bikhari is out of vocabulary.
Word sunrisers is out of vocabulary.
Word nawazs is out of vocabulary.
Word ri8 is out of vocabulary.
Word lagao is out of vocabulary.
Word achche is out of vocabulary.
Word 50days is out of vocabulary.
Word jikku is out of vocabulary.
Wor

Word studeid is out of vocabulary.
Word minnority is out of vocabulary.
Word buddhastupa is out of vocabulary.
Word भूकंप is out of vocabulary.
Word सिर्फ़ is out of vocabulary.
Word वर्तमान is out of vocabulary.
Word तहस is out of vocabulary.
Word नहस is out of vocabulary.
Word अतीत is out of vocabulary.
Word बरबाद is out of vocabulary.
Word गईं। is out of vocabulary.
Word काठमांडू is out of vocabulary.
Word वैश्विक is out of vocabulary.
Word स्वयंभूनाथ is out of vocabulary.
Word नुकसान is out of vocabulary.
Word पहुंचा is out of vocabulary.
Word वीडियो is out of vocabulary.
Word fsfh6l is out of vocabulary.
Word parray is out of vocabulary.
Word hajin is out of vocabulary.
Word jaagey is out of vocabulary.
Word vandemataram is out of vocabulary.
Word 🙏🏼🙏🏼🙏🏼🙏🏼🙏🏼🙏🏼 is out of vocabulary.
Word sychosis is out of vocabulary.
Word jeso is out of vocabulary.
Word 7000crore is out of vocabulary.
Word maharajpur is out of vocabulary.
Word koraon is out of vocabulary.
Word mehroni is out of voc

Word else’s is out of vocabulary.
Word there’s is out of vocabulary.
Word that’s is out of vocabulary.
Word quoto is out of vocabulary.
Word mallikarjun is out of vocabulary.
Word adcom is out of vocabulary.
Word 😯😯😯😯😯😯😯🍁 is out of vocabulary.
Word darussalam is out of vocabulary.
Word 😂😂😂statue is out of vocabulary.
Word accordig is out of vocabulary.
Word expectatins is out of vocabulary.
Word isrel is out of vocabulary.
Word moneeeeyyyyyy is out of vocabulary.
Word rakhna is out of vocabulary.
Word sahab😃😃😈😈 is out of vocabulary.
Word prabhakaran is out of vocabulary.
Word kasturi is out of vocabulary.
Word bot👷 is out of vocabulary.
Word powered💂 is out of vocabulary.
Word by🎃 is out of vocabulary.
Word ishaq👩 is out of vocabulary.
Word seriese is out of vocabulary.
Word asked🙂 is out of vocabulary.
Word centere is out of vocabulary.
Word balochs is out of vocabulary.
Word reionales is out of vocabulary.
Word inccident is out of vocabulary.
Word nexon is out of vocabulary.
Word ira

Word histary is out of vocabulary.
Word व्हो is out of vocabulary.
Word पीछा is out of vocabulary.
Word कांग्रेस is out of vocabulary.
Word nimmans is out of vocabulary.
Word sameaa is out of vocabulary.
Word sutup is out of vocabulary.
Word kejrival is out of vocabulary.
Word malini is out of vocabulary.
Word multibageer is out of vocabulary.
Word joota is out of vocabulary.
Word savejnu is out of vocabulary.
Word advenzymes is out of vocabulary.
Word laden” is out of vocabulary.
Word samco is out of vocabulary.
Word conveninced is out of vocabulary.
Word hrithik is out of vocabulary.
Word 😍😍😍😍😍 is out of vocabulary.
Word grommed is out of vocabulary.
Word parrent is out of vocabulary.
Word liqor is out of vocabulary.
Word nalini is out of vocabulary.
Word veryyy is out of vocabulary.
Word dikhegi is out of vocabulary.
Word fayda is out of vocabulary.
Word bhooddddhhaaa is out of vocabulary.
Word talwars is out of vocabulary.
Word ⁠⁠⁠⁠ is out of vocabulary.
Word proudofthecountry is o

In [37]:
# Show the dimensions of the generated embedding matrix
print(np.shape(embed_matrix))

(25699, 300)


In [42]:
# Create the pooled-GRU model from the class count, the transformer's
# rel_features, the sequence length and the embedding matrix with its size.
model = models.build_pooled_gru(
    n_classes,
    transformer.rel_features,
    maxlen,
    embed_matrix,
    embed_size,
)

In [43]:
# Run both splits through the transformer to get the inputs passed to model.fit.
# NOTE(review): X_test is built from the dev split (x_dev), not from a separate test set.
X_train, X_test = (transformer.texts_to_seq(split) for split in (x_train, x_dev))

In [44]:
# Callback constructed with the dev inputs/labels and interval=1; the output
# of this cell shows one ROC-AUC score per epoch.
RocAuc = evaluate.RocAucEvaluation(
    interval=1,
    validation_data=(X_test, y_dev),
)
# Fit the model, validating against the same dev data the callback uses
hist = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[RocAuc],
    validation_data=(X_test, y_dev),
)

Train on 11999 samples, validate on 3001 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.766552 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.758199 

