In [1]:
from multiprocessing import cpu_count
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

os.environ['OMP_NUM_THREADS'] = str(cpu_count())
notebook_dir = !pwd
# Some issues with jupyter, so had to add these
REPO_PATH = notebook_dir[0].rsplit("/notebooks", 1)[0]
sys.path.append(REPO_PATH)
# Import Geiger modules
from geiger.utils import load_word_vectors
from geiger import coling, transform, models, evaluate, stores
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [2]:
# Constants shared by the cells below
n_classes = 3        # passed to the model builder
max_features = 30000  # passed to the KerasTransformer
maxlen = 100         # passed to the KerasTransformer and the model builder
embed_size = 300     # width used for the embedding matrix and the model
batch_size = 16      # passed to model.fit
epochs = 10          # passed to model.fit

# Train / dev texts and labels from the repository's datasets directory
x_train, x_dev, y_train, y_dev = coling.load_coling_data(
    os.path.join(REPO_PATH, "datasets")
)

# The two English test files: document ids, texts and labels for each
fb_doc_ids_en, fb_x_test_en, fb_y_test_en = coling.load_coling_file(
    os.path.join(REPO_PATH, "datasets/agr_en_fb_test.csv")
)
sm_doc_ids_en, sm_x_test_en, sm_y_test_en = coling.load_coling_file(
    os.path.join(REPO_PATH, "datasets/agr_en_sm_test.csv")
)

In [3]:
# Sanity check: label array shapes first, then the number of texts per split,
# one value per line.
print(y_train.shape, y_dev.shape, len(x_train), len(x_dev), sep="\n")

(11999, 3)
(3001, 3)
11999
3001


In [4]:
# Load the embedding lookup. This takes a while, so don't re-execute this cell over and over again.
embed_lookup = stores.MultiLangVectorStore(langs=["en"])

reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki-news-300d-1M-subword.vec


In [5]:
# Build the transformer from the texts of all four splits (train, dev and
# both test files), using the vocabulary and length limits defined earlier.
transformer = transform.KerasTransformer(
    [*x_train, *x_dev, *fb_x_test_en, *sm_x_test_en],
    max_features,
    maxlen,
)

# Embedding matrix derived from the loaded vector store
embed_matrix = transformer.generate_embedding_matrix(embed_lookup, embed_size)

Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name


 34%|███▍      | 8100/23600 [00:00<00:00, 40459.96it/s]

Could not find vector for word shutdownjnu.
Could not find vector for word jnu.
Could not find vector for word indvsuae.
Could not find vector for word indvspak.
Could not find vector for word kejriwal.
Could not find vector for word anuj.
Could not find vector for word cwc15.
Could not find vector for word modiji.
Could not find vector for word hazare.
Could not find vector for word azaan.
Could not find vector for word standwithjnu.
Could not find vector for word phonenumber.
Could not find vector for word jio.
Could not find vector for word champcash.
Could not find vector for word bhakts.
Could not find vector for word nahid.
Could not find vector for word afreen.
Could not find vector for word manmohan.
Could not find vector for word ambani.
Could not find vector for word evm.
Could not find vector for word tv18.
Could not find vector for word virat.
Could not find vector for word ninda.
Could not find vector for word kejri.
Could not find vector for word bhakt.
Could not find vec

 45%|████▍     | 10560/23600 [00:00<00:00, 35174.32it/s]

Could not find vector for word vishwanath.
Could not find vector for word 0days.
Could not find vector for word aaram.
Could not find vector for word jaate.
Could not find vector for word kujili.
Could not find vector for word dewali.
Could not find vector for word rexit.
Could not find vector for word debasis.
Could not find vector for word unlimitedbot.
Could not find vector for word azaadi.
Could not find vector for word jaihind.
Could not find vector for word haksar.
Could not find vector for word dekhna.
Could not find vector for word manav.
Could not find vector for word nalinbhai.
Could not find vector for word majboori.
Could not find vector for word midcaps.
Could not find vector for word snapdeal.
Could not find vector for word rundians.
Could not find vector for word sushil.
Could not find vector for word shaala.
Could not find vector for word darpok.
Could not find vector for word singhji.
Could not find vector for word pislam.
Could not find vector for word gurudevji.
Coul

 62%|██████▏   | 14687/23600 [00:00<00:00, 25821.35it/s]


Could not find vector for word raaz.
Could not find vector for word tadipaar.
Could not find vector for word vijayvargiya.
Could not find vector for word fekendra.
Could not find vector for word ablebto.
Could not find vector for word madhubala.
Could not find vector for word jhz.
Could not find vector for word dstrubnc.
Could not find vector for word gubdagardi.
Could not find vector for word terrorisrs.
Could not find vector for word soldigers.
Could not find vector for word beant.
Could not find vector for word abiyaan.
Could not find vector for word jokeing.
Could not find vector for word kjos.
Could not find vector for word eserve.
Could not find vector for word jaganathar.
Could not find vector for word shaab.
Could not find vector for word availbel.
Could not find vector for word aoa.
Could not find vector for word mulism.
Could not find vector for word tumhein.
Could not find vector for word pareshaani.
Could not find vector for word malviya.
Could not find vector for word int

 70%|██████▉   | 16506/23600 [00:00<00:00, 23866.36it/s]


Could not find vector for word chamre.
Could not find vector for word phek.
Could not find vector for word phenka.
Could not find vector for word ashtha.
Could not find vector for word gaaye.
Could not find vector for word hatya.
Could not find vector for word amitab.
Could not find vector for word darshfor.
Could not find vector for word infrt.
Could not find vector for word preplaned.
Could not find vector for word kanshiram.
Could not find vector for word traitory.
Could not find vector for word mujeeb.
Could not find vector for word sponcer.
Could not find vector for word distub.
Could not find vector for word nesr.
Could not find vector for word orthodoxity.
Could not find vector for word terrorismm.
Could not find vector for word sidha.
Could not find vector for word tytler.
Could not find vector for word janakpuri.
Could not find vector for word hazaray.
Could not find vector for word sextoys.
Could not find vector for word andde.
Could not find vector for word onam.
Could not 

 84%|████████▍ | 19863/23600 [00:00<00:00, 21455.80it/s]

Could not find vector for word lokepal.
Could not find vector for word seculiyarism.
Could not find vector for word ramayans.
Could not find vector for word etlection.
Could not find vector for word pheri.
Could not find vector for word sunitha.
Could not find vector for word lakshmanan.
Could not find vector for word jashoda.
Could not find vector for word postmartom.
Could not find vector for word recoeds.
Could not find vector for word jiofi.
Could not find vector for word rajniti.
Could not find vector for word evrytime.
Could not find vector for word bundar.
Could not find vector for word gotthard.
Could not find vector for word shubhi.
Could not find vector for word pradhanmantri.
Could not find vector for word kyaun.
Could not find vector for word bolate.
Could not find vector for word kkahanewalonki.
Could not find vector for word bolalti.
Could not find vector for word virodhiyonko.
Could not find vector for word awaz.
Could not find vector for word nikesh.
Could not find vect

 97%|█████████▋| 22956/23600 [00:01<00:00, 19840.18it/s]


Could not find vector for word shrinken.
Could not find vector for word haijacking.
Could not find vector for word manojtiwari.
Could not find vector for word xcptng.
Could not find vector for word devoled.
Could not find vector for word virodh.
Could not find vector for word amby.
Could not find vector for word devlpmnt.
Could not find vector for word rcom.
Could not find vector for word kbw.
Could not find vector for word meatban.
Could not find vector for word are3.
Could not find vector for word incovinience.
Could not find vector for word shirkhurma.
Could not find vector for word dtaas.
Could not find vector for word mohmmad.
Could not find vector for word 6uyr.
Could not find vector for word narkasura.
Could not find vector for word hajamat.
Could not find vector for word walun.
Could not find vector for word unpad.
Could not find vector for word gawaar.
Could not find vector for word bhode.
Could not find vector for word brij.
Could not find vector for word bushan.
Could not f

100%|██████████| 23600/23600 [00:01<00:00, 19359.46it/s]

Could not find vector for word glhm35z95d.
Could not find vector for word butilised.
Could not find vector for word basatards.
Could not find vector for word eiwen.
Could not find vector for word kanhiya.
Could not find vector for word practicematch.
Could not find vector for word banonjnu.
Could not find vector for word lnngw1td76.
Could not find vector for word whereisnajeebahmed.
Could not find vector for word 7w8yu57gfu.
Could not find vector for word jaggu.
Could not find vector for word singhpramod278.
Could not find vector for word afzalguru.
Could not find vector for word rabbani.
Could not find vector for word ghayalonceagain.
Could not find vector for word continusly.
Could not find vector for word not2bhiredfromjnu.
Could not find vector for word ofrri9zzwg.
Could not find vector for word at1000.
Could not find vector for word bhai10000.
Could not find vector for word phodd.
Could not find vector for word kejeriwal.
Could not find vector for word occupyugc.
Could not find ve




In [6]:
# Show the dimensions of the generated embedding matrix
print(embed_matrix.shape)

(23601, 300)


In [16]:
# Build a fresh pooled-GRU model on top of the embedding matrix
model = models.build_pooled_gru(
    n_classes,
    transformer.rel_features,
    maxlen,
    embed_matrix,
    embed_size,
)

In [9]:
# numpy (np) is imported once in the first cell; the duplicate import that
# used to sit here has been dropped so all imports live in one place.

# Merge the original train and dev splits and encode the merged texts.
X_train = transformer.texts_to_seq(np.concatenate((x_train, x_dev), axis=0))
Y_train = np.concatenate((y_train, y_dev), axis=0)

# Hold out 5% of the merged data for validation; the fixed random_state keeps
# the split reproducible across runs.
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train,
    Y_train,
    train_size=0.95,
    random_state=233,
)

In [17]:
# Overrides the `epochs = 10` set with the other constants; training below uses 12.
epochs = 12

In [18]:
# A RocAucEvaluation callback was tried here earlier and is disabled; only the
# summary callback is used, evaluated on the held-out validation split.
SumEval = evaluate.SummaryEvaluation(
    validation_data=(X_val, y_val),
    interval=1,
)

# Train the model, keeping the returned history object
hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[SumEval],
)

Train on 14250 samples, validate on 750 samples
Epoch 1/12

 F1Score - epoch: 1 - score:
             precision    recall  f1-score   support

          0       0.65      0.70      0.68       314
          1       0.54      0.60      0.56       277
          2       0.60      0.40      0.48       159

avg / total       0.60      0.60      0.59       750
 

Epoch 2/12

 F1Score - epoch: 2 - score:
             precision    recall  f1-score   support

          0       0.69      0.70      0.70       314
          1       0.57      0.54      0.55       277
          2       0.50      0.52      0.51       159

avg / total       0.61      0.61      0.61       750
 

Epoch 3/12

 F1Score - epoch: 3 - score:
             precision    recall  f1-score   support

          0       0.69      0.64      0.66       314
          1       0.53      0.57      0.55       277
          2       0.54      0.53      0.54       159

avg / total       0.60      0.59      0.59       750
 

Epoch 4/12

 F1Scor

In [20]:
# Encode both English test sets with the same transformer used for training.
# Upper-case X marks the encoded version of the lower-case raw texts.
fb_X_test_en, sm_X_test_en = (
    transformer.texts_to_seq(raw_texts)
    for raw_texts in (fb_x_test_en, sm_x_test_en)
)

Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name
Got no such name


In [23]:
# First test file: predict, then write the submission CSV
fb_Y_pred_en = model.predict(fb_X_test_en, verbose=0)
coling.dump_coling_predictions(
    fb_doc_ids_en,
    fb_Y_pred_en,
    os.path.join(REPO_PATH, "datasets/fb_en_sub.csv"),
)

# Second test file: same steps
sm_Y_pred_en = model.predict(sm_X_test_en, verbose=0)
coling.dump_coling_predictions(
    sm_doc_ids_en,
    sm_Y_pred_en,
    os.path.join(REPO_PATH, "datasets/sm_en_sub.csv"),
)