<a href="https://colab.research.google.com/github/talhaanwarch/DeftEval2020/blob/master/deftEval_GloVe_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (force_remount=True is passed so an
# existing mount is replaced rather than reused).
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/dataset/

/content/drive/My Drive/dataset


In [3]:
ls

[0m[01;34mdeft_eval[0m/  [01;34mEEG[0m/  images.zip  [01;34mOLID[0m/


In [0]:
import pandas as pd
import glob
import string
import re
import numpy as np

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the cleaning cells: the stop-word list and
# the WordNet data that backs the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Read every training .deft file (tab-separated, no header row) and stack the
# pieces into a single frame with a fresh 0..n-1 index.
train_files = glob.glob('deft_eval/train/' + "*.deft")
li = [
    pd.read_csv(path, sep='\t', index_col=None, header=None, names=['sentence', 'label'])
    for path in train_files
]
frame = pd.concat(li, axis=0, ignore_index=True)
y_train = frame['label']

# Translation table that deletes every ASCII punctuation character.
punct_table = str.maketrans('', '', string.punctuation)

corpus_train = []
for sentence in frame['sentence']:
    # Lower-case, then strip punctuation.
    text = sentence.lower().translate(punct_table)
    # Remove the 'link ' placeholder text.
    text = text.replace( 'link ','')
    # Drop digit characters.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Collapse any run of whitespace into a single space.
    text = " ".join(text.split())
    # Lemmatize token by token, then filter out stop words.
    lemmas = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    kept = [word for word in lemmas if word not in stop_words]
    corpus_train.append(" ".join(kept))

In [7]:
# Sanity check: corpus size, first cleaned sentence, and its label.
(len(corpus_train), corpus_train[0], y_train[0])

(16659,
 'science includes diverse field astronomy biology computer science geology logic physic chemistry mathematics',
 0)

In [0]:
# Read every dev .deft file (tab-separated, no header row) and stack the
# pieces into a single frame with a fresh 0..n-1 index.
dev_files = glob.glob('deft_eval/dev/' + "*.deft")
li = [
    pd.read_csv(path, sep='\t', index_col=None, header=None, names=['sentence', 'label'])
    for path in dev_files
]
frame = pd.concat(li, axis=0, ignore_index=True)
y_dev = frame['label']

# Translation table that deletes every ASCII punctuation character.
punct_table = str.maketrans('', '', string.punctuation)

corpus_dev = []
for sentence in frame['sentence']:
    # Lower-case, then strip punctuation.
    text = sentence.lower().translate(punct_table)
    # Remove the 'link ' placeholder text.
    text = text.replace( 'link ','')
    # Drop digit characters.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Collapse any run of whitespace into a single space.
    text = " ".join(text.split())
    # Lemmatize token by token, then filter out stop words.
    lemmas = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    kept = [word for word in lemmas if word not in stop_words]
    corpus_dev.append(" ".join(kept))

In [9]:
# Sanity check: corpus size, first cleaned sentence, and its label.
(len(corpus_dev), corpus_dev[0], y_dev[0])

(810,
 'becomes clear definition application scientific method play major role science',
 0)

In [10]:
import collections

# Number of training examples per label value.
collections.Counter(y_train)

Counter({0: 11090, 1: 5569})

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

max_words = 10000  # vocabulary cap handed to the Tokenizer as num_words
max_len = 300      # sequence length passed to pad_sequences

# Build the vocabulary from the training corpus only.
tokenize = Tokenizer(num_words=max_words)
tokenize.fit_on_texts(corpus_train)
word_index = tokenize.word_index

# Encode the training sentences as integer sequences of length max_len.
sequences = tokenize.texts_to_sequences(corpus_train)
X_train = sequence.pad_sequences(sequences, maxlen=max_len)

Using TensorFlow backend.


In [12]:
import os

# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line is parsed as whitespace-separated fields: the first field is the
# word, the remaining fields are its coefficients.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to
# parse part-way through the (large) file.
with open('OLID/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
# Number of rows in the embedding matrix: the vocabulary cap (or the actual
# vocabulary size if smaller) plus one extra row so that index `num_words - 1`
# is addressable.
num_words = min(max_words, len(word_index)) + 1
print(num_words)

# Must equal the dimensionality of the vectors stored in embeddings_index.
embedding_dim = 100

# Start from an all-zero matrix; rows that are never assigned below stay zero.
embedding_matrix = np.zeros((num_words, embedding_dim))

# Dedicated, seeded generator for out-of-vocabulary words, so the initial
# embedding matrix is identical on every Restart & Run All instead of
# depending on the state of the global NumPy RNG.
oov_rng = np.random.RandomState(42)

# For each word known to the tokenizer, look up its pre-trained GloVe vector.
for word, i in word_index.items():
    if i > max_words:
        # Index falls outside the matrix; skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Found a pre-trained vector: copy it into this word's row.
        embedding_matrix[i] = embedding_vector
    else:
        # No pre-trained vector: initialise the row with standard-normal noise.
        embedding_matrix[i] = oov_rng.randn(embedding_dim)

5001


In [0]:
# Encode the dev corpus with the tokenizer that was fitted on the training
# corpus, then bring every sequence to length max_len.
test_sequences = tokenize.texts_to_sequences(corpus_dev)
X_dev = sequence.pad_sequences(test_sequences, maxlen=max_len)

In [15]:
from keras.models import Sequential
from keras.layers import Embedding,CuDNNGRU,Dense,Dropout,Bidirectional,SpatialDropout1D,GlobalMaxPool1D
from keras.optimizers import RMSprop,Adam
from sklearn.utils import class_weight
from keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.initializers import Constant

# Only let TensorFlow log at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Embedding (initialised from embedding_matrix, still trainable) -> spatial
# dropout -> GRU returning the full sequence -> max over time -> small dense
# head ending in a single sigmoid unit.
model = Sequential([
    Embedding(
        num_words,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_len,
        trainable=True,
    ),
    SpatialDropout1D(0.2),
    CuDNNGRU(50, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(30, activation="sigmoid"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
])
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.0001),
    metrics=['accuracy'],
)

# 'balanced' class weights computed from the training labels, keyed by the
# position of each class in np.unique(y_train).
balanced_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = dict(enumerate(balanced_weights))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          500100    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 100)          0         
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, 300, 50)           22800     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                1530      
_________________________________________________________________
dropout_1 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                

In [0]:
# Early stopping on validation loss: patience of 5 epochs, and the weights
# from the best epoch are restored when training stops.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=5,
    restore_best_weights=True,
)

In [17]:
# Train with class weighting, validating on the dev split after every epoch;
# the early-stopping callback decides when to halt.
model.fit(
    X_train,
    y_train,
    batch_size=30,
    epochs=50,
    verbose=2,
    class_weight=class_weights,
    validation_data=(X_dev, y_dev),
    callbacks=[es],
)

Train on 16659 samples, validate on 810 samples
Epoch 1/50
 - 32s - loss: 0.7154 - acc: 0.5569 - val_loss: 0.6623 - val_acc: 0.6333
Epoch 2/50
 - 31s - loss: 0.6759 - acc: 0.5749 - val_loss: 0.6365 - val_acc: 0.6420
Epoch 3/50
 - 31s - loss: 0.6503 - acc: 0.6177 - val_loss: 0.6165 - val_acc: 0.6593
Epoch 4/50
 - 31s - loss: 0.6331 - acc: 0.6388 - val_loss: 0.5837 - val_acc: 0.6963
Epoch 5/50
 - 31s - loss: 0.6182 - acc: 0.6585 - val_loss: 0.5976 - val_acc: 0.6753
Epoch 6/50
 - 31s - loss: 0.6051 - acc: 0.6716 - val_loss: 0.5755 - val_acc: 0.6840
Epoch 7/50
 - 31s - loss: 0.5929 - acc: 0.6813 - val_loss: 0.5738 - val_acc: 0.6889
Epoch 8/50
 - 31s - loss: 0.5866 - acc: 0.6864 - val_loss: 0.5482 - val_acc: 0.7173
Epoch 9/50
 - 31s - loss: 0.5748 - acc: 0.6996 - val_loss: 0.5641 - val_acc: 0.6827
Epoch 10/50
 - 31s - loss: 0.5657 - acc: 0.6991 - val_loss: 0.5329 - val_acc: 0.7235
Epoch 11/50
 - 31s - loss: 0.5555 - acc: 0.7117 - val_loss: 0.5476 - val_acc: 0.7062
Epoch 12/50
 - 31s - loss:

<keras.callbacks.History at 0x7f7a704eceb8>

In [18]:

# Evaluate on the dev split and print the resulting list of metric values.
dev_scores = model.evaluate(X_dev, y_dev)
print(dev_scores)

[0.4997013892656491, 0.7567901231624462]


In [19]:
from sklearn.metrics import classification_report

# The network ends in a single sigmoid unit, so predict() yields one
# probability per row (shape (n, 1)).
y_prob = model.predict(X_dev, batch_size=30, verbose=1)

# Threshold at 0.5 to obtain hard decisions. (The previous
# np.argmax(..., axis=1) over a single column was always 0 and was never
# used, so it has been replaced by the correct labels below.)
y_pred = (y_prob > 0.5)
y_pred_bool = y_pred.astype(int).ravel()  # flat 0/1 label vector

print(classification_report(y_dev, y_pred_bool))

              precision    recall  f1-score   support

           0       0.83      0.79      0.81       537
           1       0.63      0.69      0.66       273

    accuracy                           0.76       810
   macro avg       0.73      0.74      0.73       810
weighted avg       0.76      0.76      0.76       810



In [0]:
import keras

# Reset the Keras backend session.
keras.backend.clear_session()