<a href="https://colab.research.google.com/github/talhaanwarch/DeftEval2020/blob/master/deftEval_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab VM; the dataset is read from it below.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/dataset/deft_eval

/content/drive/My Drive/dataset/deft_eval


In [3]:
ls

[0m[01;34mdev[0m/  [01;34mtrain[0m/


In [0]:
import pandas as pd
import glob
import string
import re
import numpy as np

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources requested by this notebook.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words as a set for O(1) membership tests during cleaning.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Load every training file into a single frame.
# glob.glob returns paths in arbitrary, filesystem-dependent order, so the
# list is sorted to make the row order (and corpus_train[0]) reproducible.
train_files = sorted(glob.glob('train/' + "*.deft"))
if not train_files:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError("no '*.deft' files found under 'train/'")

li = [
    pd.read_csv(filename, sep='\t', index_col=None, header=None, names=['sentence', 'label'])
    for filename in train_files
]
frame = pd.concat(li, axis=0, ignore_index=True)
y_train = frame['label']

# Build the punctuation-stripping table once instead of once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)

# Clean each sentence: lowercase, strip punctuation, drop 'link ' and digits,
# lemmatize each token, remove stop words, and re-join with single spaces.
corpus_train = []
for sentence in frame['sentence']:
    text = sentence.lower().translate(punctuation_table)
    # NOTE(review): this is a plain substring removal, so it also trims words
    # that merely end in "link" and misses a "link" at the very end of a
    # sentence. Kept as-is so train and dev are cleaned identically.
    text = text.replace('link ', '')
    text = ''.join(ch for ch in text if not ch.isdigit())
    # split() with no argument also collapses runs of whitespace.
    tokens = [lemmatizer.lemmatize(token) for token in text.split()]
    corpus_train.append(" ".join(token for token in tokens if token not in stop_words))

In [7]:
# Sanity check: corpus size, first cleaned sentence, and its label.
(len(corpus_train), corpus_train[0], y_train[0])

(16659,
 'science includes diverse field astronomy biology computer science geology logic physic chemistry mathematics',
 0)

In [0]:
# Load every dev file into a single frame.
# glob.glob returns paths in arbitrary, filesystem-dependent order, so the
# list is sorted to make the row order (and corpus_dev[0]) reproducible.
dev_files = sorted(glob.glob('dev/' + "*.deft"))
if not dev_files:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError("no '*.deft' files found under 'dev/'")

li = [
    pd.read_csv(filename, sep='\t', index_col=None, header=None, names=['sentence', 'label'])
    for filename in dev_files
]
frame = pd.concat(li, axis=0, ignore_index=True)
y_dev = frame['label']

# Build the punctuation-stripping table once instead of once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)

# Clean each sentence: lowercase, strip punctuation, drop 'link ' and digits,
# lemmatize each token, remove stop words, and re-join with single spaces.
corpus_dev = []
for sentence in frame['sentence']:
    text = sentence.lower().translate(punctuation_table)
    # NOTE(review): this is a plain substring removal, so it also trims words
    # that merely end in "link" and misses a "link" at the very end of a
    # sentence. Kept as-is so train and dev are cleaned identically.
    text = text.replace('link ', '')
    text = ''.join(ch for ch in text if not ch.isdigit())
    # split() with no argument also collapses runs of whitespace.
    tokens = [lemmatizer.lemmatize(token) for token in text.split()]
    corpus_dev.append(" ".join(token for token in tokens if token not in stop_words))

In [9]:
# Sanity check: corpus size, first cleaned sentence, and its label.
(len(corpus_dev), corpus_dev[0], y_dev[0])

(810,
 'becomes clear definition application scientific method play major role science',
 0)

In [10]:
import collections

# Count how many training rows carry each label value.
label_counts = collections.Counter(y_train)
label_counts

Counter({0: 11090, 1: 5569})

In [11]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

max_words = 5000  # passed to Tokenizer(num_words=...) and reused as the embedding input_dim
max_len = 300     # passed to pad_sequences(maxlen=...) for both train and dev

# Fit the vocabulary on the training corpus only, then encode and pad it.
tokenize = Tokenizer(num_words=max_words)
tokenize.fit_on_texts(corpus_train)
X_train = sequence.pad_sequences(
    tokenize.texts_to_sequences(corpus_train),
    maxlen=max_len,
)


Using TensorFlow backend.


In [0]:
# Encode the dev corpus with the tokenizer fitted on the training corpus,
# then pad to the same length as X_train.
dev_token_ids = tokenize.texts_to_sequences(corpus_dev)
X_dev = sequence.pad_sequences(dev_token_ids, maxlen=max_len)

In [13]:
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Embedding,CuDNNLSTM,Dense,Dropout,Bidirectional
from keras.models import Sequential
from keras.optimizers import RMSprop,Adam
from sklearn.utils import class_weight

# Only let TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Binary sentence classifier: embedding -> one CuDNN LSTM -> small dense head
# with dropout -> single sigmoid output.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    CuDNNLSTM(64),
    Dense(20, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Compute 'balanced' per-class weights for the training labels.
# Keyword arguments are used because newer scikit-learn releases no longer
# accept these parameters positionally; older releases accept both forms.
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
# Key the mapping by the actual label values rather than by position, so it
# stays correct even if the labels are not exactly 0..n-1.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(classes, balanced_weights)
}
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          500000    
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 64)                42496     
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 543,817
Trainable params: 543,817
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Early-stopping callback watching val_loss (lower is better) with a patience
# of 3 epochs; restore_best_weights is enabled.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=3,
    restore_best_weights=True,
)

In [15]:
# Train with the per-class loss weights; the dev split is used for validation
# and drives the early-stopping callback.
model.fit(
    X_train,
    y_train,
    batch_size=130,
    epochs=30,
    verbose=2,
    class_weight=class_weights,
    validation_data=(X_dev, y_dev),
    callbacks=[es],
)

Train on 16659 samples, validate on 810 samples
Epoch 1/30
 - 8s - loss: 0.6437 - acc: 0.6237 - val_loss: 0.6054 - val_acc: 0.6728
Epoch 2/30
 - 6s - loss: 0.5486 - acc: 0.7349 - val_loss: 0.6125 - val_acc: 0.6457
Epoch 3/30
 - 6s - loss: 0.5074 - acc: 0.7655 - val_loss: 0.5130 - val_acc: 0.7432
Epoch 4/30
 - 6s - loss: 0.4678 - acc: 0.7924 - val_loss: 0.6572 - val_acc: 0.7012
Epoch 5/30
 - 6s - loss: 0.4285 - acc: 0.8158 - val_loss: 0.5316 - val_acc: 0.7346
Epoch 6/30
 - 6s - loss: 0.3923 - acc: 0.8352 - val_loss: 0.5767 - val_acc: 0.7395


<keras.callbacks.History at 0x7f3f3e78e8d0>

In [16]:

# Evaluate on the dev split and print the values returned by model.evaluate.
dev_metrics = model.evaluate(X_dev, y_dev)
print(dev_metrics)

[0.5130385905136297, 0.7432098762488659]


In [17]:
from sklearn.metrics import classification_report

# The model ends in a single sigmoid unit, so predict() yields one score per
# row. Class labels come from thresholding that score at 0.5; taking argmax
# over a single column would always return 0, so it is not used here.
y_prob = model.predict(X_dev, batch_size=30, verbose=1)
# Flatten to a 1-D integer vector so it has the same shape as y_dev.
y_pred = (y_prob > 0.5).astype(int).ravel()

print(classification_report(y_dev, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       537
           1       0.63      0.57      0.60       273

    accuracy                           0.74       810
   macro avg       0.71      0.70      0.70       810
weighted avg       0.74      0.74      0.74       810



In [0]:
# Reset the Keras backend session now that the experiment is finished.
import keras.backend

keras.backend.clear_session()