In [1]:
#Loading Libraries
import numpy as np
import pandas as pd

In [2]:
# Read Data

data=pd.read_csv('axcs_train.csv')

data.head()

Unnamed: 0,ID,URL,Date,Title,InfoTheory,CompVis,Math,Abstract
0,cs-9301111,arxiv.org/abs/cs/9301111,1989-12-31,Nested satisfiability,0,0,0,Nested satisfiability A special case of the s...
1,cs-9301112,arxiv.org/abs/cs/9301112,1990-03-31,A note on digitized angles,0,0,0,A note on digitized angles We study the confi...
2,cs-9301113,arxiv.org/abs/cs/9301113,1991-07-31,Textbook examples of recursion,0,0,0,Textbook examples of recursion We discuss pro...
3,cs-9301114,arxiv.org/abs/cs/9301114,1991-10-31,Theory and practice,0,0,0,Theory and practice The author argues to Sili...
4,cs-9301115,arxiv.org/abs/cs/9301115,1991-11-30,Context-free multilanguages,0,0,0,Context-free multilanguages This article is a...


In [3]:
data.loc[:,'label'] = pd.Series(data[['InfoTheory','CompVis','Math']].values.tolist(), index=data.index)

In [4]:
data.tail()

Unnamed: 0,ID,URL,Date,Title,InfoTheory,CompVis,Math,Abstract,label
54726,no-14128412,arxiv.org/abs/1412.8412,2014-12-31,Sanitization of Call Detail Records via Differ...,0,0,0,Sanitization of Call Detail Records via Diffe...,"[0, 0, 0]"
54727,no-14053347,arxiv.org/abs/1405.3347,2014-12-31,Self-dual codes and quadratic residue codes ov...,1,0,1,Self-dual codes and quadratic residue codes o...,"[1, 0, 1]"
54728,no-150100212,arxiv.org/abs/1501.00212,2014-12-31,Set-merging for the Matching Algorithm of Mica...,0,0,0,Set-merging for the Matching Algorithm of Mic...,"[0, 0, 0]"
54729,no-150100203,arxiv.org/abs/1501.00203,2014-12-31,Small Cell Traffic Balancing Over Licensed and...,0,0,0,Small Cell Traffic Balancing Over Licensed an...,"[0, 0, 0]"
54730,no-14101009,arxiv.org/abs/1410.1009,2014-12-31,Uplink Scheduling for LTE Video Surveillance S...,0,0,0,Uplink Scheduling for LTE Video Surveillance ...,"[0, 0, 0]"


In [5]:
data = data[['Abstract','label']]

In [6]:
data.head()

Unnamed: 0,Abstract,label
0,Nested satisfiability A special case of the s...,"[0, 0, 0]"
1,A note on digitized angles We study the confi...,"[0, 0, 0]"
2,Textbook examples of recursion We discuss pro...,"[0, 0, 0]"
3,Theory and practice The author argues to Sili...,"[0, 0, 0]"
4,Context-free multilanguages This article is a...,"[0, 0, 0]"


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

Using TensorFlow backend.


In [9]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data.label)
labels = multilabel_binarizer.classes_

In [10]:
maxlen = 180 #hyperparameter
max_words = 5000 #hyperparameter
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(data.Abstract)

In [11]:
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [13]:
from sklearn.model_selection import train_test_split
x = get_features(data.Abstract)
y = multilabel_binarizer.transform(data.label)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [16]:
#cnn model
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

In [17]:
filter_length = 300

In [32]:
model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(2))
model.add(Activation('sigmoid'))

In [33]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 602       
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 118,902
Trainable params: 118,902
Non-trainable params: 0
_________________________________________________________________


In [34]:
callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=1), 
    ModelCheckpoint(filepath='model.h5', save_best_only=True)
]

In [36]:
history = model.fit(x_train, y_train,
                    epochs=2,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 39405 samples, validate on 4379 samples
Epoch 1/2
Epoch 2/2


In [38]:
import keras
cnn_model = keras.models.load_model('model.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.16333399078851857
categorical_accuracy: 1.0
