In [1]:
import torch
import os
from ipywidgets import FloatProgress
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as mt
from tqdm.auto import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import re
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
from gensim.models import Word2Vec
from xgboost import XGBClassifier
import keras
from keras.models import load_model
from keras.layers import Activation, Dense, TimeDistributed, Dropout, Embedding, LSTM, GRU, Bidirectional
from keras.models import Sequential
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/iebi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the Task 2 training CSV; dtype=str makes pandas read every column as a string.
dataset = pd.read_csv('data/task2_trainset.csv', dtype=str)
# Preview the first rows.
dataset.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 2
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,THEORETICAL
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,THEORETICAL
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,ENGINEERING
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,EMPIRICAL
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,ENGINEERING


In [3]:
# Drop the metadata columns that the rest of the notebook never reads.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# makes this cell idempotent: re-running it no longer raises KeyError for
# columns that an earlier run already removed.
dataset = dataset.drop(columns=["Title", "Authors", "Categories", "Created Date"], errors="ignore")

### 資料切割 (Train/validation split)

In [4]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; random_state fixes the shuffle so
# the split is repeatable.
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=42)

# Persist both splits; index=False keeps the DataFrame index out of the CSVs.
trainset.to_csv('trainset.csv', index=False)
validset.to_csv('validset.csv', index=False)

In [5]:
# Read the public test set (every column as a string) and strip the same
# metadata columns that were removed from the training frame.
dataset = pd.read_csv('data/task2_public_testset.csv', dtype=str)
for unused_column in ('Title', 'Categories', 'Created Date', 'Authors'):
    dataset.drop(unused_column, axis=1, inplace=True)
# Save the reduced frame and keep an independent copy for inference.
dataset.to_csv('testset.csv', index=False)
testset = dataset.copy()

In [6]:
from multiprocessing import Pool
from nltk.tokenize import word_tokenize

def get_word(data_path):
    """Return the set of distinct tokens found in the "Abstract" column of a CSV.

    The CSV at `data_path` is read with every column as a string. Each
    abstract is split into sentences on the "$$$" separator and every
    sentence is tokenized with `word_tokenize`.
    """
    frame = pd.read_csv(data_path, dtype=str)

    # Flatten all abstracts into one list of sentences.
    sentences = [
        sentence
        for abstract in frame["Abstract"]
        for sentence in abstract.split("$$$")
    ]

    vocabulary = set()
    for sentence in sentences:
        vocabulary.update(word_tokenize(sentence))
    return vocabulary

In [7]:
%%time
# Collect the distinct tokens that occur in the training abstracts.
words = get_word("trainset.csv")

CPU times: user 6.02 s, sys: 0 ns, total: 6.02 s
Wall time: 6.08 s


### GloVe Parsing

In [8]:
# Parse the GloVe text file into a {word: float32 vector} lookup. On each
# line the first whitespace-separated field is the word and the remaining
# fields are the vector coefficients.
embeddings_index = {}
stem = PorterStemmer()
# `with` closes the file even if parsing raises, and the explicit encoding
# avoids decode errors on platforms whose default encoding is not UTF-8.
with open('data/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


### Glove Embedding/Stemmer

In [9]:
def qq(text):
    """Return `text` with every "$$$" sentence separator replaced by a space."""
    return text.replace("$$$", " ")

# DataFrame.apply hands each *column* (a Series) to the function, and
# Series.replace only swaps cells whose entire value equals "$$$", so the
# previous `trainset.apply(qq)` left every abstract unchanged. Map the
# function over the individual Abstract strings instead; na_action="ignore"
# leaves missing abstracts untouched rather than calling qq on NaN.
aa = trainset.assign(Abstract=trainset["Abstract"].map(qq, na_action="ignore"))

In [10]:
from keras.preprocessing.text import Tokenizer
# Build the word -> integer index from the training abstracts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainset.Abstract)

# One more than the number of indexed words: Keras leaves index 0 unassigned,
# so the embedding table needs len(word_index) + 1 rows.
vocab_size = len(tokenizer.word_index)+1
print("Total words", vocab_size)

Total words 26582


In [11]:
%%time
# Fixed number of token indices per abstract (the maxlen given to pad_sequences).
SEQUENCE_LENGTH = 500
# Both splits are encoded with the tokenizer that was fitted on the training set.
x_train = pad_sequences(tokenizer.texts_to_sequences(trainset.Abstract), maxlen=SEQUENCE_LENGTH)
x_val = pad_sequences(tokenizer.texts_to_sequences(validset.Abstract), maxlen=SEQUENCE_LENGTH)

CPU times: user 458 ms, sys: 7.71 ms, total: 465 ms
Wall time: 464 ms


In [12]:
# Sanity check: one row per abstract, SEQUENCE_LENGTH columns.
print('Shape of x_data tensor:', x_train.shape)
print('Shape of x_val tensor:', x_val.shape)

Shape of x_data tensor: (6300, 500)
Shape of x_val tensor: (700, 500)


In [13]:
%%time
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i.
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for token, row_idx in tokenizer.word_index.items():
    # Words without a GloVe entry keep their all-zero row.
    if token in embeddings_index:
        embedding_matrix[row_idx] = embeddings_index[token]

CPU times: user 55.1 ms, sys: 24 ms, total: 79 ms
Wall time: 78.3 ms


In [24]:
# Number of distinct words indexed by the tokenizer (one less than vocab_size).
len(tokenizer.word_index)

26581

In [22]:
# Spot-check the length of one GloVe vector; it should equal EMBEDDING_DIM.
embeddings_index.get('the').shape

(300,)

In [20]:
# Peek at a few (word, index) pairs only; displaying the whole mapping would
# print one entry per vocabulary word.
list(tokenizer.word_index.items())[:10]



In [14]:
# Expect (vocab_size, EMBEDDING_DIM).
embedding_matrix.shape

(26582, 300)

### Preprocessing (label one-hot encoding)

In [15]:
from tqdm import tqdm_notebook as tqdm

def label_to_onehot(labels):
    """Encode a whitespace-separated label string as a 4-element 0/1 list.

    Positions are, in order: THEORETICAL, ENGINEERING, EMPIRICAL, OTHERS.
    A label name outside those four raises KeyError.
    """
    positions = {'THEORETICAL': 0, 'ENGINEERING':1, 'EMPIRICAL':2, 'OTHERS':3}
    # Resolve every label first so an unknown name fails before encoding.
    active = {positions[name] for name in labels.split()}
    return [1 if slot in active else 0 for slot in range(4)]

def label_to_all(datasets):
    """Return one one-hot label list per entry of ``datasets["Task 2"]``."""
    return [label_to_onehot(task_labels) for task_labels in datasets["Task 2"]]
        
def sentence_to_indices(sentence, word_dict):
    """Tokenize `sentence`, then map each token to an index via `word_dict.to_index`."""
    indices = []
    for token in word_tokenize(sentence):
        indices.append(word_dict.to_index(token))
    return indices
    
def get_dataset(data_path, word_dict):
    """Read the CSV at `data_path` (all columns as strings) and preprocess its rows."""
    return preprocess_sample(pd.read_csv(data_path, dtype=str), word_dict)

def preprocess_sample(data, word_dict):
    """Convert every row of `data` into a dict of index sequences (and labels).

    Each result has an 'Abstract' entry: one list of word indices per
    "$$$"-separated sentence. When `data` has a 'Task 2' column, a 'Label'
    entry holding a single one-hot list is added as well.
    """
    has_labels = 'Task 2' in data
    samples = []
    for _, record in data.iterrows():
        sample = {
            'Abstract': [
                sentence_to_indices(piece, word_dict)
                for piece in record["Abstract"].split("$$$")
            ]
        }
        if has_labels:
            sample['Label'] = [label_to_onehot(record["Task 2"])]
        samples.append(sample)
    return samples

In [16]:
# One-hot label matrices (one row per sample, one column per label) for both splits.
y_train =  np.array(label_to_all(trainset))
y_val = np.array(label_to_all(validset))

In [17]:
# Inputs and labels of each split must have the same number of rows.
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_val", x_val.shape)
print("y_val", y_val.shape)

x_train (6300, 500)
y_train (6300, 4)

x_val (700, 500)
y_val (700, 4)


### Model

In [18]:
from keras.layers import Embedding, Flatten
print('Build model...')
# Frozen GloVe embeddings -> bidirectional GRU -> four sigmoid outputs, one
# per label, so several labels can be active for the same abstract.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix],
              input_length=SEQUENCE_LENGTH, trainable=False),
    Bidirectional(GRU(512)),
    Dense(4, activation="sigmoid"),
])
model.summary()

Build model...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 300)          7974600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              2497536   
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 4100      
Total params: 10,476,236
Trainable params: 2,501,636
Non-trainable params: 7,974,600
_________________________________________________________________


In [19]:
EPOCHS = 25
BATCH_SIZE = 8
# Checkpoint file names embed the epoch number and the validation accuracy.
filepath="model_{epoch:02d}-{val_accuracy:.2f}.hdf5"
# Make sure the checkpoint directory exists before the first save; writing an
# .hdf5 file into a missing directory fails at the end of the first epoch.
os.makedirs("keras_model", exist_ok=True)
opt = keras.optimizers.Adam(lr=2e-4)
# save_best_only=True: a file is written only when val_accuracy improves.
checkpoint = keras.callbacks.ModelCheckpoint(os.path.join("keras_model", filepath),
                                             monitor='val_accuracy',
                                             verbose=1,
                                             save_best_only=True)
# Binary cross-entropy pairs with the sigmoid outputs of the multi-label head.
model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer=opt)
history = model.fit(x_train, y_train,
                    validation_data=(x_val, y_val),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    callbacks=[checkpoint],
                    verbose=1)

Train on 6300 samples, validate on 700 samples
Epoch 1/25
 984/6300 [===>..........................] - ETA: 5:20 - loss: 0.5533 - accuracy: 0.6758

KeyboardInterrupt: 

In [None]:
# `clean_text` is not defined anywhere in the visible notebook, so the former
# `testset["Abstract"].apply(clean_text)` raised NameError on a fresh kernel.
# The tokenizer was fitted on the raw training abstracts, so the test
# abstracts are passed through unchanged to match that preprocessing.
test_stem = testset["Abstract"]

In [None]:
# Encode the test abstracts with the tokenizer fitted on the training set and
# pad them to the same SEQUENCE_LENGTH as the training inputs.
X_test = pad_sequences(tokenizer.texts_to_sequences(test_stem), maxlen=SEQUENCE_LENGTH)

In [None]:
# Expect (number of test abstracts, SEQUENCE_LENGTH).
X_test.shape

In [None]:
# Replace the in-memory model with a saved checkpoint.
# NOTE(review): the file name is hard-coded, while checkpoint names are
# generated from the pattern "model_{epoch:02d}-{val_accuracy:.2f}.hdf5";
# this exact file exists only if a training run produced it. Confirm before running.
model = load_model('keras_model/model_02-0.77.hdf5')

In [None]:
# Model outputs for the test inputs; presumably one sigmoid score per label
# (shape (n_samples, 4)) if the checkpoint matches the model built above. TODO confirm.
y_test = model.predict(X_test, verbose=1)

In [None]:
def proba_to_bool(prediction, threshold=0.5):
    """Binarize an array of scores against a threshold.

    Args:
        prediction: array-like of scores, typically a 2-D
            (n_samples, n_labels) array.
        threshold: a score must be strictly greater than this value to
            become 1; scores equal to the threshold become 0.

    Returns:
        A float64 numpy array with the same shape as `prediction`, holding
        1.0 where the score exceeds `threshold` and 0.0 elsewhere.
    """
    # A vectorized comparison replaces the element-by-element Python loop;
    # np.asarray also lets plain nested lists be passed in.
    scores = np.asarray(prediction)
    return (scores > threshold).astype(np.float64)

In [None]:
# Turn the predicted scores into 0/1 label indicators using the default 0.5 threshold.
result = proba_to_bool(y_test)

### Plot

In [None]:
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]
loss = history.history["loss"]
val_loss = history.history["val_loss"]
# Derive the x axis from the number of recorded epochs: a run that stopped
# before EPOCHS has fewer entries, and plotting them against
# range(1, EPOCHS+1) fails with a length mismatch.
epochs = range(1, len(acc) + 1)

# First figure: accuracy per epoch (dots = training, line = validation).
plt.plot(epochs, acc, 'bo', label="train")
plt.plot(epochs, val_acc, 'r', label="validation")
plt.title("Accuracy")
plt.xlabel("epoch")
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.plot(epochs, loss, 'bo', label="train")
plt.plot(epochs, val_loss, 'r', label="validation")
plt.title("Loss")
plt.xlabel("epoch")
plt.legend()
plt.show()

In [None]:
def SubmitGenerator(prediction, sampleFile, public=True, filename='prediction.csv'):
    """Write a submission CSV shaped like the sample submission file.

    Args:
        prediction: 2-D array whose columns 0..3 are the THEORETICAL,
            ENGINEERING, EMPIRICAL and OTHERS indicators.
        sampleFile: path of the sample submission CSV; its `order_id`
            column is copied and its row count sets the output length.
        public: when True the predictions fill the first rows and zeros pad
            the end; when False the zeros come first.
        filename: path of the CSV to write (without the index).
    """
    sample = pd.read_csv(sampleFile)
    # Rows of the sample file that have no prediction are filled with zeros.
    padding = [0] * (len(sample) - prediction.shape[0])

    submit = {'order_id': list(sample.order_id.values)}
    label_names = ('THEORETICAL', 'ENGINEERING', 'EMPIRICAL', 'OTHERS')
    for column, name in enumerate(label_names):
        scores = list(prediction[:, column])
        submit[name] = scores + padding if public else padding + scores

    pd.DataFrame(submit).to_csv(filename, index=False)

In [None]:
# Write the binarized public-test predictions in the layout of the sample
# submission file (public=True: predictions first, zero padding after).
SubmitGenerator(result, 
                'data/task2_sample_submission.csv',
                True, 
                'data/task2_submission.csv')