In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the labelled training set; train.csv is expected in the working directory.
df = pd.read_csv("train.csv")

In [None]:
# Preview the first rows: ID, TITLE, ABSTRACT, then one 0/1 indicator column per subject.
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [None]:
# (rows, columns) of the training frame.
df.shape

(20972, 9)

In [None]:
# Summary statistics of the numeric columns; for the 0/1 label columns the mean is the label frequency.
df.describe()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
count,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0
mean,10486.5,0.409784,0.286716,0.267881,0.248236,0.02799,0.011873
std,6054.239259,0.491806,0.452238,0.442866,0.432,0.164947,0.108317
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5243.75,0.0,0.0,0.0,0.0,0.0,0.0
50%,10486.5,0.0,0.0,0.0,0.0,0.0,0.0
75%,15729.25,1.0,1.0,1.0,0.0,0.0,0.0
max,20972.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Column dtypes: the two text columns are object, everything else is int64.
df.dtypes

ID                       int64
TITLE                   object
ABSTRACT                object
Computer Science         int64
Physics                  int64
Mathematics              int64
Statistics               int64
Quantitative Biology     int64
Quantitative Finance     int64
dtype: object

In [None]:
# Number of positive examples per label (the label columns are everything after ABSTRACT).
label_totals = df[df.columns[3:]].sum()
for label_name, total in label_totals.items():
    print(label_name, total)

Computer Science 8594
Physics 6013
Mathematics 5618
Statistics 5206
Quantitative Biology 587
Quantitative Finance 249


In [None]:
# Fraction of papers carrying each label. A paper can carry several labels,
# so these fractions are not expected to add up to 1.
n_rows = df.shape[0]
for label_name in df.columns[3:]:
    share = df[label_name].sum() / n_rows
    print(label_name, share)

Computer Science 0.4097844745374785
Physics 0.28671562082777036
Mathematics 0.2678809841693687
Statistics 0.24823574289528896
Quantitative Biology 0.027989700553118443
Quantitative Finance 0.011872973488460805


In [None]:
# Lower-case every column name; later cells refer to 'title' and 'abstract'.
df.columns = df.columns.str.lower()

In [None]:
# Length, in characters, of the longest title.
max(len(title) for title in df['title'])

239

In [None]:
# Length, in characters, of the longest abstract.
max(len(abstract) for abstract in df['abstract'])

2761

In [None]:
# Combine Title and Text - Can try a small network separately on title
# Separate train and test data
# Preprocess, tokenize and text to seq, padding
# Generate labels - 6 columns of output
# Encoding into glove vectors
# Define multioutput model
# Train and predict

## Combine Title and text

In [None]:
# Prepend the title (followed by ". ") to the abstract so the model reads both as one text.
# This overwrites 'abstract' in place, so running the cell twice prepends the title twice.
df['abstract'] = df['title'] + ". " + df['abstract']

In [None]:
# Check that each abstract now starts with its title.
df.head()

Unnamed: 0,id,title,abstract,computer science,physics,mathematics,statistics,quantitative biology,quantitative finance
0,1,Reconstructing Subject-Specific Effect Maps,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation Invariance Neural Network. Rotation...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,A finite element approximation for the stochas...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0


## Separate Train and Test Data

In [None]:
# Hold out a random 20% of the rows as a local test set.
test_size = 0.2
SEED = 42  # fixed seed so the split, and every result derived from it, is reproducible

# Shuffle with a dedicated Generator so the global NumPy random state is left untouched.
indices = list(df.index)
np.random.default_rng(SEED).shuffle(indices)

# Compute the cut point once: the first n_test shuffled rows are test, the rest train.
n_test = int(test_size*len(indices))
test_indices  = indices[:n_test]
train_indices = indices[n_test:]

In [None]:
# Sizes of the two splits; together they should cover every row of df.
len(train_indices), len(test_indices)

(16778, 4194)

In [None]:
# First few shuffled row labels assigned to the training split.
train_indices[:5]

[8875, 16837, 19082, 3432, 11670]

In [None]:
# Pull the combined title + abstract text for each split, in shuffled order.
abstracts = df['abstract']
train_texts = abstracts.loc[train_indices]
test_texts  = abstracts.loc[test_indices]

In [None]:
# Show one combined title + abstract from the training split.
# Positional access is used because the split is random: a label lookup such as
# train_texts[3] raises KeyError whenever row 3 ends up in the test split.
train_texts.iloc[0]

'A finite element approximation for the stochastic Maxwell--Landau--Lifshitz--Gilbert system.   The stochastic Landau--Lifshitz--Gilbert (LLG) equation coupled with the\nMaxwell equations (the so called stochastic MLLG system) describes the creation\nof domain walls and vortices (fundamental objects for the novel nanostructured\nmagnetic memories). We first reformulate the stochastic LLG equation into an\nequation with time-differentiable solutions. We then propose a convergent\n$\\theta$-linear scheme to approximate the solutions of the reformulated system.\nAs a consequence, we prove convergence of the approximate solutions, with no or\nminor conditions on time and space steps (depending on the value of $\\theta$).\nHence, we prove the existence of weak martingale solutions of the stochastic\nMLLG system. Numerical results are presented to show applicability of the\nmethod.\n'

## Preprocess, tokenize and text to seq 

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
#Parameters
# Token handed to the Tokenizer as its oov_token (stand-in for out-of-vocabulary words).
OOV_TOK = '<OOV>'
# Passed as `padding` to pad_sequences, i.e. pad after the tokens rather than before.
padding_type='post'

In [None]:

# Longest combined training text, measured in characters (not tokens).
max(len(text) for text in train_texts)

2862

In [None]:
# Tokenization: build the vocabulary from the training texts only.
tokenizer = Tokenizer(oov_token = OOV_TOK)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Texts to sequences: encode both splits as lists of integer word ids
# using the vocabulary fitted above.
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

In [None]:
# Padding: bring every sequence to the length of the longest training sequence.
max_length = max(map(len, train_seq))
print(max_length)

# Same padding settings for both splits so their widths match.
pad_kwargs = dict(padding=padding_type, maxlen=max_length)
train_padded = pad_sequences(train_seq, **pad_kwargs)
test_padded  = pad_sequences(test_seq, **pad_kwargs)

467


In [None]:
# Both padded matrices should have max_length columns.
test_padded.shape, train_padded.shape

((4194, 467), (16778, 467))

In [None]:
# +1 because Keras tokenizer ids start at 1; id 0 is the value pad_sequences fills with.
vocab_size = len(word_index) + 1

## Generate labels - 6 columns of output

In [None]:
# Label matrix with one 0/1 column per subject, in this order:
# computer science, physics, mathematics, statistics, quantitative biology, quantitative finance
n_labels = 6
label_columns = df.columns[3:]
labels = np.zeros((df.shape[0], n_labels))
for col_idx, col_name in enumerate(label_columns):
  # Print the column-to-position mapping so the output order is visible.
  print(col_name, col_idx)
  labels[:, col_idx] = df[col_name].values

computer science 0
physics 1
mathematics 2
statistics 3
quantitative biology 4
quantitative finance 5


In [None]:
# First rows of the label matrix; they should match the label columns shown by df.head().
labels[:5]

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0.]])

In [None]:
# Sanity check: number of rows where column 0 of `labels` equals the first label column of df.
# It should equal the number of rows in df.
(labels[:,0] == df.iloc[:,3]).values.sum()

20972

In [None]:
# Train/test labels: pick label rows with the same shuffled indices used for the texts,
# so row k of each label matrix belongs to row k of the matching padded matrix.
train_labels, test_labels = labels[train_indices], labels[test_indices]

In [None]:
# Row counts should match the padded text matrices; six columns, one per label.
train_labels.shape, test_labels.shape

((16778, 6), (4194, 6))

## Import Glove Embeddings

In [None]:
# -nc (no-clobber) skips the ~822 MB download when glove.6B.zip is already present,
# so re-running the notebook neither repeats it nor leaves a stray glove.6B.zip.1 copy.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2020-08-21 04:16:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-08-21 04:16:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-08-21 04:16:26--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [None]:
import zipfile

# Unpack the GloVe archive into the working directory. The context manager
# closes the archive even if extraction fails part-way through.
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
  zip_ref.extractall('.')

In [None]:
# Parse the 100-dimensional GloVe file into a {word: vector} lookup.
# Each line is "<word> <v1> <v2> ... <v100>", separated by single spaces.
embeddings_matrix = {}
# The encoding is pinned to UTF-8 so parsing does not depend on the platform default.
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as file:
  # Stream the file line by line instead of readlines(), so the raw text of all
  # lines is never held in memory alongside the parsed vectors.
  for each_line in file:
    row = each_line.rstrip("\n").split(" ")
    # A float32 array is far more compact than a list of Python floats.
    embeddings_matrix[row[0]] = np.asarray(row[1:], dtype='float32')

In [None]:
# Number of words that have a pretrained vector.
len(embeddings_matrix)

400000

In [None]:
## Embedding weights
# Row k must hold the vector of the word whose tokenizer id is k, because the
# Embedding layer looks rows up by the ids that texts_to_sequences produced.
embedding_dim = 100
embedding_weights = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
  # Use the tokenizer's own id. Those ids start at 1 (0 is the padding id), so
  # counting with enumerate() from 0 would place every vector one row too low.
  embedding_vec = embeddings_matrix.get(word)
  if embedding_vec is not None:
    # Words without a GloVe vector keep their all-zero row.
    embedding_weights[i] = embedding_vec

In [None]:
# The weight matrix must have exactly one row per vocabulary id.
vocab_size, len(embedding_weights)

(51757, 51757)

## Define Multioutput Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam

In [None]:
# Take the optimizer from tensorflow.keras, the same package the layers and Model
# come from; mixing in the standalone `keras` package can fail at compile time.
from tensorflow.keras.optimizers import SGD
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
sgd = SGD(learning_rate=0.1)

In [None]:
def define_model(max_length, vocab_size,n_output=1, chanDim=-1):
  """Build and compile a 1-D CNN with a shared encoder and six sigmoid heads.

  The encoder is an Embedding layer followed by two Conv1D -> BatchNormalization
  -> MaxPooling1D stages and a Flatten. Each head is Dense(32, relu) followed by
  Dense(n_output, sigmoid), and the heads are named "output_1" .. "output_6".

  Reads the notebook-level names `embedding_dim`, `embedding_weights` (initial
  embedding matrix, left trainable) and `sgd` (optimizer instance).

  Args:
    max_length: number of token ids per input sequence.
    vocab_size: number of rows in the embedding table.
    n_output: units in each head's final sigmoid layer.
    chanDim: axis handed to both BatchNormalization layers.

  Returns:
    A compiled Model whose outputs are the six heads, in order. Each head uses
    binary cross-entropy and reports accuracy; the per-head loss weights are
    0.1, 0.15, 0.1, 0.15, 0.25, 0.25.
  """
  head_names = ["output_%d" % k for k in range(1, 7)]
  head_loss_weights = [0.1, 0.15, 0.1, 0.15, 0.25, 0.25]

  # Shared feature extractor
  tokens = Input(shape=(max_length,))
  x = Embedding(vocab_size, embedding_dim, weights=[embedding_weights], trainable=True)(tokens)
  for _ in range(2):
    x = Conv1D(filters=64, kernel_size=5, activation='relu')(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = MaxPooling1D(pool_size=2)(x)
  features = Flatten()(x)

  # One small dense classifier per label, all fed by the same flattened features.
  heads = []
  for head_name in head_names:
    hidden = Dense(32, activation='relu')(features)
    heads.append(Dense(n_output, activation='sigmoid', name=head_name)(hidden))

  model = Model(inputs=tokens, outputs=heads)
  model.compile(loss={name: 'binary_crossentropy' for name in head_names},
                loss_weights=dict(zip(head_names, head_loss_weights)),
                optimizer=sgd,
                metrics={name: 'accuracy' for name in head_names})
  return model

In [None]:
# Six label columns, matching the six output heads of the model.
labels.shape

(20972, 6)

In [None]:
# Build the network for the padded sequence length and the tokenizer vocabulary;
# n_output and chanDim are left at their defaults (1 and -1).
model = define_model(max_length=max_length, vocab_size=vocab_size)

In [None]:
# Print layer-by-layer output shapes and parameter counts.
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 467)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 467, 100)     5175700     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 463, 64)      32064       embedding_1[0][0]                
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 463, 64)      256         conv1d_2[0][0]                   
_______________________________________________________________________________________

In [None]:
# One target vector per output head, in the same order as the model's outputs
# (column k of train_labels feeds head k).
train_targets = [np.array(train_labels[:, head]) for head in range(n_labels)]
history = model.fit(x=train_padded, y=train_targets, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Confirm the held-out matrix has the same width the model was trained on.
test_padded.shape, train_padded.shape

((4194, 467), (16778, 467))

In [None]:
# Predict on the held-out split. The model has six outputs, so y_hat holds one
# array of sigmoid scores per head rather than a single matrix.
y_hat = model.predict(test_padded)

In [None]:
# Overall label accuracy on the held-out split.
# y_hat is a list of six (n_samples, 1) arrays, one per head. Stack them column-wise
# so the predictions line up with test_labels, which is (n_samples, 6). Flattening
# the raw list orders values head-by-head while test_labels flattens sample-by-sample,
# which would compare unrelated entries.
test_pred = np.round(np.hstack(y_hat))
# Fraction of correct (sample, label) decisions. Taking the mean also avoids the
# precedence slip in `count / rows * cols`, which divides by rows and then
# multiplies by cols instead of dividing by rows * cols.
(test_pred == np.array(test_labels)).mean()

23.842632331902717

In [None]:
def preprocess(df, max_length,padding_type):
  """Turn a raw frame with TITLE and ABSTRACT columns into padded id sequences.

  The text is built the same way as for training (title + ". " + abstract) and
  encoded with the notebook-level `tokenizer` exactly as it was fitted on the
  training texts. The tokenizer is deliberately NOT re-fitted here: fitting on
  new texts changes the word -> id mapping, so the ids would no longer match
  the embedding rows the model was trained with.

  The input frame is not modified, so the call can be repeated safely.

  Args:
    df: frame with string columns 'TITLE' and 'ABSTRACT'.
    max_length: sequence length to pad or truncate to (use the training value).
    padding_type: value for the `padding` argument of pad_sequences.

  Returns:
    The padded sequences returned by pad_sequences, one row per input row.
  """
  # Build the combined text without writing it back into the caller's frame.
  texts = df['TITLE'] + ". " + df['ABSTRACT']
  seq = tokenizer.texts_to_sequences(texts)
  padded = pad_sequences(seq, padding=padding_type, maxlen=max_length)
  return padded

In [None]:
# Load the unlabelled competition test set; test.csv is expected in the working directory.
df_test = pd.read_csv('test.csv')

In [None]:
# Encode the competition texts with the same max_length and padding used for training.
padded = preprocess(df_test, max_length,padding_type)

In [None]:
# Predict on the competition test set and assemble the submission frame.
predicted = model.predict(np.array(padded))
label_columns = ['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']
# predict returns one (n_samples, 1) array per head. hstack turns that list into an
# (n_samples, 6) matrix whose columns follow the head order, which is the label order
# above. Rounding the raw list instead gives shape (6, n_samples, 1), which does not
# match the frame.
hard_labels = np.round(np.hstack(predicted)).astype(int)
# Store 0/1 integers, like the label columns of the training data.
df_sub = pd.DataFrame(hard_labels, columns=label_columns, index=df_test.index)
df_sub.insert(0, 'ID', df_test['ID'])

In [None]:
# Write the submission file without the pandas row index.
df_sub.to_csv('submission.csv', index=False)