In [None]:
"""The goal of this lab is to classify Wikipedia abstracts about people by their professions. For example, the professions of Elvis Presley are "singer" and "actor".

=== Input ===

The input for training is a file wiki-train.json, which contains Wikipedia abstracts in the following form:
   {"title": "George_Washington",
    "summary": "George Washington was one of the ..."
    "occupations": ["yago:politician"]}

The input for testing is a file wiki-test.json, which contains Wikipedia abstracts of the same shape without the occupations:

   {"title": "Douglas_Adams",
    "summary": "Douglas Noel Adams was ..."}

=== Output ===

The output shall be a JSON file that assigns each Wikipedia abstract to a set of occupations:
   {"title": "Douglas_Adams",
    "occupations": ["Q36180", "Q28389"]}

We provide a gold standard of this form for the development input file.

=== Datasets ===

We provide 3 datasets:
1) a training dataset, which has the labels
2) a development dataset, which has the labels
3) a testing dataset, which does not have the labels, and which we use for grading

=== What to do ===

Adapt the method create_model(), so that it creates a neural network model that classifies the sentence.
There is no need to modify the other parts of the code -- although you are allowed to do so.

=== Suggestions ===
1) Select a suitable theta value
Reference: held-out set, cross validation, grid search...

2) Use pre-trained embeddings
reference: word2vector, GloVe, FastText...

3) Add extra features
reference: stop words, part-of-speech...

4) Try other neural networks
reference: CNN, RNN, Attention, Transformer

5) Avoid overfitting
reference: regularization, dropout...

6) Adjust other parameters
reference: learning rate, batch_size, epoch, layer's dimension

==== Submission ===

1) Take your code, any necessary resources to run the code, and the output of your code on the test dataset (no need to put the other datasets!)
2) ZIP these files in a file called firstName_lastName.zip
3) submit it here before the deadline announced during the lab:

https://www.dropbox.com/request/zwBcRYj17giDjCyPqFQM

"""

In [1]:
# Install necessary modules
import sys
import subprocess
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#file1 = open('requirements.txt', 'r')
#requirements = file1.readlines()
#for req in requirements:
#    reqs = subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', req.strip("\n"), '--quiet'])

In [2]:
"""
This cell imports modules necessary to run this lab.
"""
import gzip
import json
import nltk
from tqdm import tqdm
import numpy as np
import keras
from keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
"""
This cell provides some basic function to extracting occupations.
There is no need to modify this file unless you want.
"""

class InputSample(object):
    def __init__(self, title, summary, occupation):
        self.title = title
        self.summary = summary
        self.occupation = occupation


def pad_sentence(sentence, max_len):
    '''
    make all sentences have the same length
    :param sentence:
    :param max_len:
    :return:
    '''
    seg_id = pad_sequences([sentence], maxlen=max_len, padding='post')
    return seg_id[0]


def get_label():
    occupations = [
        'yago:Politician',
        'yago:Researcher',
        'yago:Football_player',
        'yago:Writer',
        'yago:Actor',
        'yago:Painter',
        'yago:Journalist',
        'yago:University_teacher',
        'yago:Singer',
        'yago:Poet',
        'yago:Composer',
        'yago:Military_personnel',
        'yago:Lawyer',
        'yago:Film_actor',
        'yago:Businessperson',
        'yago:Historian',
        'yago:Musician',
        'yago:Film_director',
        'yago:Screenwriter',
        'yago:Physician'
    ]

    labels = {occ_id: index for index, occ_id in enumerate(occupations)}
    id_to_labels = {index: occ_id for index, occ_id in enumerate(occupations)}
    return labels, id_to_labels


def load_data(filename):
    '''
    load original data
    :param filename:
    :return:
    '''
    with gzip.open(filename, 'rt') as fp:
        for line in fp:
            people = json.loads(line)
            occ_key = 'occupations'
            occupations = people[occ_key] if occ_key in people else None
            sample = InputSample(people['title'], people['summary'], occupations)
            yield sample


def gen_vocabulary(data_file, vocab_file):
    '''
    generate a word list given an input corpus
    :param data_file:
    :param vocab_file:
    :return:
    '''
    vocab = set()
    for sample in tqdm(load_data(data_file)):
        sentence = str.lower(sample.summary)
        tokens = nltk.word_tokenize(sentence)
        vocab.update(set(tokens))

    with open(vocab_file, 'w', encoding='utf8')as f:
        f.write('\n'.join(list(vocab)))

    print('done! The size of vocabulary is {a}.'.format(a=len(vocab)))


def load_vocabulary(vocab_file):
    '''
    load vocabulary and create an id for each token.
    <pad> means padding token, <unk> means unknown token
    :param vocab_file:
    :return:
    '''
    vocab_to_id = dict()
    with open(vocab_file, encoding='utf8')as f:
        words = f.readlines()
        for w_id, word in enumerate(words):
            word = word.replace('\n', '')
            vocab_to_id[word] = w_id+1
    vocab_to_id['<pad>'] = 0
    vocab_to_id['<unk>'] = len(vocab_to_id)
    return vocab_to_id


def read_dataset(data_file, vocab_to_id, sent_len, debug=False):
    '''
    read training set or test set
    :param data_file:
    :param vocab_to_id:
    :param sent_len: the
    :param debug: load only a small fraction of samples to debug
    :return: model's input and labels
    need about 1min31s for training set and 2min for test set
    '''

    labels, _ = get_label()
    unknown_id = len(vocab_to_id) - 1
    data_x, data_y = list(), list()
    cnt = 0
    
    for sample in tqdm(load_data(data_file)):
        # print(sample)

        # for debugging
        cnt += 1
        if debug and cnt > 100:
            break

        summary = str.lower(sample.summary)
        tokens = nltk.word_tokenize(summary)
        token_ids = [vocab_to_id.get(t, unknown_id) for t in tokens]
        token_ids = pad_sentence(token_ids, sent_len)
        data_x.append(token_ids)
        occupations = sample.occupation

        # train
        if occupations:
            y_vector = [1 if label in occupations else 0 for label in labels]
            data_y.append(y_vector)
        # test
        else:
            data_y.append(0)

    return np.array(data_x), np.array(data_y)


def f1_score(true_labels, pred_labels):
    """Compute the F1 score."""
    nb_correct, nb_pred, nb_true = 0, 0, 0
    for true, pred in zip(true_labels, pred_labels):
        nb_correct += len(true & pred)
        nb_pred += len(pred)
        nb_true += len(true)

    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    score = 2 * p * r / (p + r) if p + r > 0 else 0

    return score, p, r



In [4]:
# Import the necessary modules and methods

# Import functions from keras
from keras.layers import Input, Embedding, Dense
from keras.models import Model, load_model
import keras.backend as K
from  keras import Sequential
from keras.layers.pooling import GlobalAveragePooling1D
from keras.preprocessing.text import one_hot


# Import some basic packages
import numpy as np
import os
import json

In [5]:
# input files
# [ train_file ] is a training dataset that contains 266K samples.
# [ test_file ] is a testing dataset that contains 200K samples. You can test your model based on this file.
# [ predict_file ] is a predicting dataset that contains 201K samples. Each sample in this file does not have occupation labels.

train_file = 'wiki-train.json.gz'
test_file = 'wiki-dev.json.gz'
predict_file = 'wiki-test.json.gz'

# output files
# [ vocab_file ] has a word vocabulary that defines which words participate in this task.
# The default vocabulary is generated by our methods from training dataset,
# but you can create it in a way you like.
# [ model_file ] is used for store your trained model
# [ result_file ] is file that stores your predicted occupations.
# (This is the file you have to submit, once you ran on the test dataset)

vocab_file = 'vocab.txt'
model_file = 'my_model.h5'
result_file = 'result.json'

In [6]:
# Hyper-parameters: You don't have to change these, but you can.
# [ embedding_dimension ] the dimensions of word embeddings
# [ maximal_sentence_length ] the maximum length of each sentence
# [ number_of_labels ] the number of occupations
# [ epochs ] training epochs. Adjust this parameters to avoid overfitting and underfitting.
# [ batch_size ] the number of samples. It determines how many samples would be fed into your model.
# The size of this parameter also depends on how good your hardware is.
# [ theta ] A threshold to determine whether to assign a specific occupation label given a input sample.
# A suitable theta value will help your model

embedding_dimension = 200
maximal_sentence_length = 100
number_of_labels = 20
epochs = 5
batch_size = 32
theta = 0.5

In [20]:
TFHUB_CACHE_DIR = os.path.join(os.curdir, "my_tfhub_cache")
os.environ["TFHUB_CACHE_DIR"] = TFHUB_CACHE_DIR

import tensorflow_hub as hub
import tensorflow as tf



In [40]:
# YOUR CODE GOES HERE

from functools import partial
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

def create_model(vocab):
    '''
    :param vocab: a vocabulary dictionary which looks like {'python':0, 'java':1 ......}
    :return:
    '''
    encoder = keras.models.Sequential([
        keras.layers.Embedding(len(vocab), embedding_dimension, input_length=maximal_sentence_length),
        keras.layers.LSTM(128)
])

    decoder = keras.models.Sequential([
        keras.layers.LSTM(128, return_sequences=True),
        keras.layers.Dense(number_of_labels, activation="sigmoid")
])

    model = keras.models.Sequential([
        encoder,
        decoder
])
    model.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["binary_accuracy"])
    return model

'''
    DefaultConv2D = partial(keras.layers.Conv2D,kernel_size=3, activation='relu', padding="SAME")

    model = Sequential()
    # initialized word embeddings randomly
    
    embedding_matrix = Word2Vec(vocab, vector_size=maximal_sentence_length, window=5, min_count=1, workers=4)
   
    #model.add(Embedding(len(vocab), embedding_dimension, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable = False))
    model.add(Embedding(len(vocab), embedding_dimension, input_length=maximal_sentence_length))

    #model.add(GlobalAveragePooling1D())
    # two layer densely-connected NNs
    # # different parameters of a dense layer can have different performances
    # # here, we let the number of units equal 32, and use a relu activation.
    #model.add(Dense(32, activation='relu'))
    #model.add(DefaultConv2D(filters = 32, kernel_size = 7, input_shape = []))
    #model.add(MaxPooling2D(pool_size=2))
    #model.add(Dropout(0.25))
    #model.add(Flatten())
    
    model.add(keras.layers.GRU(128, return_sequences = True))
    model.add(keras.layers.GRU(128))
    #model.add(keras.layers.GRU(128))

    model.add(Dense(number_of_labels, activation="sigmoid"))
    

    # compile the model
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['binary_accuracy'])

    model = keras.Sequential([
        hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", 
                       dtype="string", input_shape=[100], output_shape=[50]),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(number_of_labels, activation="sigmoid")
])
              

              inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)

merged_tensor = concatenate([maxpool_0, maxpool_1], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((2*num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
conc = Dense(40)(dropout)
output = Dense(units=6, activation='sigmoid',kernel_regularizer=regularizers.l2(0.01))(conc)
'''

'\n    DefaultConv2D = partial(keras.layers.Conv2D,kernel_size=3, activation=\'relu\', padding="SAME")\n\n    model = Sequential()\n    # initialized word embeddings randomly\n    \n    embedding_matrix = Word2Vec(vocab, vector_size=maximal_sentence_length, window=5, min_count=1, workers=4)\n   \n    #model.add(Embedding(len(vocab), embedding_dimension, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable = False))\n    model.add(Embedding(len(vocab), embedding_dimension, input_length=maximal_sentence_length))\n\n    #model.add(GlobalAveragePooling1D())\n    # two layer densely-connected NNs\n    # # different parameters of a dense layer can have different performances\n    # # here, we let the number of units equal 32, and use a relu activation.\n    #model.add(Dense(32, activation=\'relu\'))\n    #model.add(DefaultConv2D(filters = 32, kernel_size = 7, input_shape = []))\n    #model.add(MaxPooling2D(pool_size=2))\n    #model.add(Dropout(0.25))\n    #model.ad

In [41]:
def train(debug):
    '''
    train your model.
    :param debug:whether to use a small fraction of samples
    :return:
    '''

    # prepare data
    vocab_to_id = load_vocabulary(vocab_file)
    data_x, data_y = read_dataset(train_file, vocab_to_id, maximal_sentence_length, debug=debug)

    # create a model
    model = create_model(vocab_to_id)
    model.summary()

    # train
    print('start to train, data size = {a}'.format(a=len(data_x)))
    model.fit(data_x, data_y, validation_split=0.10, epochs=epochs, batch_size=batch_size)

    # save model
    model.save(model_file)


def evaluate_on_dev(debug):
    '''
    evaluare your model on the development dataset.

    :param debug:whether to use a small fraction of samples
    :return:
    '''

    # prepare data
    vocab_to_id = load_vocabulary(vocab_file)
    data_x, data_y = read_dataset(test_file, vocab_to_id, maximal_sentence_length, debug=debug)
    raw_samples = list(load_data(test_file))
    print('start to do validation, data size = {a}'.format(a=len(data_x)))
    _, id_to_labels = get_label()
    pred_labels, true_labels = list(), list()

    # load model
    model = load_model(model_file)

    # predict each sample
    for summary, label, raw in zip(data_x, data_y, raw_samples):
        result = model.predict(np.array([summary]))[0]
        pred = set([id_to_labels[i] for i, prob in enumerate(result) if prob > theta])
        true = set([id_to_labels[index] for index, e in enumerate(label) if e == 1])
        pred_labels.append(pred)
        true_labels.append(true)

        # print wrong prediction
        print('Title:' + raw.title)
        wrong_occupations = pred - true
        if len(wrong_occupations) > 0:
            print('[ wrong prediction ] this person does not have the occupations:{a}'.format(a=wrong_occupations))
        missing_occupations = true - pred
        if len(missing_occupations) > 0:
            print('[ missing prediction ] your prediction miss the occupations:{b}'.format(b=missing_occupations))
        print('---------------------------')

    # calculate metrics
    f1, precision, recall = f1_score(true_labels, pred_labels)
    print('result on validation set, f1 : {a}, precision : {b}, recall : {c}.'.
          format(a=f1, b=precision, c=recall))


def predict_on_test(debug):
    '''
    :param debug: whether to use a small fraction of samples
    :return:
    '''

    # prepare data
    _, id_to_labels = get_label()
    vocab_to_id = load_vocabulary(vocab_file)
    model = load_model(model_file)
    datax, _, = read_dataset(predict_file, vocab_to_id, maximal_sentence_length, debug)
    raw_samples = list(load_data(predict_file))

    # predict
    r_f = open(result_file, 'w', encoding='utf8')
    for data, raw_sample in tqdm(zip(datax, raw_samples)):
        result = model.predict(np.array([data]))[0]
        pred = [id_to_labels[i] for i, prob in enumerate(result) if prob > theta]
        r_f.write(json.dumps({
                            'title': raw_sample.title,
                            'occupations': pred
                        }) + "\n")

In [42]:
# create a vocabulary file if does not exist
if not os.path.exists(vocab_file):
    gen_vocabulary(train_file, vocab_file)

# train & evaluate & predict
# note: the switch 'debug' is True means only using a small fraction of samples, which can save time to debug your code.
# Change 'debug' to False when your want to train and test on all samples.
debug = True
train(debug=debug)
evaluate_on_dev(debug=debug)
predict_on_test(debug=debug)

100it [00:00, 1675.82it/s]


ValueError: Input 0 of layer sequential_7 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)