In [1]:
from gensim.models import Word2Vec
import logging
import json
from collections import Counter
import snowballstemmer, nltk
from nltk.corpus import stopwords
from string import ascii_lowercase

# Prefix every log record with its timestamp. No level is passed here, which is
# why the rest of the notebook reports progress through logging.warning().
LOG_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOG_FORMAT)

In [2]:
def word2vec_training(tokens, out_path):
    """Train a 300-dimensional Word2Vec model and save it in binary word2vec format.

    Args:
        tokens: Iterable of tokenized sentences (each an iterable of str).
            It is materialized into a list first because it is traversed
            three times (counting, vocabulary scan, training); a one-shot
            iterator or generator would otherwise be exhausted by the count
            and the model would be built and trained on nothing.
        out_path: Destination file handed to ``save_word2vec_format``.

    Note:
        ``size=`` and ``model.iter`` are the gensim 3.x spellings; gensim 4
        renamed them to ``vector_size=`` and ``model.epochs``.
    """
    corpus = list(tokens)
    model = Word2Vec(workers=4, size=300)

    logging.warning('corpus has been loaded')

    # Despite the "total_token" label kept in the log line, this is the number
    # of sentences; it is what `train` expects as `total_examples`.
    sentence_count = len(corpus)
    logging.warning("total_token = " + str(sentence_count))

    logging.warning('build vocab start')
    model.build_vocab(corpus)

    logging.warning('training model start')

    model.train(corpus, total_examples=sentence_count, epochs=model.iter)

    logging.warning('training model has been finished')
    model.wv.save_word2vec_format(out_path, binary=True)

In [4]:
# Load the corpus. The cells below read data[doc_id]['title'], ['abstract'] and
# ['introduction'] and iterate over each of them.
# The codec is pinned to UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode or reject non-ASCII text in the file.
with open("data/data.json", 'r', encoding='utf-8') as corpus_file:
    data = json.load(corpus_file)

In [5]:
# Pool the title, abstract and introduction entries of every document, in that
# order, into one flat list.
sentences = []
for doc in data:
    for field in ('title', 'abstract', 'introduction'):
        sentences.extend(data[doc][field])

In [6]:
# Case-fold every sentence string.
sentences = list(map(str.lower, sentences))

In [7]:
# Stop-word set = NLTK's English list + a few hand-picked extra words + every
# single ASCII letter, together with the Snowball stem of each of those.
stemmer = snowballstemmer.EnglishStemmer()

extra_stop_words = ['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']

stop = stopwords.words('english')
stop.extend(extra_stop_words + list(ascii_lowercase))

stoplist = set(stemmer.stemWords(stop))  # stemmed variants only
stop = set(stop) | stoplist              # surface forms and their stems
print(stop)

{'am', 'among', 'l', 'these', 'this', 'below', 'one', 'further', 'll', 'an', 'yours', 'beside', 'theirs', 'whi', 'f', 'after', 'ma', 'above', 'what', 'than', 'here', 'to', 'wasn', 'y', 'four', 'hasn', 'however', 'in', 'c', 'for', 'too', 'herself', 'very', 'shan', 'so', 'don', 'their', 'u', 'doing', 'with', 'j', 'was', 'd', 'within', 'about', 'had', 'ani', 'needn', 'which', 'z', 'did', 'nor', 'seven', 'now', 'abov', 'haven', 'should', 'whom', 'from', 'under', 'onli', 'only', 'once', 'over', 'until', 'up', 'yourselves', 'while', 'our', 'k', 'six', 'just', 'becaus', 'he', 'they', 'yourself', 'because', 'who', 'ten', 'doesn', 'hers', 'then', 'o', 'ain', 'when', 'yourselv', 'between', 'does', 'your', 'or', 'some', 'most', 'won', 'been', 'mightn', 'into', 'doe', 'the', 'is', 'through', 'that', 'a', 'onc', 'but', 'there', 'all', 'hadn', 'of', 'on', 'its', 'befor', 'by', 'wouldn', 'e', 'same', 'her', 'against', 'own', 'him', 'itself', 'them', 'out', 'veri', 'as', 'aren', 'nine', 'it', 'will', 

In [8]:
# Whitespace-tokenize: each sentence string becomes a list of tokens.
# `sentences` is rebound here, so this cell cannot be run twice in a row
# (the second run would try to split lists).
sentences = list(map(str.split, sentences))

In [9]:
# Number of sentences gathered from all titles, abstracts and introductions.
len(sentences)

6134

In [None]:
# Average sentence length, in tokens.
sum(map(len, sentences)) / len(sentences)

In [None]:
# Length of the longest sentence, in tokens.
max(map(len, sentences))

In [None]:
# Print every sentence longer than 200 tokens, each followed by a blank line.
for sentence in sentences:
    if len(sentence) <= 200:
        continue
    print(' '.join(sentence) + '\n')

In [10]:
# For every sentence keep the lower-cased tokens that are neither stop words
# nor made up of digits only. Sentences that end up empty are kept as [].
pre_process_sentences = [
    [
        lowered
        for lowered in (token.lower() for token in tokens)
        if lowered not in stop and not lowered.isdigit()
    ]
    for tokens in sentences
]
pre_process_sentences[0]

['multilingual', 'coreference', 'resolution']

In [11]:
# Corpus-wide token frequencies over the pre-processed sentences.
full_token = [token for sentence in pre_process_sentences for token in sentence]
t_counter = Counter(full_token)

# Vocabulary of tokens that occur more than 5 times in the whole corpus.
frequent_tokens = {token for token, count in t_counter.items() if count > 5}

# Strip the rarer tokens from every sentence; a sentence that loses all of its
# tokens is still appended (as an empty list), so the sentence count is unchanged.
final_setences = []
for tokens in pre_process_sentences:
    tokens = [token for token in tokens if token in frequent_tokens]
    final_setences.append(tokens)



In [15]:
# Train from scratch on the paper corpus alone and store the binary vectors.
word2vec_training(
    tokens=final_setences,
    out_path="only_paper_corpus.bin",
)

2017-11-14 21:26:29,404 corpus has been loaded
2017-11-14 21:26:29,405 total_token = 6134
2017-11-14 21:26:29,405 build vocab start
2017-11-14 21:26:29,484 training model start
2017-11-14 21:26:29,788 under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
2017-11-14 21:26:29,789 training model has been finished


In [16]:
# Second model with the same hyper-parameters. Only the vocabulary is built in
# this cell; training happens further down.
intersect_model = Word2Vec(workers=4, size=300)

logging.warning('corpus has been loaded')

# Sentence count (the log label says "token"); reused as `total_examples`
# when the model is trained.
token_num = len(final_setences)
logging.warning("total_token = " + str(token_num))

logging.warning('build vocab start')
intersect_model.build_vocab(final_setences)

2017-11-14 21:32:05,704 corpus has been loaded
2017-11-14 21:32:05,705 total_token = 6134
2017-11-14 21:32:05,706 build vocab start


In [17]:
from gensim.scripts import glove2word2vec

# Rewrite the GloVe text file into a second file that the word2vec loaders
# accept. The call is left as the last expression so its return value is
# displayed as the cell output.
glove_input_path = 'glove.840B.300d.txt'
word2vec_output_path = 'glove.840B.300d.txt.word2vec'
glove2word2vec.glove2word2vec(glove_input_path, word2vec_output_path)

(2196017, 300)

In [18]:
# Merge the converted GloVe vectors (text format, hence binary=False) into the
# model whose vocabulary was built from the paper corpus.
intersect_model.intersect_word2vec_format(
    fname="glove.840B.300d.txt.word2vec",
    binary=False,
)

In [19]:
# Train the GloVe-initialised model on the paper corpus; `token_num` is the
# sentence count computed when the vocabulary was built.
intersect_model.train(
    final_setences,
    total_examples=token_num,
    epochs=intersect_model.iter,
)

2017-11-14 21:41:26,188 under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


319503

In [20]:
# Export the resulting vectors in binary word2vec format.
intersect_model.wv.save_word2vec_format(
    "intersect_model_glove_840B.bin",
    binary=True,
)

In [22]:
# Take the first document id and stop; `key` stays bound and the following
# cells use it to prototype on a single document.
for key in data:
    print(key)
    break

A00-1020


In [28]:
from rouge import Rouge

# Prototype on one document: score every introduction sentence against the
# whole abstract with ROUGE, after the same stop-word / digit filtering that
# was applied to the embedding corpus.
doc = data[key]
rouge = Rouge()

# The reference does not depend on the introduction line, so build it once
# instead of on every loop iteration.
abstract = ' '.join(doc['abstract'])
abst_tokens = [token.lower() for token in abstract.split() if token.lower() not in stop]
abst_tokens = [token for token in abst_tokens if not token.isdigit()]
ref = ' '.join(abst_tokens)

# `label` was never initialised anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; creating it here also makes re-runs idempotent.
label = []
for line in doc['introduction']:

    intro_tokens = [token.lower() for token in line.split() if token.lower() not in stop]
    intro_tokens = [token for token in intro_tokens if not token.isdigit()]

    hyp = ' '.join(intro_tokens)
    scores = rouge.get_scores(hyps=hyp, refs=ref)
    label.append(scores)

In [42]:
# 'f' value under 'rouge-1' for the first introduction sentence scored above.
label[0][0]['rouge-1']['f']

0.05405404967129327

In [48]:
# Prototype of the labelled record for a single document:
#   'abstract'           - filtered abstract sentences
#   'introduction'       - filtered introduction sentences
#   'introduction_label' - ROUGE scores of each introduction sentence against
#                          the filtered abstract, with the 'rouge-2' entry removed
doc = data[key]
new_doc = dict()
new_doc['introduction'] = []
new_doc['introduction_label'] = []
new_doc['abstract'] = []

for abst_line in doc['abstract']:

    abst_tokens = [token.lower() for token in abst_line.split() if token.lower() not in stop]
    abst_tokens = [token for token in abst_tokens if not token.isdigit()]

    new_doc['abstract'].append(' '.join(abst_tokens))

# Fix: the reference used to be assembled from `tokens`, a stale loop variable
# left over from the embedding pre-processing, and the pieces were glued
# together without a separator. Build it from this document's own filtered
# abstract sentences, space-separated, skipping sentences that filtered down
# to nothing.
ref = ' '.join(text for text in new_doc['abstract'] if text)

rouge = Rouge()
for line in doc['introduction']:

    intro_tokens = [token.lower() for token in line.split() if token.lower() not in stop]
    intro_tokens = [token for token in intro_tokens if not token.isdigit()]

    # Fix: compute `hyp` before storing it. It used to be appended first, so
    # every stored sentence was the previous line's text and no longer lined
    # up with its label.
    hyp = ' '.join(intro_tokens)
    new_doc['introduction'].append(hyp)

    scores = rouge.get_scores(hyps=hyp, refs=ref)
    del scores[0]['rouge-2']
    new_doc['introduction_label'].append(scores[0])

In [50]:
def _content_tokens(text):
    """Lower-case and whitespace-split `text`, dropping stop words and all-digit tokens."""
    lowered_tokens = (token.lower() for token in text.split())
    return [token for token in lowered_tokens if token not in stop and not token.isdigit()]


# Build the labelled dataset for every document:
#   'introduction'       - filtered introduction sentences
#   'introduction_label' - ROUGE scores of each introduction sentence against
#                          the document's filtered abstract ('rouge-2' removed)
#   'abstract'           - filtered abstract sentences
new_data = dict()
rouge = Rouge()  # one scorer reused for all documents

for key in data:
    new_data[key] = dict()
    doc = data[key]
    new_data[key]['introduction'] = []
    new_data[key]['introduction_label'] = []
    new_data[key]['abstract'] = []

    for abst_line in doc['abstract']:
        new_data[key]['abstract'].append(' '.join(_content_tokens(abst_line)))

    # Fix: the reference used to be assembled from `tokens`, a stale loop
    # variable left over from the embedding pre-processing, so every document
    # was scored against the same unrelated text. Use this document's own
    # filtered abstract, space-separated, skipping empty sentences.
    ref = ' '.join(text for text in new_data[key]['abstract'] if text)

    for line in doc['introduction']:
        # Fix: compute `hyp` before storing it. It used to be appended first,
        # which shifted every stored sentence by one relative to its label
        # (and leaked the last sentence of the previous document into the next).
        hyp = ' '.join(_content_tokens(line))
        new_data[key]['introduction'].append(hyp)

        scores = rouge.get_scores(hyps=hyp, refs=ref)
        del scores[0]['rouge-2']
        new_data[key]['introduction_label'].append(scores[0])

In [52]:
# Persist the labelled corpus as JSON.
with open('data/pre_process_labeled_data.json', 'w') as labeled_file:
    json.dump(new_data, labeled_file)