In [None]:
import os
import ast 
import math
import time
import torch
import logging
import smart_open

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.utils import tokenize
from gensim.models.fasttext import FastText 

import warnings
warnings.simplefilter('ignore')

In [None]:
from scripts.utils.hierarchy import *
from scripts.utils.processing import *
from scripts.utils.data_reading import *

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO )

In [None]:
# Build the reader for the combined ArXiv / BMED / Evise title+abstract TSV export.
# The path is relative to the notebook's working directory.
# NOTE(review): OmniscienceReader is not defined or imported by name in this notebook;
# presumably it arrives via one of the `scripts.utils` star imports -- confirm.
o = OmniscienceReader("OmniScience/original/ArXiv_BMED_Evise_title_abstract_os.2018-07-11.tsv")

In [None]:
# Working handle on the reader's `om_df` attribute (treated as a DataFrame below).
# Plain attribute access, so presumably an alias of the reader's frame rather than a copy.
temp = o.om_df

In [None]:
# Size of the frame before incomplete rows are dropped.
temp.shape

In [None]:
# Rebind `temp` to the result of dropna() (with default arguments: rows containing
# any missing value are removed).
temp = temp.dropna()

In [None]:
# Size after dropna(), for comparison with the shape displayed earlier.
temp.shape

In [None]:
# Source tag of each record: the part of `file_id` that precedes the first ":".
temp["category"] = temp["file_id"].map(lambda file_id: file_id.partition(":")[0])

In [None]:
# Group the rows by source tag so each source can be extracted as its own frame.
grouped = temp.groupby('category')

In [None]:
# Display the source tags that are present; the names passed to get_group() must match these.
grouped.groups.keys()

In [None]:
# Extract one frame per source. Later cells add/overwrite columns on `esvii`; taking an
# explicit .copy() makes each frame independent of the grouped data, so those writes are
# unambiguous and do not rely on pandas' SettingWithCopyWarning being silenced
# (this notebook ignores all warnings globally).
arxiv = grouped.get_group('ArXiv').copy()
# NOTE(review): `esvii` looks like a misspelling of "evise"; the name is kept because
# later cells refer to it.
esvii = grouped.get_group('EVISE.PII').copy()
bmed = grouped.get_group('BMED.PUI').copy()

In [None]:
# (rows, columns) of each source frame, one per line, in the order EVISE, ArXiv, BMED.
print(esvii.shape, arxiv.shape, bmed.shape, sep="\n")

In [None]:
# Replace each row's label ids with a list of ints built from the FIRST element of the
# stored value (`x[0]`).
# NOTE(review): this presumes every value is a nested sequence such as [[...ids...]];
# confirm against what OmniscienceReader actually produces.
# Not re-runnable: after one pass the values are flat lists of ints, so running the cell
# again evaluates `map(int, <int>)` and raises TypeError (or IndexError for an empty list).
esvii["omniscience_label_ids"] = esvii["omniscience_label_ids"].apply(lambda x: list(map(int, x[0])))

In [None]:
def get_text(df):
    """Dump the ``abstract`` column of ``df`` to a UTF-8 text file, one row per line.

    The file is named ``"<category>_raw.txt"``, where ``<category>`` is the
    ``category`` value of the first row. It is created in the current working
    directory, replacing any existing file of the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty frame with ``category`` and ``abstract`` columns. Every
        ``abstract`` must be a ``str`` (it is concatenated with ``"\\n"``).

    Returns
    -------
    tuple
        ``(name, fname)``: the category name and the path of the file written.

    Raises
    ------
    IndexError
        If ``df`` has no rows (there is no first row to read the category from).
    """
    name = df.iloc[0]["category"]
    fname = "{}_raw.txt".format(name)

    # `with` closes the handle even when a row fails part-way through the loop.
    # Binary mode keeps "\n" line endings byte-for-byte on every platform.
    with open(fname, "wb") as out:
        # Walk the column directly: rows are written in positional order and no
        # per-row label lookup is needed.
        # NOTE(review): an abstract containing an embedded newline would occupy
        # several lines of the output; callers that pair line k with row k should
        # confirm that cannot happen in this data.
        for abstract in tqdm(df["abstract"]):
            out.write((abstract + "\n").encode("utf-8"))

    return name, fname

In [None]:
class CorpusIter(object):
    """Re-iterable corpus view over a text file.

    Every pass re-opens the file and produces one list of tokens per line, so the
    same instance can be iterated more than once.
    """

    def __init__(self, file_path):
        """Store the corpus location; the file is not opened until iteration starts."""
        super().__init__()
        self.file_path = file_path

    def __iter__(self):
        """Yield ``list(tokenize(line))`` for each line of the file, read as UTF-8 text."""
        with smart_open.smart_open(self.file_path, 'r', encoding='utf-8') as corpus_file:
            yield from (list(tokenize(raw_line)) for raw_line in corpus_file)

In [None]:
def fasttext_generator(fname, name):
    """Return a FastText model for the corpus in ``fname``, training it only if needed.

    The model is cached at ``"Omniscience/Fasttext/<name>.model"`` (relative to the
    working directory). If that file exists it is loaded and ``fname`` is not read;
    otherwise a model is trained on the corpus and saved there.

    Parameters
    ----------
    fname : str
        Path of a text file with one document per line (consumed via ``CorpusIter``).
    name : str
        Identifier used to build the cache file name.

    Returns
    -------
    FastText
        The loaded or freshly trained model.
    """
    # NOTE(review): the data file is read from "OmniScience/..." (capital S) while this
    # cache lives under "Omniscience/..."; on a case-sensitive filesystem these are two
    # different directories -- confirm that is intended.
    filename = "Omniscience/Fasttext/{}.model".format(name)

    if os.path.isfile(filename):
        return FastText.load(filename)

    # Create the cache directory BEFORE training: otherwise a missing directory is only
    # discovered by model.save(), after all the training time has been spent.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # NOTE(review): `size=` and `sentences=` are assumed to match the installed gensim
    # release; newer releases renamed these keywords -- confirm before upgrading.
    model = FastText(size=128, window=3, min_count=1) # hs=0, negative=0
    # The corpus is streamed twice: once to build the vocabulary, once to train.
    model.build_vocab(sentences=CorpusIter(fname))
    model.train(sentences=CorpusIter(fname), total_examples=model.corpus_count, epochs=5)
    model.save(filename)
    return model

In [None]:
# Create the "vec" column, filled with the integer 0 as a placeholder value.
esvii["vec"] = 0

In [None]:
# Write the EVISE abstracts to a text file; get_text returns (category name, file path).
e_name, e_fname = get_text(esvii)

In [None]:
# Train the FastText model for this source, or load it if a saved model file exists.
# Mind the argument order: fasttext_generator takes (fname, name), the reverse of the
# (name, fname) tuple returned by get_text.
e_model = fasttext_generator(e_fname, e_name)

In [None]:
# Average the FastText word vectors of each abstract into one vector per row of `esvii`.
sent_vecs = []
for sent in tqdm(CorpusIter(e_fname)):
    if sent:
        sent_vecs.append(np.mean([e_model.wv[w] for w in sent], axis=0))
    else:
        # A line without tokens has nothing to average (and would divide by zero);
        # represent it with an all-zero vector of the model's dimensionality.
        sent_vecs.append(np.zeros(e_model.wv.vector_size, dtype=np.float32))

# Line k of the corpus file is expected to describe row k of `esvii`; refuse to attach
# vectors to the wrong rows if the two have drifted apart.
if len(sent_vecs) != len(esvii):
    raise ValueError(
        "corpus has {} lines but esvii has {} rows".format(len(sent_vecs), len(esvii))
    )

# Assign the whole column in one step. Writing through `esvii.iloc[i]["vec"] = ...` is
# chained indexing: it modifies a temporary row object, so the frame itself is never
# updated. Using a dedicated name here also avoids clobbering the DataFrame `temp`
# defined by the loading cells.
esvii["vec"] = pd.Series(sent_vecs, index=esvii.index)