# Import Packages

In [None]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import csv

In [None]:
def loadGloveModel(gloveFile, dim=300):
    """Load GloVe text-format embeddings into a word -> vector dict.

    Each non-blank line is expected to be ``<token> v1 v2 ... v<dim>``,
    separated by single spaces. The vector is taken from the *last* ``dim``
    fields so that tokens which themselves contain spaces are kept intact
    instead of breaking the float conversion.

    Args:
        gloveFile: Path to the GloVe text file (read as UTF-8).
        dim: Number of components per vector. Also used for the two
            all-zero special entries ``'<pad>'`` and ``'<unknown>'``.

    Returns:
        dict mapping token -> 1-D ``numpy`` array of length ``dim``. The
        special entries come first, followed by the tokens in file order.

    Raises:
        ValueError: If a non-blank line has fewer than ``dim + 1`` fields.
    """
    print("Loading Glove Model")
    model = {}
    model['<pad>'] = np.zeros([dim,])
    model['<unknown>'] = np.zeros([dim,])
    # The context manager guarantees the (potentially huge) file is closed.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            fields = line.rstrip('\n').split(' ')
            if len(fields) < dim + 1:
                raise ValueError(
                    "line %d of %s has %d fields, expected a token followed "
                    "by %d values" % (line_no, gloveFile, len(fields), dim)
                )
            # Everything before the last `dim` fields is the token itself.
            word = ' '.join(fields[:-dim])
            model[word] = np.array([float(val) for val in fields[-dim:]])
    print("Done.",len(model)," words loaded!")
    return model

# Load the pretrained GloVe vectors (token -> embedding dict).
glove = loadGloveModel('../data/glove.840B.300d.txt')

# Persist the embedding vectors, one per CSV row. Passing the dict itself to
# writerows() would iterate its keys and write every word as a row of single
# characters, so the values are written explicitly. Rows follow the dict's
# insertion order, i.e. the same order gen_wordidx() uses to number tokens.
# newline='' is what the csv module requires to avoid blank rows on Windows.
with open('../data/vector_glove.csv', 'w', newline='') as myFile:
    writer = csv.writer(myFile)
    writer.writerows(glove.values())

print("Writing complete")

def gen_wordidx(corpus):
    """Return a dict mapping each item of *corpus* to its iteration position.

    Positions start at 0. If an item occurs more than once, the position of
    its last occurrence is kept.
    """
    return {token: position for position, token in enumerate(corpus)}
# Number every GloVe token (including '<pad>' and '<unknown>') by its position
# in the embedding dict.
pretrain_idx = gen_wordidx(glove)

# PubMed records; the steps below read its 'abstract', 'mesh terms' and
# 'sequence' columns.
dat = pd.read_csv('../data/pubmed.csv')
# Header-less label file, indexed by its first column.
# NOTE(review): presumably the index holds MeSH codes to hold out and
# 'Density' is a per-label statistic; confirm against the file.
rm_labels = pd.read_csv('../data/rm_labels.csv', index_col=0,header = None, names=['Density'])

# Index values of rm_labels as an array; used below for membership tests
# against each record's MeSH codes.
rm_list = rm_labels.index.values

def cleansing(text):
    """Replace every run of non-word characters with one space and trim.

    Word characters are those matched by the regex class ``\\w`` (letters,
    digits and underscore); punctuation and all whitespace count as
    separators. Leading and trailing spaces are removed from the result.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string, with tokens separated by single spaces.
    """
    # A raw-string pattern avoids the invalid-escape warning that '\W' and
    # '\s' trigger in plain literals. Collapsing runs in one pass is
    # equivalent to replacing each character and then squeezing whitespace.
    return re.sub(r'\W+', ' ', text).strip(' ')

def pad_sent(data):
    """Right-pad every token sequence with '<pad>' to the longest length.

    Args:
        data: Iterable of token sequences (lists, tuples, arrays, ...).

    Returns:
        A new list of lists, all of equal length. The inputs are not
        modified. An empty ``data`` yields an empty list instead of raising.
    """
    # Copy each row to a list so any sequence type can be padded and the
    # caller's objects are never touched.
    rows = [list(ab) for ab in data]
    # default=0 keeps the empty-input case from raising.
    max_len = max((len(row) for row in rows), default=0)
    return [row + ['<pad>'] * (max_len - len(row)) for row in rows]

def con_idx(data, idx):
    """Translate token sequences into sequences of vocabulary indices.

    Each token found in *idx* is replaced by its mapped value; any other
    token is replaced by ``idx['<unknown>']``. The nesting of *data* is
    preserved in the returned list of lists.
    """
    return [
        [idx[tok] if tok in idx else idx['<unknown>'] for tok in sentence]
        for sentence in data
    ]

# Derive the 'tokenize' column from 'abstract': missing abstracts become the
# literal string 'None', then the text is lower-cased, cleaned with
# cleansing() and split on whitespace into a list of tokens.
dat['tokenize'] = (
    dat['abstract']
    .fillna('None')
    .str.lower()
    .apply(cleansing)
    .str.split()
)

x = list(dat['sequence'])

# Each 'sequence' value is flattened by deleting brackets, parentheses and
# single quotes and splitting on ", "; the items at odd positions are kept.
# NOTE(review): presumably the string is the repr of a list of (term, code)
# pairs, so the odd positions are the codes; confirm against the data.
yy = []
for raw in x:
    for ch in "[]()'":
        raw = raw.replace(ch, "")
    yy.append(raw.split(", ")[1::2])

dat['mesh term code'] = yy

temp = dat[['tokenize','mesh term code']].values

# Flag (1.0) every record that carries at least one code listed in rm_list.
# The labels are hashed once so each lookup is O(1); `code in rm_list` on the
# raw array would rescan the whole array for every code of every record.
rm_codes = set(rm_list)
rm_flag = np.array(
    [1.0 if any(code in rm_codes for code in codes) else 0.0 for _, codes in temp]
)

dat['rm_flag'] = rm_flag

# Records without any flagged code form the main set; the rest are kept
# separately as the "unseen" set.
kept_columns = ['abstract', 'mesh terms', 'sequence', 'tokenize','mesh term code']
dat1 = dat[dat['rm_flag'] == 0]
dat1 = dat1[kept_columns]
dat_unseen = dat[dat['rm_flag'] == 1]
dat_unseen = dat_unseen[kept_columns]

def _encode_tokens(frame):
    """Pad the 'tokenize' column of *frame* and map tokens to GloVe indices."""
    return con_idx(pad_sent(frame['tokenize'].tolist()), pretrain_idx)


def _write_rows(path, rows):
    """Write *rows* to *path* as CSV and report completion."""
    # newline='' is what the csv module requires; without it the writer's
    # own line endings get translated again and blank rows appear on Windows.
    with open(path, 'w', newline='') as out:
        csv.writer(out).writerows(rows)
    print("Writing complete")


# Main split: targets (MeSH codes) and index-encoded, padded inputs.
dat_target = dat1['mesh term code']
dat_target.to_csv('../data/target_prep_g.csv')

temp_dat = _encode_tokens(dat1)
_write_rows('../data/input_prep_w2v_g.csv', temp_dat)

# Held-out ("unseen") split, exported the same way. It is padded to its own
# maximum length, independently of the main split.
dat_unseen_target = dat_unseen['mesh term code']
dat_unseen_target.to_csv('../data/target_prep_unseen_g.csv')

temp_dat = _encode_tokens(dat_unseen)
_write_rows('../data/input_prep_unseen_w2v_g.csv', temp_dat)