In [1]:
# !pip install langdetect
import string
import pandas as pd
from langdetect import detect

# !pip install -U gensim --user
import gensim
import argparse
import numpy as np
import random, time
import gzip, os, csv

unable to import 'smart_open.gcs', disabling that module


In [2]:
### check if there is any missing value in the dataset ###
def check_missing(df, col):
    missing  = 0
    misVariables = []
    CheckNull = df.isnull().sum()
    for var in range(0, len(CheckNull)):
#         if CheckNull[var] != 0:
        misVariables.append([col[var], CheckNull[var], round(CheckNull[var]/len(df),3)])
        missing = missing + 1

    if missing == 0:
        print('Dataset is complete with no blanks.')
    else:
        print('Totally, %d features have missing values (blanks).' %missing)
        df_misVariables = pd.DataFrame.from_records(misVariables)
        df_misVariables.columns = ['Variable', 'Missing', 'Percentage (%)']
        s = df_misVariables.sort_values(by=['Percentage (%)'], ascending=False).style.bar(subset=['Percentage (%)'], color='#d65f5f')
        display(s)

In [4]:
def paper_info_csv(df_pub):
    
    ## Extract features we need from the original dataset ### 
    shortInfo_pub_df = df_pub[['Title', 'Source title', 'Publisher', 'Abstract','DOI', \
                               'Publication Type', 'Dimensions URL', 'Publication Date']]
#     shortInfo_pub_df['DOI'] = 'https://doi.org/' + df_pub['DOI']

    ### Check the paper langauge ###
    lang_paper = []
    for i in shortInfo_pub_df['Title']:
        try:
            lang_paper.append(detect(i))
        except:
            lang_paper.append(None)

    shortInfo_pub_df['Language'] = lang_paper
    
    ### standardize the name of source title and publisher ###
    new_source_title = []
    new_publisher = []

    for item in pub_df_2['Source title']:
        try:
            new_source_title.append(item.translate(str.maketrans('', '', string.punctuation)).replace(' ','_').lower())
        except:
            new_source_title.append(None)
    for item in pub_df_2['Publisher']:
        try:
            new_publisher.append(item.translate(str.maketrans('', '', string.punctuation)).replace(' ','_').lower())
        except:
            new_publisher.append(None)

    ### Save to a new data file ###
    shortInfo_pub_df.to_csv('shortInfoPub.csv', index=None)

    return shortInfo_pub_df

***

In [5]:
def author_paper_csv(df_pub):

    ### Get authors list ###
    author_list = []
    unique_author_list = []

    for author in df_pub['Authors']:
        if type(author) == str:
            author = author.replace(' ','')
            splited_author = author.split(';')
            author_list.append(splited_author)
            for item in splited_author:
                if item not in unique_author_list:
                    unique_author_list.append(item)
        else:
            author_list.append([])


    ### Remove unmeaningful author name ###
    remove_name = [',', 'UN,']
    for each_name in remove_name:
        unique_author_list.remove(each_name)


    ### Create Author-paper list ###
    papers_each_author = []
    for unique_author in unique_author_list:
        for each_paper in range(0, len(author_list)):
            if unique_author in author_list[each_paper]:
                papers_each_author.append([unique_author, df_pub['Dimensions URL'][each_paper]])

    papers_each_author_df = pd.DataFrame.from_records(papers_each_author)
    papers_each_author_df.columns = ['Author', 'Dimensions URL']
    
    remove_punc_author = []
    for item in au_pub['Author']:
        remove_punc_author.append(item.translate(str.maketrans('', '', string.punctuation)).replace('ʼ',''))
        
    papers_each_author_df['Author_nopunc'] = remove_punc_author
    papers_each_author_df.to_csv('AuthorsPub.csv', index=None)
    
    return papers_each_author_df

************

### Extract key words from title ###

In [6]:
# !pip install --user -U nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [7]:
### https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [8]:
# POS tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_extracted_keywords_from_title(shortInfo_pub_df):
    
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(['e.g', '’'])

    title_filtered_sentence=[]

    for item in range(0, len(shortInfo_pub_df['Title'])):
        if shortInfo_pub_df['Language'][item] == 'en':
            lemmatized_title = lemmatize_sentence(shortInfo_pub_df['Title'][item].lower())
            tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+\$[\d\.]+|\S+')
            tokens = tokenizer.tokenize(lemmatized_title)

            tagged = nltk.pos_tag(tokens)

            each_title = []
            for tag in tagged:
                if tag[0] not in stop_words and tag[0] not in each_title:
                    if ('NN' in tag[1]) or ('VB' in tag[1]) or ('JJ' in tag[1]):
                        if (tag[0] not in string.punctuation) and (not tag[0].isdigit()) and (len(tag[0])>2):
                            each_title.append(tag[0])
            title_filtered_sentence.append(each_title)
        else:
            title_filtered_sentence.append([])
    
    
    keywords_title_paper = []
    for each_paper in range(0, len(title_filtered_sentence)):
        for each_word in title_filtered_sentence[each_paper]:
            keywords_title_paper.append([shortInfo_pub_df['Dimensions URL'][each_paper], each_word])
            
    return title_filtered_sentence, keywords_title_paper

In [9]:
### Use Gensim to find similar keywords ###
def similar_keywords(title_filtered_sentence):
    
    model = gensim.models.Word2Vec(min_count=2, size=700, workers=5)
    model.build_vocab(title_filtered_sentence)

    corpus_count = model.corpus_count
    model.train(title_filtered_sentence, total_examples = corpus_count, epochs = 1000)
    
    similar_keywords_list = []
    for each in range(0, len(keywords_df)):
        keyword = keywords_df['Keyword'][each]
        try:
            similar_keywords = model.wv.most_similar(keyword, topn=5)
        except:
            similar_keywords = []
        for item in similar_keywords:   
            similar_keywords_list.append([keywords_df['Dimensions URL'][each], keyword, item[0]])

    return similar_keywords_list

In [196]:
def main():
    
    ### Read dataset from Dimensions ###
    df_pub = pd.read_csv('COVID19Publications.csv', sep=';')
    col = df_pub.columns
    
    ## Check missing values in the Dimensions dataset ###
#     check_missing(df_pub, col)
    
    ## Generate a new data file with key features ###
    shortInfo_pub_df = paper_info_csv(df_pub)

    ## Generate a author-paper data file ###
    papers_each_author_df = author_paper_csv

    ## Generate a paper-keywords (from title) data file ###
    title_filtered_sentence, keywords_title_paper = get_extracted_keywords_from_title(shortInfo_pub_df)
            
    keywords_df = pd.DataFrame.from_records(keywords_title_paper)
    keywords_df.columns = ['Dimensions URL', 'Keyword']
    keywords_df.to_csv('keywordsPub.csv', index=None)

    ## Get similar keywords and generate new keywords file ###
#     similar_keywords_list = similar_keywords(title_filtered_sentence)

#     similar_keywords_df = pd.DataFrame.from_records(similar_keywords_list)
#     similar_keywords_df.columns = ['Dimensions URL', 'Keyword', 'Similar_Keyword']
#     similar_keywords_df.to_csv('similarkeywordsPub.csv',index=None)

if __name__ == "__main__":
    main()

In [10]:
shortInfo_pub_df = pd.read_csv('extracted_datafiles/shortInfoPub.csv')

In [11]:
title_filtered_sentence, keywords_title_paper = get_extracted_keywords_from_title(shortInfo_pub_df)

In [17]:
for item in title_filtered_sentence:
    for i in item:
        if len(i) == 3:
            print(i)

use
new
era
use
old
new
hiv
pcr
era
new
use
dsm
due
igm
new
rna
gap
n95
law
use
ask
sex
age
use
bcg
sga
air
con
vsv
use
use
rho
air
old
new
gap
rmb
use
aim
bug
era
use
use
old
ibd
ill
age
era
due
co2
emg
bag
sea
use
set
use
war
run
ill
use
era
rna
dr.
due
no2
new
ada
cdc
use
icu
day
usa
ark
nhs
pcr
ren
new
age
log
age
rna
tax
act
era
cdc
age
qtc
lie
-19
use
irs
map
new
art
use
old
tip
aid
use
use
get
pet
low
tip
age
ill
era
nhs
nhs
old
get
die
way
end
hcq
use
end
age
alp
ace
arb
dna
ras
use
due
day
key
u.s
del
bcg
map
day
use
dna
ras
mdt
use
gdp
key
map
use
new
use
use
law
use
use
eat
bat
web
use
old
gdc
art
anz
use
rat
map
use
use
new
set
gym
tip
new
new
use
pda
iii
san
bad
pci
use
apa
apa
g20
nas
fda
use
use
cdc
due
hpv
old
ill
eye
uic
tie
bda
new
jaw
era
end
use
icu
use
law
due
run
rbd
war
ocd
air
use
fix
use
era
mmt
hiv
new
use
pay
era
era
ill
new
die
air
bed
pcr
mix
icu
cat
dog
age
use
niv
day
bcg
new
top
al.
top
rsv
aim
ros
bat
niv
new
air
get
art
ill
icu
era
old
low
sud
cut
fit


***

# RDF2Vec training

In [2]:
## create data structure for knowledge graph
def addTriple(net, source, target, edge):
    if source in net:
        if  target in net[source]:
            net[source][target].add(edge)
        else:
            net[source][target]= set([edge])
    else:
        net[source]={}
        net[source][target] =set([edge])
            
def getLinks(net, source):
    if source not in net:
        return {}
    return net[source]

# Generate paths (entity->relation->entity) by radom walks
def randomWalkUniform(triples, startNode, max_depth=5):
    next_node =startNode
    path = str(startNode)+'->'
    for i in range(max_depth):
        neighs = getLinks(triples,next_node)
        #print (neighs)
        if len(neighs) == 0: break
        weights = []
        queue = []
        for neigh in neighs:
            for edge in neighs[neigh]:
                queue.append((edge,neigh))
        edge, next_node = random.choice(queue)
        path = path +str(edge)+'->'
        path = path +str(next_node)+'->'
    path =path.split('->')
    return path

In [3]:
# Build the knowledge graph structure
def preprocess(fname):
    triples = {}

    ent_counter = 0
    rel_counter = 0
    train_counter = 0

    print (fname)
    #gzfile= gzip.open(fname, mode='rt')

    for line in csv.reader(open(fname), delimiter='\t', quotechar='"'):
        #print (line)
        h = line[0]
        r = line[1]
        t = line[2]
        
        train_counter +=1

        addTriple(triples, h, t, r)
        train_counter+=1
    print ('Triple:',train_counter)
    return triples

In [4]:
file = 'query-result.tsv'
triples = preprocess(file)

entities = list(triples.keys())
vocabulary = entities
print (len(vocabulary))

query-result.tsv
Triple: 342982
12144


# Do random walks on the knowledge graph

In [5]:
def randomNWalkUniform(triples, n, walks, path_depth):
    path=[]
    for k in range(walks):
        walk = randomWalkUniform(triples, n, path_depth)
        path.append(walk)
    return path

In [6]:
walks = 100
path_depth = 10

start_time =time.time()
sentences =[]
for word in vocabulary:
    sentences.extend( randomNWalkUniform(triples, word, walks, path_depth) )
elapsed_time = time.time() - start_time
print ('Time elapsed to generate features:',time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

Time elapsed to generate features: 00:00:13


In [7]:
model1 = gensim.models.Word2Vec(size=300, workers=5, window=5, sg=1)
model1.build_vocab(sentences)

In [9]:
corpus_count = model1.corpus_count
model1.train(sentences, total_examples = corpus_count, epochs = 100)

(252631606, 488685800)

In [16]:
# model1.wv.most_similar('<https://app.dimensions.ai/details/publication/pub.1126620775>')