In [1]:
import connection
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import re
from collections import Counter
import statistics as st
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import snowball
from nltk.corpus import stopwords
from nltk.corpus import wordnet,brown,alpino 
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch every NLTK resource the cells below rely on, so the notebook also runs
# on a machine whose nltk_data directory is still empty.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')      # tokenizer models needed by word_tokenize
nltk.download('stopwords')  # stop-word lists read by LSTokenizer
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
def pos_tagger(nltk_tag_lst):
    """Map the POS tag of the first tagged token to a WordNet POS constant.

    Args:
        nltk_tag_lst: sequence of ``(token, tag)`` pairs such as the list
            returned by ``nltk.pos_tag``; only the first pair is inspected.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        when the tag starts with ``J``, ``V``, ``N`` or ``R`` respectively;
        ``wordnet.ADJ_SAT`` for every other tag.
    """
    first_pair = nltk_tag_lst[0]
    tag = first_pair[1]

    # Checked in this order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos

    # Anything that is not adjective/verb/noun/adverb falls back to ADJ_SAT.
    return wordnet.ADJ_SAT

In [4]:
class LSTokenizer:
    """Callable tokenizer: split text, drop English stop words, lemmatize the rest.

    Instances are callable so they can be handed to a vectorizer as its
    ``tokenizer`` argument (see ``get_keywords_str``).

    Attributes:
        wnl: ``WordNetLemmatizer`` used to lemmatize each kept token.
        snb: English Snowball stemmer; created here but not used by ``__call__``.
        stpw: list of English stop words returned by ``stopwords.words('english')``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.snb = snowball.SnowballStemmer(language='english')
        self.stpw = stopwords.words('english')

    def __call__(self, doc):
        """Tokenize ``doc`` and return the lemmas of all non-stop-word tokens.

        Args:
            doc: text to tokenize with ``word_tokenize``.

        Returns:
            list of lemmatized tokens, in the order they appear in ``doc``.
        """
        # Build the lookup once per document: set membership is O(1), whereas
        # testing every token against the stop-word list rescans the whole list.
        # Rebuilding from self.stpw keeps later edits to that attribute honoured.
        stop_words = frozenset(self.stpw)

        lemmas = []
        for token in word_tokenize(doc):
            if token in stop_words:
                continue
            # Each token is tagged on its own (no sentence context), then the
            # tag is mapped to the WordNet POS expected by the lemmatizer.
            wordnet_pos = pos_tagger(nltk.pos_tag([token]))
            lemmas.append(self.wnl.lemmatize(token, pos=wordnet_pos))
        return lemmas

In [5]:
def text_prepros(doc_str):
    """Strip numbers and punctuation from ``doc_str`` and lowercase the result.

    Two regex passes run in order: the first deletes digit runs (with an
    optional decimal point and fractional digits), the second deletes every
    character of a fixed punctuation set. Double quotes are not in that set
    and therefore survive.

    Args:
        doc_str: text to clean.

    Returns:
        The cleaned, lowercased text. If any step raises (for example when
        ``doc_str`` is not a string) the exception is printed and the value
        is returned as it stood at that point.
    """
    number_pattern = r"([0-9]+\.{0,1}[0-9]{0,})"
    punctuation_pattern = r"([\.\[\&\,\/\\\]\[\-\^\+\`\$\%\!\@\#\>\<\?\;\:\{\}\=\_\'\|\]\*\)\(])"

    try:
        for pattern in (number_pattern, punctuation_pattern):
            doc_str = re.sub(pattern, r"", doc_str)
        doc_str = doc_str.lower()
    except Exception as e:
        # Best effort: report the problem and hand back what we have.
        print(e)
    return doc_str

In [6]:
def get_keywords_str(doc_str):
    """Return the distinct bigrams of ``doc_str`` after cleaning and lemmatization.

    The text is cleaned by ``text_prepros``, tokenized by ``LSTokenizer`` and
    turned into two-token n-grams by a ``TfidfVectorizer``. Only the fitted
    vocabulary is used; the TF-IDF weights themselves are never read.

    Args:
        doc_str: a single document as one string.

    Returns:
        Array of bigram strings as given by ``get_feature_names_out()``.

    NOTE(review): scikit-learn is expected to raise ``ValueError`` when the
    text yields no bigram at all (fewer than two kept tokens) - confirm, and
    decide whether callers should guard against it.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(2,2),
        tokenizer=LSTokenizer(),
        preprocessor=text_prepros,
        # A custom tokenizer makes token_pattern irrelevant; passing None
        # avoids the "token_pattern will not be used" warning on every call.
        token_pattern=None,
    )
    # fit() is enough to learn the vocabulary; the document-term matrix that
    # fit_transform() would build was never used.
    vectorizer.fit([doc_str])
    return vectorizer.get_feature_names_out()

In [33]:
def split_into_chunks(doc_str, max_len=512):
    """Greedily pack the sentences of ``doc_str`` into chunks of bounded length.

    The text is split on ``'. '``. Consecutive sentences are joined back with
    the same ``'. '`` separator for as long as the chunk stays within
    ``max_len`` characters (separators included). Every non-blank sentence
    ends up in exactly one chunk and the original order is kept.

    Args:
        doc_str: text to split.
        max_len: maximum chunk length in characters. A single sentence longer
            than this is not cut; it becomes a chunk of its own.

    Returns:
        List of non-empty chunk strings; ``[]`` when the text holds no
        non-blank sentence.
    """
    separator = '. '
    chunks = []
    current = ""

    for sentence in doc_str.split(separator):
        if not sentence.strip():
            # Blank fragments (empty text, trailing '. ') carry no content.
            continue
        if not current:
            current = sentence
        elif len(current) + len(separator) + len(sentence) <= max_len:
            # Re-insert the separator so neighbouring words are not fused.
            current = current + separator + sentence
        else:
            chunks.append(current)
            current = sentence

    # Flush the chunk still being built so the tail of the text is not lost.
    if current:
        chunks.append(current)
    return chunks

In [8]:
# Sentence-embedding model shared by get_bestdesc_words and get_ai_words;
# instantiated once here rather than inside those functions.
model = SentenceTransformer('all-distilroberta-v1')

In [10]:
def get_bestdesc_words(doc_str,most_n):
    """Return the ``most_n`` candidate bigrams most similar to the whole text.

    Candidates come from ``get_keywords_str``. Candidates and the full text
    are embedded with the shared ``model`` and compared by cosine similarity.

    Args:
        doc_str: text to describe.
        most_n: number of keywords wanted; values <= 0 yield an empty list.

    Returns:
        List of at most ``most_n`` bigrams ordered by increasing similarity,
        so the best match is last.
    """
    # Guard first: a slice like [-0:] would otherwise select *every* candidate,
    # and negative values would select an arbitrary tail.
    if most_n <= 0:
        return []

    all_words = get_keywords_str(doc_str)
    candidate_embeddings = model.encode(all_words)
    doc_embedding = model.encode([doc_str])

    # One row (the document) by one column per candidate.
    similarities = cosine_similarity(doc_embedding, candidate_embeddings)
    ranked_indices = similarities.argsort()[0]  # ascending similarity

    return [all_words[index] for index in ranked_indices[-most_n:]]

In [19]:
def get_ai_words(doc_str,prim_keys,most_n):
    """Pick the bigrams of ``doc_str`` that primary keywords most often point to.

    Every entry of ``prim_keys`` votes for the single candidate bigram it is
    most similar to (cosine similarity of ``model`` embeddings); the
    ``most_n`` candidates with the most votes are returned.

    Args:
        doc_str: text whose bigrams (from ``get_keywords_str``) are the candidates.
        prim_keys: collection of primary keyword strings to embed.
        most_n: number of top-voted candidates to return.

    Returns:
        List of candidate bigrams, most-voted first.
    """
    candidates = get_keywords_str(doc_str)
    candidate_vectors = model.encode(candidates)
    primary_vectors = model.encode(prim_keys)

    # Rows: primary keywords; columns: candidate bigrams.
    similarity = cosine_similarity(primary_vectors, candidate_vectors)

    # Last column of the ascending argsort = best candidate index per row.
    best_candidate_per_key = similarity.argsort()[:, -1]
    votes = Counter(best_candidate_per_key)

    winners = []
    for candidate_index, _vote_count in votes.most_common(most_n):
        winners.append(candidates[candidate_index])
    return winners

In [None]:
def create_keynodes(tx,p_id,keyword,flag):
    """Attach a keyword node to a publication and return the query summary.

    The Cypher statement matches the ``Publication`` with the given id,
    MERGEs a ``keyword`` node identified by ``name`` and ``type``, and MERGEs
    a ``FOUND_IN`` relationship pointing from the keyword to the publication.

    Args:
        tx: transaction object exposing ``run(query, **params)``.
        p_id: id of the publication to match.
        keyword: value stored as the keyword node's ``name``.
        flag: value stored as the keyword node's ``type``.

    Returns:
        Whatever ``consume()`` returns for the executed statement.
    """
    merge_query = """MATCH (p:Publication {id:$p_id}) 
    MERGE (k:keyword {name:$keyword,type:$flag})
    MERGE (p)<-[r:FOUND_IN]-(k)
    """
    return tx.run(merge_query,p_id=p_id,keyword=keyword,flag=flag).consume()

In [24]:
def set_pub_flag(tx,p_id):
    """Set ``flag`` to the string ``"False"`` on the publication with id ``p_id``.

    Args:
        tx: transaction object exposing ``run(query, **params)``.
        p_id: id of the ``Publication`` node to update.
    """
    flag_query = """MATCH (p:Publication {id:$p_id}) set p.flag="False" """
    tx.run(flag_query,p_id=p_id)

In [35]:
if __name__ == '__main__':
    # Open the database session and read the inputs through the project's
    # `connection` helpers.
    session = connection.est_connection()
    # NOTE(review): `data` is presumably a DataFrame whose first two columns
    # are publication id and summary text - inferred from the positional
    # access below; confirm against connection.get_pub_summary.
    data = session.execute_read(connection.get_pub_summary)
    primary_keys = session.execute_read(connection.get_keyword_nodes,'primary',0)

    total = len(data)
    for i in range(total):
        # Fetch the row once and index it explicitly by position; plain
        # `row[0]` on a Series is deprecated positional access in pandas 2.x.
        row = data.iloc[i]
        p_id = row.iloc[0]
        summary = row.iloc[1]

        # Secondary keywords: best-describing bigram of every non-empty chunk.
        best_words = []
        for chunk in split_into_chunks(summary):
            if chunk != '':
                best_words += get_bestdesc_words(chunk,1)
        # set() removes duplicates picked from different chunks.
        for key in set(best_words):
            session.execute_write(connection.create_keynodes,p_id,key.lower(),'secondary')

        # Primary keyword: the bigram most primary keywords are closest to.
        for key in get_ai_words(summary,primary_keys,1):
            session.execute_write(connection.create_keynodes,p_id,key.lower(),'primary')

        session.execute_write(set_pub_flag,p_id)

        # Integer division so the marker is also reached for odd lengths
        # (i == len(data)/2 compares against a float and never matches then).
        if i == total // 2:
            print("half way")

KeyboardInterrupt: 

In [36]:
# Close the session opened by connection.est_connection() in the driver cell;
# `session` only exists once that cell has been run.
session.close()