In [40]:
from bson.objectid import ObjectId
from pymongo import MongoClient
import numpy as np
from numpy import dot
from numpy.linalg import norm
import jaro
from sklearn.feature_extraction.text import TfidfVectorizer

def cos_sim(A):
    """Return the cosine similarity between the first two rows of ``A``.

    Args:
        A: Indexable whose elements ``A[0]`` and ``A[1]`` are equal-length
            numeric vectors (for example a 2-row array).

    Returns:
        ``dot(A[0], A[1]) / (norm(A[0]) * norm(A[1]))``, or ``0.0`` when
        either vector has zero norm.
    """
    denominator = norm(A[0]) * norm(A[1])
    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero and letting NaN leak into the caller's summed score.
    if denominator == 0:
        return 0.0
    return dot(A[0], A[1]) / denominator

def _overlap_score(items_a, items_b):
    """Return ``1 - exp(-k)``, where k counts entries of items_a found in items_b."""
    lookup = set(items_b)
    shared = sum(1 for item in items_a if item in lookup)
    return 1 - np.exp(-shared)


def test_filter(name, site1, raw_one1, site2, raw_one2):
    """Score whether two paper records that list ``name`` refer to the same author.

    Args:
        name: Author name; must appear verbatim in the ';'-separated
            ``author`` field of both records.
        site1: Source label of the first record (e.g. 'Scienceon', 'NTIS').
        raw_one1: Mapping for the first paper. Reads the keys ``author``,
            ``author_inst``, ``title``, ``journal``, ``issue_inst``,
            ``issue_year``, ``paper_keyword`` and ``abstract``.
        site2: Source label of the second record.
        raw_one2: Mapping for the second paper, same keys as ``raw_one1``.

    Returns:
        * ``0`` when either site is 'NTIS' (not implemented; a notice is
          printed).
        * ``6`` when the titles are identical, the last co-author of one
          record appears among the other's co-authors, or the Jaro-Winkler
          similarity of the two affiliations is at least 0.8.
        * Otherwise the sum of six components, which are also printed:
          journal/conference match (+1 / -1), publication-year proximity
          (1 for the same year, minus 0.2 per year of gap), co-author
          overlap and keyword overlap (each ``1 - exp(-shared_count)``),
          and TF-IDF cosine similarity of the titles and of the abstracts
          (abstract term is 0 when either abstract is the string 'null').

    Raises:
        KeyError: A required field is missing from a record.
        ValueError: ``name`` is not in a record's author list, or
            ``issue_year`` does not start with four digits.
        IndexError: ``author_inst`` has fewer entries than the position of
            ``name`` in ``author``.
    """
    # Guard clause: NTIS records are not supported yet.
    if site1 == 'NTIS' or site2 == 'NTIS':
        print('NTIS는 개발중')
        return 0

    authors1 = raw_one1['author'].split(";")
    authors2 = raw_one2['author'].split(";")

    # The affiliation list is positionally aligned with the author list.
    author_inst1 = raw_one1['author_inst'].split(";")[authors1.index(name)]
    author_inst2 = raw_one2['author_inst'].split(";")[authors2.index(name)]

    # NOTE(review): the slice drops the first and last ';'-separated entries;
    # presumably the field carries a trailing ';' and the first entry is
    # handled separately - confirm against the data. If `name` sits in the
    # middle of the list it is itself part of these lists.
    coauthor1 = authors1[1:-1]
    coauthor2 = authors2[1:-1]

    inst = jaro.jaro_winkler_metric(author_inst1, author_inst2)

    same_title = raw_one1['title'] == raw_one2['title']
    # Only index [-1] when the list is non-empty: records with two or fewer
    # author entries yield an empty co-author list.
    last_coauthor_shared = (
        (bool(coauthor1) and coauthor1[-1] in coauthor2)
        or (bool(coauthor2) and coauthor2[-1] in coauthor1)
    )

    # Strong evidence short-circuits the detailed scoring.
    if same_title or last_coauthor_shared or inst >= 0.8:
        return 6

    journal1 = raw_one1['journal']
    conference1 = raw_one1['issue_inst']
    issYear1 = int(raw_one1['issue_year'][:4])
    # NOTE(review): keywords are split on '.', so two empty keyword fields
    # both become [''] and count as one shared keyword - confirm intended.
    keyword1 = raw_one1['paper_keyword'].replace(" ", "").split(".")

    journal2 = raw_one2['journal']
    conference2 = raw_one2['issue_inst']
    issYear2 = int(raw_one2['issue_year'][:4])
    keyword2 = raw_one2['paper_keyword'].replace(" ", "").split(".")

    joc = 1 if journal1 == journal2 and conference1 == conference2 else -1
    yop = -(2*(abs(issYear1-issYear2)/10)-1)
    co_authorship = _overlap_score(coauthor1, coauthor2)
    keyword = _overlap_score(keyword1, keyword2)

    title_corpus = [raw_one1['title'], raw_one2['title']]
    title_tfidfv = TfidfVectorizer().fit(title_corpus)
    title_cossim = cos_sim(title_tfidfv.transform(title_corpus).toarray())

    abstract1 = raw_one1['abstract']
    abstract2 = raw_one2['abstract']

    # Missing abstracts are stored as the literal string 'null'.
    if abstract1 == 'null' or abstract2 == 'null':
        abstract_cossim = 0
    else:
        abstract_corpus = [abstract1, abstract2]
        abstract_tfidfv = TfidfVectorizer().fit(abstract_corpus)
        abstract_cossim = cos_sim(abstract_tfidfv.transform(abstract_corpus).toarray())

    print(f'joc: {joc} | yop: {yop} | co_authorship: {co_authorship} | keyword: {keyword} | title: {title_cossim} | abstract: {abstract_cossim}')

    return joc + yop + co_authorship + keyword + title_cossim + abstract_cossim

In [41]:
# Driver cell: fetch two raw paper records from MongoDB and score whether the
# author `name` on both is the same person.
# NOTE(review): the server address is hard-coded; consider moving it to
# configuration or an environment variable.
client = MongoClient('mongodb://203.255.92.141:27017', authSource='admin')

name = '최도진'
site1 = 'Scienceon'
site2 = 'Scienceon'
input_paper1 = ObjectId('61939443e9b04a9d64abe055')
input_paper2 = ObjectId('61939447e9b04a9d64abe06a')

# Site label -> database name; each database keeps its records in 'Rawdata'.
site_databases = {
    'Scienceon': 'SCIENCEON',
    'NTIS': 'NTIS',
    'DBPIA': 'DBPIA',
}

# Fail early with a clear message instead of hitting an unbound collection
# variable further down.
unknown_sites = {site1, site2} - site_databases.keys()
if unknown_sites:
    raise ValueError(f'unsupported site(s): {sorted(unknown_sites)}')

site_raw1 = client[site_databases[site1]]['Rawdata']
site_raw2 = client[site_databases[site2]]['Rawdata']

input_raw1 = site_raw1.find_one({'_id': input_paper1})
input_raw2 = site_raw2.find_one({'_id': input_paper2})

# find_one returns None when no document matches the filter.
if input_raw1 is None:
    raise LookupError(f'paper {input_paper1} not found in {site1}')
if input_raw2 is None:
    raise LookupError(f'paper {input_paper2} not found in {site2}')

# Kept as the last bare expression so the notebook displays the score.
test_filter(name, site1, input_raw1, site2, input_raw2)


6