In [1]:
import warnings
warnings.filterwarnings("ignore")

from utils import dataset, load_bow
import pandas as pd
from datetime import datetime
from itertools import chain
import numpy as np
import requests
from bs4 import BeautifulSoup
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time
from names_matcher import NamesMatcher
from tqdm import tqdm
import random
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Selects which precomputed file ./KLD/distributions_{w}.csv is loaded further down
# and which of its columns (those whose name ends with str(w)) are copied into `venues`.
# NOTE(review): presumably a window size used when the distributions were computed - confirm.
w = 1

In [3]:
# Columns of the raw dataset that this notebook works with.
col = [
    'Publication Type',
    'Venue',
    'Author Full Names',
    'Article Title',
    'Abstract',
    'Publication Date',
    'Publication Year']

# Keep only papers that have an abstract, then attach the lemmatized abstracts.
venues = dataset()[col]
venues = venues.dropna(subset=['Abstract'])
# NOTE(review): assumes load_bow('25_venues') yields one entry per row that
# survives the Abstract filter above - confirm against utils.load_bow.
venues['Abstract Lemmatized'] = load_bow('25_venues')

# Build a real date from the abbreviated month name and the year; both columns
# are concatenated as strings and parsed with the '%b-%Y' format.
venues = venues.dropna(subset=['Publication Date'])
venues['Date'] = (venues['Publication Date']+'-'+venues['Publication Year']).apply(lambda x: datetime.strptime(x, '%b-%Y'))

# Translate each venue key into the value of the 'Nome' column of the query table,
# which is indexed by its second CSV column.
query_dataframe = pd.read_csv('./database/query_dataframe.csv', index_col=1)
venues['Venue Full Name'] = venues['Venue'].apply(lambda x : query_dataframe.loc[x, 'Nome'])

# Chronological order with a fresh 0..n-1 index, so that the column assignments
# below line up with the default integer index of the distributions CSV.
# NOTE(review): presumably the CSV rows were written in this same order - confirm.
venues = venues.sort_values(by='Date')
venues.index = range(len(venues))

df = pd.read_csv(f'./KLD/distributions_{w}.csv')
venues['LDA Distribution'] = df['LDA Distribution']

# Copy every column whose name ends with the current value of w.
# endswith() is used instead of comparing only the last character, which could
# never match once str(w) is longer than one character (w >= 10).
for c in [co for co in df.columns if co.endswith(str(w))]:
    venues[c] = df[c]

In [4]:
# Sorted list of the distinct author names found in `venues`.
# Each 'Author Full Names' cell is a ';'-separated list; names are compared
# after trimming surrounding whitespace and upper-casing.
authors = sorted({
    raw_name.strip().upper()
    for cell in venues['Author Full Names']
    for raw_name in cell.split(';')
})

In [5]:
# Map every normalized author name to the list of article titles they appear on.
# Keys are created up front, in the sorted order of `authors`.
authors_dict = {author: [] for author in authors}

# Walk the two relevant columns in parallel and file each title under every
# co-author, using the same strip + upper normalization as the `authors` list.
for names_cell, title in zip(venues['Author Full Names'], venues['Article Title']):
    for author in names_cell.split(';'):
        authors_dict[author.upper().strip()].append(title)

In [6]:
# One row per author. 'Papers' stores the title list as its string repr, which
# is how the later cells read it back after the table has been saved to CSV.
authors_df = pd.Series(
    {author: str(titles) for author, titles in authors_dict.items()},
    name='Papers',
    dtype=object,
).to_frame()

# Count the publications straight from the lists instead of eval()-ing the
# string that was just built: the round trip raised NameError whenever a title
# was NaN (its repr, `nan`, is not a valid literal) and executed arbitrary text.
authors_df['publications'] = pd.Series(
    {author: len(titles) for author, titles in authors_dict.items()},
    dtype='int64',
)

In [11]:
# Authors with at least 37 publications, ordered by name, with an empty 'pids'
# column that the lookup cells fill in later. The table is saved to disk so
# those cells can start from the CSV.
top = (
    authors_df[authors_df['publications'] >= 37]
    .sort_index()
    .assign(pids=np.nan)
)
top.to_csv('./database/top1000authors.csv')

In [78]:
def find_author(name, authors):
    """Match ``name`` against the candidate list ``authors`` with NamesMatcher.

    A fresh NamesMatcher is built on every call and invoked with ``[name]`` and
    ``authors``; element ``[0][0]`` of its result is returned. Callers in this
    notebook use that value as a position in the list of authors they passed in.
    NOTE(review): presumably result[0] holds the matched indices - confirm
    against the names_matcher documentation.
    """
    matcher = NamesMatcher()
    result = matcher([name], authors)
    first_component = result[0]
    return first_component[0]


def make_request(session, pub, retries=10, backoff_factor=1, timeout=30):
    """Query the DBLP publication search API and return the response body.

    Parameters
    ----------
    session : object with a requests-style ``get`` method (e.g. requests.Session)
    pub : search text; converted with ``str()`` before being sent
    retries : maximum number of attempts
    backoff_factor : seconds slept before every attempt, and the base of the
        exponential wait after a failed attempt
    timeout : seconds to wait for the server before the attempt counts as failed

    Returns
    -------
    The response text (XML) of the first attempt that is not answered with
    HTTP 429, or ``None`` when ``retries`` is 0.

    Raises
    ------
    requests.exceptions.RequestException
        When the last allowed attempt fails or is answered with HTTP 429.
    """
    url = 'https://dblp.org/search/publ/api'
    # Hand the query to requests as parameters so it gets percent-encoded;
    # pasting the title into the URL broke queries containing '&', '#' or '+'.
    params = {'q': str(pub), 'format': 'xml'}

    for retry in range(retries):
        try:
            time.sleep(backoff_factor)  # Wait for at least a second between requests
            # Without a timeout a stalled connection would block its worker forever.
            response = session.get(url, params=params, timeout=timeout)
            if response.status_code == 429:  # Too Many Requests
                raise requests.exceptions.RequestException("Too Many Requests")
            return response.text
        except requests.exceptions.RequestException:
            if retry >= retries - 1:
                raise
            # Exponential backoff with a little jitter before the next attempt.
            time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 0.1))
from collections import Counter           
async def start_async_process():
    """Look up the DBLP pid of the author held in the notebook global ``name``.

    Reads the globals ``pubs`` (titles to search for), ``name`` (author being
    resolved) and ``top`` (author table). One "<title> <name>" search per entry
    of ``pubs`` is sent through a pool of 6 worker threads. A search that
    reports zero computed hits is retried up to 10 times with another randomly
    chosen title of the author; from the 7th retry on the title is searched
    without the author name. These follow-up searches are made directly in
    this coroutine, one at a time.

    For every answer that lists authors, the author picked by ``find_author``
    contributes its pid, cut at the first '-'. If at least one pid was
    collected, the most frequent one is written to ``top.loc[name, 'pids']``
    and the repr of the full pid list to ``top.loc[name, 'names']``.
    """
    with ThreadPoolExecutor(max_workers=6) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            tasks = [
                loop.run_in_executor(executor, make_request, session, f'{title} {name}')
                for title in pubs
            ]
            pids = []
            for response in await asyncio.gather(*tasks):
                soup = BeautifulSoup(response, 'xml')
                retries = 0
                while (hits := soup.find('hits')) and hits['computed'] == '0' and retries < 10:
                    # NOTE(review): eval() executes whatever text is stored in the
                    # 'Papers' column; only safe while the CSV is produced by this
                    # notebook. ast.literal_eval would be the safer reader.
                    # random.choice yields the title itself; random.sample(..., 1)
                    # returned a one-element list whose repr "['...']" was sent
                    # to DBLP as the query.
                    pub = random.choice(eval(top.loc[name]['Papers']))
                    if retries > 5:
                        req = make_request(session, f'{pub}')
                    else:
                        req = make_request(session, f'{pub} {name}')
                    soup = BeautifulSoup(req, 'xml')
                    retries += 1
                author_tags = soup.find_all('author')
                if author_tags:
                    author_pids = [tag['pid'] for tag in author_tags]
                    author_names = [tag.text for tag in author_tags]
                    pids.append(author_pids[find_author(name, author_names)].split('-')[0])
            if pids:
                top.loc[name, 'pids'] = Counter(pids).most_common(1)[0][0]
                top.loc[name, 'names'] = str(pids)

In [79]:
# Resume from the saved author table and resolve every author that still has no pid.
top = pd.read_csv('./database/top1000authors.csv')
top = top.set_index('Unnamed: 0')

# pids are strings, but an all-NaN column is read back as float64; switch to
# object dtype up front so that writing a string into it is not an implicit upcast.
top['pids'] = top['pids'].astype(object)
# Keep the 'names' values of an earlier run instead of wiping them on every re-run.
if 'names' not in top.columns:
    top['names'] = np.nan
top['names'] = top['names'].astype(object)

# First pass: concurrent "<title> <author>" searches.
# start_async_process reads the globals `name`, `pubs` and `top`.
missing = top[top['pids'].isna()].index
for name in tqdm(missing):
    # NOTE(review): eval() executes whatever text is stored in the 'Papers'
    # column; only safe while the CSV is produced by this notebook.
    papers = eval(top.loc[name]['Papers'])
    # Never ask for more titles than the author has.
    pubs = random.sample(papers, min(11, len(papers)))
    # asyncio.run() is given the coroutine itself; passing a Future only worked
    # because nest_asyncio replaces asyncio.run.
    asyncio.run(start_async_process())

# Checkpoint: the pass above can run for hours, so do not keep it only in memory.
top.to_csv('./database/top1000authors.csv')

# Second pass: sequential, title-only searches for whoever is still unresolved.
still_missing = top[top['pids'].isna()].index
for name in tqdm(still_missing):
    papers = eval(top.loc[name]['Papers'])
    pubs = random.sample(papers, min(11, len(papers)))
    pids = []
    with requests.Session() as session:
        for pub in pubs:
            req = make_request(session, f'{pub}')
            soup = BeautifulSoup(req, 'xml')
            retries = 0
            while (hits := soup.find('hits')) and hits['computed'] == '0' and retries < 10:
                # random.choice yields the title itself; random.sample(..., 1)
                # returned a one-element list whose repr "['...']" was sent as
                # the query.
                pub = random.choice(papers)
                if retries > 5:
                    req = make_request(session, f'{pub}')
                else:
                    req = make_request(session, f'{pub} {name}')
                soup = BeautifulSoup(req, 'xml')
                retries += 1
            author_tags = soup.find_all('author')
            if author_tags:
                author_pids = [tag['pid'] for tag in author_tags]
                author_names = [tag.text for tag in author_tags]
                pids.append(author_pids[find_author(name, author_names)].split('-')[0])

    if pids:
        # Majority vote over the pids collected for this author.
        top.loc[name, 'pids'] = Counter(pids).most_common(1)[0][0]
        top.loc[name, 'names'] = str(pids)

top.to_csv('./database/top1000authors.csv')

100%|██████████| 895/895 [6:28:58<00:00, 26.08s/it]   
0it [00:00, ?it/s]


In [62]:
# Manual correction: set the pid of the row 'LI, LING' to 53/2189 by hand.
# If that label is not in the index, .loc assignment adds a new row instead.
# NOTE(review): the manual lookup cell in this notebook queries name = 'LI, LI',
# and the DBLP answer shown there lists pid 53/2189 as "Li Li" - confirm that
# 'LI, LING' (and not 'LI, LI') is the row meant here.
top.loc['LI, LING', 'pids'] = '53/2189'

In [80]:
# Show every author row whose pid also appears on at least one other row,
# grouped by pid, to spot name variants of the same person.
top.loc[top['pids'].duplicated(keep=False)].sort_values(by='pids')

Unnamed: 0_level_0,Papers,publications,pids,names
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"WANG, XIZHAO",['Reduction of attributes in ordinal decision ...,38,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"WANG, XZ",['Neural network based fault diagnosis using u...,68,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"WANG, XI-ZHAO",['A research on weight acquisition of weighted...,47,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"ZHU, SC","['Region competition: Unifying snakes, region ...",37,10/10313,"['10/10313', '10/10313', '10/10313', '10/10313..."
"ZHU, SONG-CHUN",['Compositional boosting for computing hierarc...,127,10/10313,"['10/10313', '10/10313', '10/10313', '10/10313..."
"YEUNG, DANIEL S.",['Sensitivity analysis of Madalines to weight ...,49,36/896,"['36/896', '36/896', '36/896', '36/896', '36/8..."
"YEUNG, DS",['FUZZY KNOWLEDGE REPRESENTATION AND REASONING...,47,36/896,"['36/896', '36/896', '36/896', '36/896', '36/8..."
YOSHUA BENGIO,['A Structured Self-Attentive Sentence Embeddi...,61,56/953,"['56/953', '56/953', '56/953', '56/953', '56/9..."
"BENGIO, YOSHUA",['Exploring Strategies for Training Deep Neura...,123,56/953,"['56/953', '56/953', '56/953', '56/953', '56/9..."
"LEVINE, SERGEY",['Learning Complex Neural Network Policies wit...,64,80/7594,"['80/7594', '80/7594', '80/7594', '80/7594', '..."


In [54]:
# Cursor into the paper list of the author inspected in the manual lookup cell.
# That cell increments it before use, so -1 makes its first run look at paper 0.
i = -1

In [61]:
# Manual inspection: every run of this cell moves on to the next paper of `name`
# and displays the raw DBLP answer for its title. Note that it overwrites the
# global `name`, which start_async_process also reads.
# To start from the saved table instead of the one in memory, first run:
# top = pd.read_csv('./database/top1000authors.csv').sort_values(by='publications', ascending=False)
# top.set_index('Unnamed: 0', inplace=True)
name = 'LI, LI'
i += 1
# NOTE(review): eval() executes whatever text is stored in the 'Papers' column;
# only safe while the CSV is produced by this notebook.
pub = eval(top.loc[name]['Papers'])[i]
url = 'https://dblp.org/search/publ/api'
# Send the title as a parameter so it gets percent-encoded ('&' or '#' in a
# title used to cut the query short), and bound the wait for the server.
request = requests.get(url, params={'q': pub, 'format': 'xml'}, timeout=30)

soup = BeautifulSoup(request.text, 'xml')
# "pid | name" for each author of the hits, for eyeballing the right pid.
author_pids = [f"{tag['pid']} | {tag.text}" for tag in soup.find_all('author')]
soup

<?xml version="1.0" encoding="utf-8"?>
<result>
<query id="181828">CONTEXT* AWARE* REASONING* MIDDLE* WARE* APPLIED* IN THE* MOBILE* ENVIRONMENT*</query>
<status code="200">OK</status>
<time unit="msecs">270.88</time>
<completions computed="1" sent="1" total="1">
<c dc="1" id="48913645" oc="1" sc="1">environment</c>
</completions>
<hits computed="1" first="0" sent="1" total="1">
<hit id="3605149" score="10">
<info><authors><author pid="96/2744">Jian Wu</author><author pid="69/1535">Chunping Li</author><author pid="122/3274">Yishu Miao</author><author pid="20/3883">Shaoxu Song</author><author pid="53/2189">Li Li</author><author pid="04/3021">Qiang Ding</author></authors><title>Context-aware reasoning middle ware applied in the mobile environment.</title><venue>ICMLC</venue><pages>1829-1835</pages><year>2013</year><type>Conference and Workshop Papers</type><access>closed</access><key>conf/icmlc/WuLMSLD13</key><doi>10.1109/ICMLC.2013.6890894</doi><ee>https://doi.org/10.1109/ICMLC.2013.689