In [1]:
import warnings
warnings.filterwarnings("ignore")

import asyncio
import random
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from itertools import chain
from urllib.parse import quote

import nest_asyncio
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from names_matcher import NamesMatcher
from tqdm import tqdm

from utils import dataset, load_bow

nest_asyncio.apply()

In [46]:
def find_author(name, authors):
    """Return the position in ``authors`` that NamesMatcher pairs with ``name``.

    ``name`` is wrapped in a one-element list because the matcher is called
    with two sequences; the first entry of the first item it returns is handed
    back to the caller, which uses it as an index into ``authors``.

    NOTE(review): what the matcher returns when nothing matches is not visible
    in this notebook -- confirm before indexing with the result.
    """
    matcher = NamesMatcher()
    outcome = matcher([name], authors)
    first_item = outcome[0]
    return first_item[0]


def make_request(session, pub, retries=10, backoff_factor=1, timeout=30):
    """Query the dblp publication-search API and return the raw XML body.

    Parameters
    ----------
    session : requests.Session (or any object with a compatible ``get``)
        Session used to issue the GET request.
    pub : str
        Free-text query, e.g. a paper title optionally followed by a name.
    retries : int
        Maximum number of attempts.
    backoff_factor : float
        Seconds slept before every attempt; also the base of the exponential
        back-off applied after a failed attempt.
    timeout : float
        Seconds to wait for the server before the attempt counts as failed,
        so a stalled connection cannot block the run indefinitely.

    Returns
    -------
    str or None
        ``response.text`` of the first attempt that is not answered with
        HTTP 429. ``None`` only when ``retries`` is 0 or negative.

    Raises
    ------
    requests.exceptions.RequestException
        Re-raised from the last attempt once all retries are used up.
    """
    # Percent-encode the query: titles containing '&', '#', '+' or '%' would
    # otherwise be cut short or reinterpreted as extra URL components.
    query = quote(str(pub), safe='')
    url = f'https://dblp.org/search/publ/api?q={query}&format=xml'

    for retry in range(retries):
        try:
            time.sleep(backoff_factor)  # Wait for at least a second between requests
            response = session.get(url, timeout=timeout)
            if response.status_code == 429:  # Too Many Requests
                raise requests.exceptions.RequestException("Too Many Requests")
            return response.text
        except requests.exceptions.RequestException:
            if retry >= retries - 1:
                raise  # out of attempts: surface the last error unchanged
            # Exponential back-off with a little jitter before the next attempt.
            time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 0.1))
from collections import Counter           
async def start_async_process():
    """Resolve the dblp pid of the author held in the global ``name``.

    Reads the notebook globals ``name`` (author label), ``pubs`` (titles to
    query) and ``top`` (author DataFrame), so they must be set before the
    coroutine is run. One dblp search per title in ``pubs`` is issued through
    a thread pool; for each response the author entry that ``find_author``
    pairs with ``name`` contributes its pid. The most frequent pid is written
    to ``top.loc[name, 'pids']`` and the stringified list of all collected
    pids to ``top.loc[name, 'names']``. Nothing is written when no pid was
    found. Returns ``None``.
    """
    with ThreadPoolExecutor(max_workers=6) as executor:
        with requests.Session() as session:
            loop = asyncio.get_running_loop()
            tasks = [
                loop.run_in_executor(executor, make_request, session, f'{title} {name}')
                for title in pubs
            ]
            pids = []
            for response in await asyncio.gather(*tasks):
                soup = BeautifulSoup(response, 'xml')
                retries = 0
                # A search with zero computed hits is retried with another
                # randomly chosen paper of the same author; after six failed
                # tries the author name is dropped from the query.
                while (hits := soup.find('hits')) and hits['computed'] == '0' and retries < 10:
                    # SECURITY: eval() executes whatever the CSV cell contains;
                    # only run this against a trusted file.
                    # random.choice yields the title itself. The previous
                    # random.sample(..., 1) produced a one-element list whose
                    # repr ("['Title']") was then sent as the query.
                    pub = random.choice(eval(top.loc[name]['Papers']))
                    query = f'{pub}' if retries > 5 else f'{pub} {name}'
                    # These fallback lookups run synchronously (make_request sleeps).
                    soup = BeautifulSoup(make_request(session, query), 'xml')
                    retries += 1
                authors = soup.find_all('author')
                if not authors:
                    continue
                match = find_author(name, [author.text for author in authors])
                # NOTE(review): a negative index would silently select an author
                # counted from the end of the list; names_matcher appears to
                # use one for "no match" -- confirm against its documentation.
                if match < 0:
                    continue
                # Keep only the part of the pid before the first '-'.
                pids.append(authors[match]['pid'].split('-')[0])
            if pids:
                top.loc[name, 'pids'] = Counter(pids).most_common(1)[0][0]
                top.loc[name, 'names'] = str(pids)

In [79]:
# Load the author table and fill in the dblp pid of every author that lacks one.
top = pd.read_csv('./database/top_authors/top1000authors.csv')
top.set_index('Unnamed: 0', inplace=True)
missing = top[top['pids'].isna()].index
# Object dtype: the column later receives stringified pid lists, which do not
# fit into the float column a bare np.nan assignment would create.
top['names'] = pd.Series(np.nan, index=top.index, dtype=object)

# Pass 1: concurrent lookups. start_async_process() reads the globals `name`,
# `pubs` and `top`, so both loop variables must keep exactly these names.
for name in tqdm(missing):
    # SECURITY: eval() executes whatever the CSV cell contains; trusted files only.
    papers = eval(top.loc[name]['Papers'])
    # Sample at most 11 titles; random.sample raises ValueError when asked for
    # more items than the list holds.
    pubs = random.sample(papers, min(11, len(papers)))
    # Hand asyncio.run the coroutine itself rather than a pre-scheduled Future.
    asyncio.run(start_async_process())

# Pass 2: sequential lookups by title only for authors that are still unresolved.
still_missing = top[top['pids'].isna()].index
for name in tqdm(still_missing):
    papers = eval(top.loc[name]['Papers'])
    pubs = random.sample(papers, min(11, len(papers)))
    pids = []
    with requests.Session() as session:
        for pub in pubs:
            soup = BeautifulSoup(make_request(session, f'{pub}'), 'xml')
            retries = 0
            # Zero computed hits: retry with another random paper of the author,
            # adding the author name to the query for the first six retries.
            while (hits := soup.find('hits')) and hits['computed'] == '0' and retries < 10:
                # random.choice yields the title string; random.sample(..., 1)
                # yielded a list whose repr was sent as the query.
                pub = random.choice(papers)
                query = f'{pub}' if retries > 5 else f'{pub} {name}'
                soup = BeautifulSoup(make_request(session, query), 'xml')
                retries += 1
            authors = soup.find_all('author')
            if not authors:
                continue
            match = find_author(name, [author.text for author in authors])
            # NOTE(review): a negative index would silently pick an author from
            # the end of the list; names_matcher appears to use one for
            # "no match" -- confirm against its documentation.
            if match < 0:
                continue
            pids.append(authors[match]['pid'].split('-')[0])

    if pids:
        # Majority vote over the sampled papers; keep every vote for later review.
        top.loc[name, 'pids'] = Counter(pids).most_common(1)[0][0]
        top.loc[name, 'names'] = str(pids)

top.to_csv('./database/1000authors_pids.csv')

100%|██████████| 895/895 [6:28:58<00:00, 26.08s/it]   
0it [00:00, ?it/s]


In [40]:
# Reload the author table and turn the stringified list columns back into lists.
top = pd.read_csv('./database/top1000authors.csv')
# SECURITY: eval() executes whatever the CSV cells contain; trusted files only.
# Only string cells are parsed; anything else is passed through unchanged.
top['names'] = top['names'].apply(
    lambda cell: eval(cell) if isinstance(cell, str) else cell
)
# 'duplicate' is True when all collected pids are identical, False when they
# differ, and NaN when the row has no pid list.
top['duplicate'] = top['names'].apply(
    lambda pid_list: np.unique(pid_list).size == 1 if isinstance(pid_list, list) else np.nan
)
top['Papers'] = top['Papers'].apply(lambda cell: eval(cell))
top = top.set_index('Unnamed: 0')

In [81]:
i = 2
# Authors whose sampled lookups did not all return the same pid.
ambiguous = top[top['duplicate'] == False]
row = ambiguous.iloc[i]
print(row)
print(row['names'])
papers = row['Papers']
# 'Papers' holds real lists once the reload cell has parsed it, and eval() on a
# list raises TypeError; only parse when the cell is still a string.
# SECURITY: eval() executes whatever the CSV cell contains; trusted files only.
papers if isinstance(papers, list) else eval(papers)

Papers          ['Image Fusion with Local Spectral Consistency...
publications                                                   65
pids                                                      65/4423
names           [65/4423, 65/4423, 65/4423, 67/6383, 65/4423, ...
duplicate                                                   False
Name: CHEN, CHEN, dtype: object
['65/4423', '65/4423', '65/4423', '67/6383', '65/4423', '65/4423', '65/4423', '67/6383', '65/4423', '65/4423', '65/4423']


['Image Fusion with Local Spectral Consistency and Dynamic Gradient Sparsity',
 'Preconditioning for Accelerated Iteratively Reweighted Least Squares in Structured Sparsity Reconstruction',
 'Chinese Overt Pronoun Resolution: A Bilingual Approach',
 'Chinese Zero Pronoun Resolution: An Unsupervised Approach Combining Ranking and Integer Linear Programming',
 'Sub-Selective Quantization for Large-Scale Image Search',
 'Privacy preserving growing neural gas over arbitrarily partitioned data',
 'Chinese Common Noun Phrase Resolution: An Unsupervised Probabilistic Model Rivaling Supervised Resolvers',
 'The graph based semi-supervised algorithm with l(1)-regularizer',
 'Deep Sparse Representation for Robust Image Registration',
 'Chinese Zero Pronoun Resolution: A Joint Unsupervised Discourse-Aware Model Rivaling State-of-the-Art Resolvers',
 'Robust Image Segmentation Using Contour-guided Color Palettes',
 'Analog circuit fault diagnosis based UCISVM',
 'Joint Inference over a Lightly Sup

In [31]:
# Author list recorded for one specific article title.
# NOTE(review): `venues` is not defined in any visible cell of this notebook.
list(venues.loc[venues['Article Title'] == 'Low-Resolution Gait Recognition', 'Author Full Names'])

['Zhang, Junping; Pu, Jian; Chen, Changyou; Fleischer, Rudolf']

In [22]:
# Papers of the i-th author whose lookups disagreed (`i` is set in another cell).
papers_cell = top[top['duplicate'] == False].iloc[i]['Papers']
# 'Papers' may already hold parsed lists, and eval() on a list raises
# TypeError; only parse when the cell is still a string.
# SECURITY: eval() executes whatever the CSV cell contains; trusted files only.
papers_cell if isinstance(papers_cell, list) else eval(papers_cell)

['Distance Approximating Dimension Reduction of Riemannian Manifolds',
 'Low-Resolution Gait Recognition',
 'Robust Bayesian Max-Margin Clustering',
 'Bayesian Sampling Using Stochastic Gradient Thermostats',
 'Differential Topic Models',
 'Scalable Deep Poisson Factor Analysis for Topic Modeling',
 'On the Convergence of Stochastic Gradient MCMC Algorithms with High-Order Integrators',
 'High-Order Stochastic Gradient Thermostats for Bayesian Learning of Deep Models',
 'Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural Networks',
 'Bridging the Gap between Stochastic Gradient MCMC and Stochastic Optimization',
 'Learning Weight Uncertainty with Stochastic Gradient MCMC for Shape Classification',
 'Nonlinear Statistical Learning with Truncated Gaussian Graphical Models',
 'Learning Structured Weight Uncertainty in Bayesian Neural Networks',
 'Scalable Bayesian Learning of Recurrent Neural Networks for Language Modeling',
 'Stochastic Gradient Monomial Gamma Sampler',

In [80]:
# Rows whose pid also occurs on at least one other row, grouped by pid.
top.loc[top.duplicated(subset='pids', keep=False)].sort_values(by='pids')

Unnamed: 0_level_0,Papers,publications,pids,names
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"WANG, XIZHAO",['Reduction of attributes in ordinal decision ...,38,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"WANG, XZ",['Neural network based fault diagnosis using u...,68,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"WANG, XI-ZHAO",['A research on weight acquisition of weighted...,47,02/4027,"['02/4027', '02/4027', '02/4027', '02/4027', '..."
"ZHU, SC","['Region competition: Unifying snakes, region ...",37,10/10313,"['10/10313', '10/10313', '10/10313', '10/10313..."
"ZHU, SONG-CHUN",['Compositional boosting for computing hierarc...,127,10/10313,"['10/10313', '10/10313', '10/10313', '10/10313..."
"YEUNG, DANIEL S.",['Sensitivity analysis of Madalines to weight ...,49,36/896,"['36/896', '36/896', '36/896', '36/896', '36/8..."
"YEUNG, DS",['FUZZY KNOWLEDGE REPRESENTATION AND REASONING...,47,36/896,"['36/896', '36/896', '36/896', '36/896', '36/8..."
YOSHUA BENGIO,['A Structured Self-Attentive Sentence Embeddi...,61,56/953,"['56/953', '56/953', '56/953', '56/953', '56/9..."
"BENGIO, YOSHUA",['Exploring Strategies for Training Deep Neura...,123,56/953,"['56/953', '56/953', '56/953', '56/953', '56/9..."
"LEVINE, SERGEY",['Learning Complex Neural Network Policies wit...,64,80/7594,"['80/7594', '80/7594', '80/7594', '80/7594', '..."


In [33]:
# Reset the paper cursor; the manual lookup cell below increments `i` before
# using it, so its first run looks at index 0.
i = -1

In [42]:
# top = pd.read_csv('./database/top1000authors.csv').sort_values(by='publications', ascending=False)
# top.set_index('Unnamed: 0', inplace=True)
name = 'CHEN, CHANGYOU'
i += 1  # deliberately stateful: every run of this cell steps to the next paper
papers = top.loc[name]['Papers']
if not isinstance(papers, list):
    # SECURITY: eval() executes whatever the CSV cell contains; trusted files only.
    papers = eval(papers)
pub = papers[i]
# Percent-encode the title so '&', '#', '+' or '%' cannot break the query string.
query = quote(str(pub), safe='')
url = f'https://dblp.org/search/publ/api?q={query}&format=xml'
# A timeout keeps a stalled connection from hanging the kernel.
request = requests.get(url, timeout=30)

soup = BeautifulSoup(request.text, 'xml')
# "pid | name" for every author entry in the response.
author_pids = [f"{author['pid']} | {author.text}" for author in soup.find_all('author')]
#author_pids[find_author(name, author_pids)]
soup

<?xml version="1.0" encoding="utf-8"?>
<result>
<query id="161206">Low* Resolution* Gait* Recognition*</query>
<status code="200">OK</status>
<time unit="msecs">28.93</time>
<completions computed="1" sent="1" total="1">
<c dc="5" id="49758263" oc="5" sc="5">recognition</c>
</completions>
<hits computed="5" first="0" sent="5" total="5">
<hit id="4842932" score="5">
<info><authors><author pid="02/5388">Junping Zhang</author><author pid="15/5135">Yuan Cheng</author><author pid="65/2802">Changyou Chen</author></authors><title>Low Resolution Gait Recognition with High Frequency Super Resolution.</title><venue>PRICAI</venue><pages>533-543</pages><year>2008</year><type>Conference and Workshop Papers</type><access>closed</access><key>conf/pricai/ZhangCC08</key><doi>10.1007/978-3-540-89197-0_49</doi><ee>https://doi.org/10.1007/978-3-540-89197-0_49</ee><url>https://dblp.org/rec/conf/pricai/ZhangCC08</url></info>
<url>URL#4842932</url>
</hit>
<hit id="1229172" score="4">
<info><authors><author pi

In [53]:
# Display the author label currently held in `name`.
name

'CHEN, CHANGYOU'

In [97]:
#authors = list(map(lambda x: f"{x['pid']}|||{x.text}" ,soup.findAll('author')))
# Sanity check: which candidate does the matcher pair with the label?
name = 'CHEN, CHANGYOU'
NamesMatcher()(
    [name],
    [
        '15/5135|||Yuan Cheng',
        '86/6151|||Yu Guan',
        '65/2802|||Changyou Chen',
        '02/5388|||Junping Zhang',
        '242/7240|||Viet-Ha Ho',
        '138/9974|||Huu-Hung Huynh',
        '115/7174|||Sruti Das Choudhury',
    ],
)[0][0]

2

In [91]:
# NOTE(review): no active line visible in this notebook assigns `authors`; the
# commented-out expression in the previous cell builds strings of the same
# 'pid|||name' form -- confirm where this value comes from.
authors

['02/5388|||Junping Zhang',
 '15/5135|||Yuan Cheng',
 '65/2802|||Changyou Chen',
 '242/7240|||Viet-Ha Ho',
 '138/9974|||Huu-Hung Huynh',
 '115/7174|||Sruti Das Choudhury',
 '86/6151|||Yu Guan',
 'l/ChangTsunLi|||Chang-Tsun Li',
 '39/9606|||Naoki Akae',
 '75/113|||Yasushi Makihara',
 '67/6296|||Yasushi Yagi',
 '02/5388|||Junping Zhang',
 '43/6295|||Jian Pu',
 '65/2802|||Changyou Chen',
 'f/RudolfFleischer|||Rudolf Fleischer']