In [1]:
import os
import time

import Levenshtein
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from tqdm import tqdm


def _google_search(query, limit=10):
    return list(search(query, lang="en", num=5, stop=3))

def _extract_abstract(paper, true_title):
    paper_soup = BeautifulSoup(paper.content, 'html5lib')
    paper_title = paper_soup.find('h1', class_="title mathjax").contents[1].replace('\n', '')
    if Levenshtein.distance(paper_title.lower(), true_title.lower()) > (len(paper_title) / 5):  # 論文名が違う場合、それは対象外とする
        return None

    paper_abst = paper_soup.find('blockquote', class_='abstract mathjax').contents[-1].replace('\n', ' ')
    return paper_abst

def _to_txtfile(title, abst, out_txt_filepath):
    if '/' in title:
        title = title.replace('/', '-')
    with open(f'{out_txt_filepath}/{title}.txt', 'w') as f:
        f.write(abst)

def google_search_and_extract_abst(title, out_txt_filepath, timeout=30):
    """Search for ``title`` and save the abstract of the first matching arXiv page.

    Each URL returned by ``_google_search`` that contains ``arxiv.org/abs/``
    is downloaded and passed to ``_extract_abstract``; the first non-empty
    abstract is written with ``_to_txtfile``.

    Args:
        title: Paper title to search for.
        out_txt_filepath: Directory that receives ``<title>.txt``.
        timeout: Seconds to wait for each page download before giving up on
            that URL.

    Returns:
        ``'success'`` when an abstract was written, otherwise ``'failed'``.
    """
    for result_url in _google_search(title):
        if 'arxiv.org/abs/' not in result_url:
            continue

        try:
            # Without a timeout a stalled connection would block forever.
            paper = requests.get(result_url, timeout=timeout)
        except requests.RequestException as e:
            # A single unreachable page only disqualifies this URL.
            print(e)
            continue

        # Error responses do not contain a paper page; skip them.
        if not paper.ok:
            continue

        abst = _extract_abstract(paper, title)
        if abst:
            _to_txtfile(title, abst, out_txt_filepath)
            return 'success'
    return 'failed'

def fetch_abst_by_conference(in_txt_filepath, out_txt_filepath, sleep_sec=10):
    """Fetch and save abstracts for every title listed in a text file.

    Args:
        in_txt_filepath: Text file with one paper title per line. Blank lines
            are ignored and surrounding whitespace is stripped.
        out_txt_filepath: Directory passed on to
            ``google_search_and_extract_abst`` for the abstract files.
        sleep_sec: Seconds to pause after each title.

    Returns:
        list: Titles whose abstract was fetched successfully. If any
        exception occurs, it is printed, processing stops, and the titles
        collected so far are returned.
    """
    with open(in_txt_filepath, 'r') as f:
        lines = f.read().split('\n')
    # Drop blank lines (such as the one left by a trailing newline) so no
    # search request or pause is spent on an empty title.
    titles = [line.strip() for line in lines if line.strip()]

    fetch_ok_list = []

    for title in tqdm(titles):
        try:
            result = google_search_and_extract_abst(title, out_txt_filepath)
            time.sleep(sleep_sec)  # pause between titles (original note: "too long?")
            if result == 'success':
                fetch_ok_list.append(title)
        except Exception as e:
            print(e)
            break

    return fetch_ok_list

In [2]:
# Conference identifier; it selects both the input title list and the output folder.
conf_name = 'emnlp_2015'

# Input: text file of paper titles. Output: directory for the abstract files.
in_txt_filepath, out_txt_filepath = (
    f'./text/{conf_name}.txt',
    f'./text/abst_text/{conf_name}',
)

In [3]:
# Long-running cell: every title triggers a web search followed by a pause.
fetch_ok_list = fetch_abst_by_conference(
    in_txt_filepath=in_txt_filepath,
    out_txt_filepath=out_txt_filepath,
)

100%|██████████| 331/331 [1:12:23<00:00, 13.12s/it]


In [10]:
# Number of titles whose abstract was fetched and written to disk.
len(fetch_ok_list)

38

- acl_2017: 146