# キーワードに対してLegend論文とBuzz論文を表示
## モチベーション
初学者をターゲットに，読んでおくべきLegend論文と最近バズっているBuzz論文を提示する

In [1]:
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### キーワード, 取得件数, Buzz論文の対象年を設定
`year_lo`で指定した年から現在までの論文をBuzz論文として探す．`number`は各検索で取得する論文の件数

In [2]:
# Search phrase as a person would type it; it is URL-encoded on the next
# line so it can be appended to the Google Scholar query string.
keyword = "machine learning"
# quote_plus turns spaces into "+" (the previous behaviour) and additionally
# escapes characters such as "&", "#" or "+" that would otherwise be read
# as query-string syntax and corrupt the search.
keyword = quote_plus(keyword)
# Number of results requested from each search (sent as the `num` parameter).
number = 3
# Earliest publication year for the Buzz search (sent as `as_ylo`).
year_lo = 2019

### Legend論文を探索


In [3]:
# Endpoint and the query parameters shared by the Legend and Buzz searches.
# The original URLs were missing the "&" between `as_sdt=0%2C5` and
# `as_vis=1`, which fused the two into one malformed `as_sdt` value.
scholar_url = "https://scholar.google.co.jp/scholar"
common_query = (
    "hl=ja&as_sdt=0%2C5&as_vis=1&num="
    + str(number)
    + "&q="
    + keyword
)

# Legend search: no year restriction.
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors here instead of letting an error
# page be parsed as if it were a result list.
response_legend = requests.get(scholar_url + "?" + common_query, timeout=30)
response_legend.raise_for_status()
html_legend = response_legend.text

# Buzz search: same query restricted to papers from `year_lo` onwards.
response_year = requests.get(
    scholar_url + "?as_ylo=" + str(year_lo) + "&" + common_query,
    timeout=30,
)
response_year.raise_for_status()
html_year = response_year.text

# Pull the pieces of the Legend result page that the next cell reads.
soupl = BeautifulSoup(html_legend, "html.parser")
tags1 = soupl.find_all("h3", {"class": "gs_rt"})  # result titles
tags2 = soupl.find_all("div", {"class": "gs_a"})  # author / venue / year bylines
# `string=` is the current name of the deprecated `text=` argument.
tags3 = soupl.find_all(string=re.compile("引用元"))  # citation-count strings
tags4 = soupl.find_all("div", {"class": "gs_fl"})  # blocks searched for the paper id

### 情報を整理して表示

In [4]:
# Parallel lists: index i of every list describes the same Legend paper.
title_list = []
id_list = []
author_list = []
year_list = []
ci_num_list = []

# Patterns compiled once, outside the loop.
label_pattern = re.compile(r"\[(PDF|書籍|B|HTML)\]")
# Everything from the first whitespace-delimited hyphen onwards; the trailing
# whitespace before the hyphen is kept, as it was before.
byline_tail_pattern = re.compile(r"(?<=\s)-\s.*")
# A standalone four-digit number that looks like a publication year.
year_pattern = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")
paper_id_pattern = re.compile(r"cites=[0-9]*&|amp;d=[0-9]*&")

# Never index past what the page actually returned. tags4 is read at every
# second index, so it can serve (len(tags4) + 1) // 2 rows.
# NOTE(review): the i * 2 stride and the 1:1 pairing of tags3 with results
# assume a fixed page layout (two gs_fl blocks and one "引用元" string per
# result) — confirm against the live markup.
available = min(
    number,
    len(tags1),
    len(tags2),
    len(tags3),
    (len(tags4) + 1) // 2,
)
for i in range(available):
    # Title: drop the "[PDF]"-style labels and join words with underscores.
    title = label_pattern.sub("", tags1[i].text)
    title = "_".join(title.split(" "))
    # startswith is safe on an empty title, unlike title[0].
    if title.startswith("_"):
        title = title[1:]
    title_list.append(title)

    byline = tags2[i].text

    # Year: take the last year-like token so digits inside the venue (for
    # example an arXiv identifier) are not mistaken for the year. If no such
    # token exists, fall back to the previous "first four digits" rule.
    years = year_pattern.findall(byline)
    if years:
        year_list.append(years[-1])
    else:
        year_list.append(re.sub(r"\D", "", byline)[0:4])

    # Authors: cut at the " - " separator rather than at the first hyphen, so
    # hyphenated surnames stay intact. If no whitespace-delimited separator
    # is present, fall back to the previous first-hyphen cut.
    author, cut = byline_tail_pattern.subn("", byline)
    if not cut:
        author = re.sub(r"-.*", "", byline)
    author_list.append(re.sub(r"\d", "", author))

    ci_num_list.append(tags3[i].replace("引用元", ""))

    # Paper id: digits of the first matching link parameter; an empty string
    # is stored when the block carries no id instead of raising on None.
    p_id = paper_id_pattern.search(str(tags4[i * 2]))
    id_list.append(re.sub(r"\D", "", p_id.group(0)) if p_id else "")

In [5]:
print("Legend Papers")
# Iterate over the records that were actually collected (at most `number`)
# instead of indexing every list with range(number), which raises IndexError
# when fewer results are available than were requested.
legend_rows = list(
    zip(title_list, author_list, year_list, ci_num_list, id_list)
)
for row_title, row_author, row_year, row_citations, row_id in legend_rows[:number]:
    print('--------------------------------------------------')
    print("Title: ", row_title)
    print("Author: ", row_author)
    print("Published  year: ", row_year)
    print("Number of　citation: ", row_citations)
    print("Paper ID: ", row_id)
    print('--------------------------------------------------')

Legend Papers
--------------------------------------------------
Title:  Machine_learning
Author:  D Michie, DJ Spiegelhalter… 
Published  year:  1994
Number of　citation:   4095
Paper ID:  15792719233775414370
--------------------------------------------------
--------------------------------------------------
Title:  Machine_learning:_Trends,_perspectives,_and_prospects
Author:  MI Jordan, TM Mitchell 
Published  year:  2015
Number of　citation:   2220
Paper ID:  10883068066968164261
--------------------------------------------------
--------------------------------------------------
Title:  Foundations_of_machine_learning
Author:  M Mohri, A Rostamizadeh, A Talwalkar 
Published  year:  2018
Number of　citation:   2634
Paper ID:  12992542314861780001
--------------------------------------------------


### Buzz論文を探索, 情報整理, 表示

In [6]:
# Pull the pieces of the Buzz (year-restricted) result page.
soupy = BeautifulSoup(html_year, "html.parser")
tags1 = soupy.find_all("h3", {"class": "gs_rt"})  # result titles
tags2 = soupy.find_all("div", {"class": "gs_a"})  # author / venue / year bylines
# `string=` is the current name of the deprecated `text=` argument.
tags3 = soupy.find_all(string=re.compile("引用元"))  # citation-count strings
tags4 = soupy.find_all("div", {"class": "gs_fl"})  # blocks searched for the paper id

# Parallel lists: index i of every list describes the same Buzz paper.
title_listy = []
id_listy = []
author_listy = []
year_listy = []
ci_num_listy = []

# Patterns compiled once, outside the loop.
label_pattern = re.compile(r"\[(PDF|書籍|B|HTML)\]")
# Everything from the first whitespace-delimited hyphen onwards; the trailing
# whitespace before the hyphen is kept, as it was before.
byline_tail_pattern = re.compile(r"(?<=\s)-\s.*")
# A standalone four-digit number that looks like a publication year.
year_pattern = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")
paper_id_pattern = re.compile(r"cites=[0-9]*&|amp;d=[0-9]*&")

# Never index past what the page actually returned. tags4 is read at every
# second index, so it can serve (len(tags4) + 1) // 2 rows.
# NOTE(review): the i * 2 stride and the 1:1 pairing of tags3 with results
# assume a fixed page layout (two gs_fl blocks and one "引用元" string per
# result) — confirm against the live markup.
available = min(
    number,
    len(tags1),
    len(tags2),
    len(tags3),
    (len(tags4) + 1) // 2,
)
for i in range(available):
    # Title: drop the "[PDF]"-style labels and join words with underscores.
    title = label_pattern.sub("", tags1[i].text)
    title = "_".join(title.split(" "))
    # startswith is safe on an empty title, unlike title[0].
    if title.startswith("_"):
        title = title[1:]
    title_listy.append(title)

    byline = tags2[i].text

    # Year: take the last year-like token so digits inside the venue (for
    # example an arXiv identifier) are not mistaken for the year. If no such
    # token exists, fall back to the previous "first four digits" rule.
    years = year_pattern.findall(byline)
    if years:
        year_listy.append(years[-1])
    else:
        year_listy.append(re.sub(r"\D", "", byline)[0:4])

    # Authors: cut at the " - " separator rather than at the first hyphen, so
    # hyphenated surnames stay intact. If no whitespace-delimited separator
    # is present, fall back to the previous first-hyphen cut.
    author, cut = byline_tail_pattern.subn("", byline)
    if not cut:
        author = re.sub(r"-.*", "", byline)
    author_listy.append(re.sub(r"\d", "", author))

    ci_num_listy.append(tags3[i].replace("引用元", ""))

    # Paper id: digits of the first matching link parameter; an empty string
    # is stored when the block carries no id instead of raising on None.
    p_id = paper_id_pattern.search(str(tags4[i * 2]))
    id_listy.append(re.sub(r"\D", "", p_id.group(0)) if p_id else "")

In [7]:
print("Buzz Papers")
# Iterate over the records that were actually collected (at most `number`)
# instead of indexing every list with range(number), which raises IndexError
# when fewer results are available than were requested.
buzz_rows = list(
    zip(title_listy, author_listy, year_listy, ci_num_listy, id_listy)
)
for row_title, row_author, row_year, row_citations, row_id in buzz_rows[:number]:
    print('--------------------------------------------------')
    print("Title: ", row_title)
    print("Author: ", row_author)
    print("Published  year: ", row_year)
    print("Number of　citation: ", row_citations)
    print("Paper ID: ", row_id)
    print('--------------------------------------------------')

Buzz Papers
--------------------------------------------------
Title:  Machine_learning_and_the_physical_sciences
Author:  G Carleo, I Cirac, K Cranmer, L Daudet, M Schuld… 
Published  year:  2019
Number of　citation:   302
Paper ID:  3129578775481411450
--------------------------------------------------
--------------------------------------------------
Title:  Machine_learning_in_materials_science
Author:  J Wei, X Chu, XY Sun, K Xu, HX Deng, J Chen, Z Wei… 
Published  year:  2019
Number of　citation:   42
Paper ID:  2920469230596362832
--------------------------------------------------
--------------------------------------------------
Title:  Interpretable_Machine_Learning
Author:  C Molnar 
Published  year:  2020
Number of　citation:   329
Paper ID:  18424866663209148682
--------------------------------------------------
