## Checking versions
아래는 버전 확인을 위한 셀이며, 안돌아가면 돌리지 마세요..!

In [6]:
%load_ext version_information
import time
now = time.strftime("%Y-%m-%d %H:%M:%S (%Z = GMT%z)")
print(f"This notebook was generated at {now} ")

vv = %version_information requests, tqdm, pandas, astroquery, version_information
for i, pkg in enumerate(vv.packages):
    print(f"{i} {pkg[0]:10s} {pkg[1]:s}")

This notebook was generated at 2020-04-15 17:17:42 (KST = GMT+0900) 
0 Python     3.7.6 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]
1 IPython    7.13.0
2 OS         Darwin 18.7.0 x86_64 i386 64bit
3 requests   2.23.0
4 tqdm       4.43.0
5 pandas     1.0.3
6 astroquery 0.4.1.dev5959
7 version_information 1.0.3


## Importing and Setting Up

In [7]:
import math
import requests
import time
from itertools import product
from pathlib import Path
import pandas as pd
from astroquery import nasa_ads as na
from tqdm import tqdm

# helped from https://stackoverflow.com/questions/37573483/progress-bar-while-download-file-over-http-with-requests
def download_pdf(response, fpath):
    total_size = int(response.headers.get('content-length', 0)); 
    block_size = 1024
    wrote = 0 
    with open(fpath, 'wb') as f:
        for data in tqdm(response.iter_content(block_size), total=math.ceil(total_size//block_size), unit='kB', unit_scale=True):
            wrote = wrote + len(data)
            f.write(data)
#     if total_size != 0 and wrote != total_size:
#         print("ERROR, something went wrong")  

def altnames(fullname):
    names = [fullname]
    lastname = fullname.split(', ')[0]
    firstmiddle_names = fullname.split(', ')[-1].split(' ')
    N = len(firstmiddle_names)
    pieces = {'0':firstmiddle_names, '1':[]}  # 0/1 = full/initial
    
    for n in firstmiddle_names:
        pieces['1'].append('{}.'.format(n[0].upper()))
    
    for ind in product('012', repeat=N):
        altname = ''
        for i, case in enumerate(ind):
            if case != '2':
                altname += "{} ".format(pieces[case][i])
        if altname == '':
            continue
        names.append("{}, {}".format(lastname, altname[:-1]))    
    
    return list(set(names))

## Team Member Setting

**Define team members.**

``altnames``에 이니셜 등을 사용하는 경우가 자동으로 저장됩니다. 

* NOTE: Middle name can be either expanded or used as an initial. If the author always use full middle name in publications, it's better to give the full name here.

In [8]:
team = dict(
    names=["Bach, Yoonsoo P.", "Doe, John", "Steve, Parkinson", "First, Lastname Middlename"],
    kornames=["박윤수", "존도", "박인손", "내이름"],
    researcher_number=[1111, 1212, 3333, 4444],  # KRI 연구자등록번호
    altnames=[]  # Don't touch this
)

for name in team["names"]:
    team["altnames"].append(altnames(name))

team_df = pd.DataFrame.from_dict(team)
team_df

Unnamed: 0,names,kornames,researcher_number,altnames
0,"Bach, Yoonsoo P.",박윤수,1111,"[Bach, Yoonsoo, Bach, Y. P., Bach, P., Bach, Y..."
1,"Doe, John",존도,1212,"[Doe, J., Doe, John]"
2,"Steve, Parkinson",박인손,3333,"[Steve, Parkinson, Steve, P.]"
3,"First, Lastname Middlename",내이름,4444,"[First, Lastname Middlename, First, L. M., Fir..."


* **NOTE**: You may make many different such excel/csv/txt files and load them by ``pd.read_csv``, etc.

## Query to ADS

**IMPORTANT**: ADS에서 개인 API토큰을 발급받아야 합니다.

1. Go to [ADS](https://ui.adsabs.harvard.edu/), log in. 
2. Then go to [Account - Settings - API Token](https://ui.adsabs.harvard.edu/user/settings/token). 
3. Generate your token.
4. Copy and paste it to ``na.ADS.TOKEN`` below:

In [10]:
na.ADS.TOKEN = 'YourAPIToken!!!'

# by default, the top 10 records are returned, sorted in
# reverse chronological order. This can be changed

# change the number of rows returned
na.ADS.NROWS = 9999

# change the fields that are returned (enter as strings in a list)
na.ADS.ADS_FIELDS = ["title", "bibcode", "author", "pubdate", "property", "esources",
                     "pub", "issn", "volume", "issue", "page", "doi", "arxiv", "bibstem", "database"]

author = "Ishiguro, Masateru"
year = "2019-2020"
query_str = f'author:"={author}" year:{year}'
print(f"Query with: \n\t {query_str}")
results_raw = na.ADS.query_simple(query_str)

results_raw.sort(['pubdate', "title"])

# flatten the shape to convert to pandas... 
# I currently don't know what bad thing will happen.
# It was OK when I tested for my personal purposes.
for c in results_raw.colnames:
    if len(results_raw[c].shape) > 1:
        results_raw[c] = results_raw[c][:, 0]

results = results_raw.to_pandas()

results["N_author"] = results["author"].str.len()
results["YYYYMM"] = results["pubdate"].str[:-3].str.replace("-", "").astype(int)
results["refereed"] = [True if "REFEREED" in row["property"] else False for i, row in results.iterrows()]
results["astronomy"] = [True if "astronomy" in row["database"] else False for i, row in results.iterrows()]
results["volume"] = [-1 if row["volume"]==[None] else row["volume"] for i, row in results.iterrows()]

results_ref = results[((results["refereed"]==True) 
                      & (results["astronomy"]==True) 
                      & (results["volume"] != -1))]

print(f"ADS contains {len(results)} match with <{author}> (refreed: {len(results_ref)}) in {year}.")
if len(results_ref) > 5:
    print(f"\nHey {author}, you are awesome.")

Query with: 
	 author:"=Ishiguro, Masateru" year:2019-2020
ADS contains 14 match with <Ishiguro, Masateru> (refreed: 7) in 2019-2020.

Hey Ishiguro, Masateru, you are awesome.


In [11]:
df = results_raw.to_pandas()
df

Unnamed: 0,title,bibcode,author,pubdate,property,esources,pub,issn,volume,issue,page,doi,arxiv,bibstem,database
0,Hayabusa2 Mission Up to Now,2019LPI....50.2318T,"[Tsuda, Y., Yoshikawa, M., Watanabe, S., Nakaz...",2019-03-00,"[ESOURCE, TOC, NONARTICLE, NOT REFEREED, PUB_O...",[PUB_PDF],Lunar and Planetary Science Conference,,[None],2132,2318,,,LPI,[astronomy]
1,Hayabusa2's Multiband Disk-Integrated Photomet...,2019LPI....50.1943D,"[Domingue, D. L., Tatsumi, E., Yokota, Y., Sug...",2019-03-00,"[ESOURCE, TOC, NONARTICLE, NOT REFEREED, PUB_O...",[PUB_PDF],Lunar and Planetary Science Conference,,[None],2132,1943,,,LPI,[astronomy]
2,High Porosity Nature of the Top-Shape C-Type A...,2019LPI....50.1265W,"[Watanabe, S., Hirabayashi, M., Hirata, N., Hi...",2019-03-00,"[ESOURCE, TOC, NONARTICLE, NOT REFEREED, PUB_O...",[PUB_PDF],Lunar and Planetary Science Conference,,[None],2132,1265,,,LPI,[astronomy]
3,Rotational and Surface Properties of NEA 3200 ...,2019LPI....50.1497K,"[Kim, M. -J., Lee, H. -J., Kim, D. -H., Yoshid...",2019-03-00,"[ESOURCE, TOC, NONARTICLE, NOT REFEREED, PUB_O...",[PUB_PDF],Lunar and Planetary Science Conference,,[None],2132,1497,,,LPI,[astronomy]
4,Hayabusa2 arrives at the carbonaceous asteroid...,2019Sci...364..268W,"[Watanabe, S., Hirabayashi, M., Hirata, N., Hi...",2019-04-00,"[ESOURCE, ARTICLE, REFEREED]","[PUB_HTML, PUB_PDF]",Science,0036-8075,364,6437,268,10.1126/science.aav8032,,Sci,"[astronomy, physics, general]"
5,Shape and Rotational Motion Models for Tumblin...,2019AJ....157..155U,"[Urakawa, Seitaro, Ohsawa, Ryou, Sako, Shigeyu...",2019-04-00,"[DATA, ESOURCE, ARTICLE, REFEREED, EPRINT_OPEN...","[EPRINT_HTML, EPRINT_PDF, PUB_HTML, PUB_PDF]",The Astronomical Journal,0004-6256,157,4,155,10.3847/1538-3881/ab09f0,,AJ,[astronomy]
6,"The geomorphology, color, and thermal properti...",2019Sci...364..252S,"[Sugita, S., Honda, R., Morota, T., Kameda, S....",2019-04-00,"[ESOURCE, ARTICLE, REFEREED]","[PUB_HTML, PUB_PDF]",Science,0036-8075,364,6437,252,10.1126/science.aaw0422,,Sci,"[astronomy, general]"
7,VizieR Online Data Catalog: Observations of th...,2019yCat..51560223H,"[Hsieh, H. H., Ishiguro, M., Kim, Y., Knight, ...",2019-04-00,"[ASSOCIATED, DATA, NONARTICLE, NOT REFEREED]",[None],VizieR Online Data Catalog,,[None],[None],J/AJ/156/223,,,yCat,[astronomy]
8,The Geometric Albedo of (4179) Toutatis Estima...,2019JKAS...52...71B,"[Bach, Yoonsoo P., Ishiguro, Masateru, Jin, Su...",2019-06-00,"[ESOURCE, ARTICLE, REFEREED, ADS_OPENACCESS, E...","[ADS_PDF, ADS_SCAN, EPRINT_HTML, EPRINT_PDF, P...",Journal of Korean Astronomical Society,1225-4614,52,3,71,10.5303/JKAS.2019.52.3.71,,JKAS,[astronomy]
9,Hayabusa2: Current Summary,2019LPICo2157.6306N,"[Nakamura, T., Watanabe, S., Hirabayashi, M., ...",2019-07-00,"[ESOURCE, TOC, ARTICLE, NOT REFEREED, PUB_OPEN...",[PUB_PDF],82nd Annual Meeting of The Meteoritical Society,0161-5297,82,2157,6306,,,LPICo,[astronomy]


* **NOTE**: If you want to search for your results, change the ``query_str``.
* **NOTE**: See http://adsabs.github.io/help/search/comprehensive-solr-term-list for the complete list of columns.
* ~~**NOTE**: As of 2019-07-02, the ``issn`` is not yet supported from ADS.~~ It seems like it's now supported (2020-04-15)

## Select Rows for This BK Survey
I will select those with ``201908 <= YYYYMM <= 202001``. Also, based on the columns of the Excel file from BK21 office, I will only select the 

1. title
2. journal (full name) ``pub``
3. doi 
3. issn 
4. volume 
5. issue
6. page
7. YYYYMM
8. number of authors 

in this order. Then add the students' names and their corresponding KRI researcher numbers.

It will be saved as ``BKoutput.csv`` and you can open it with Excel, copy-and-paste to the original Excel file.
* **WARNING**: The formatting is crazy in the original Excel from BK (it got better in 2020 but it was horrible in 2019), so you should do it by yourself.

In [12]:
results_ref = results_ref[(results_ref["YYYYMM"] >= 201908) & (results_ref["YYYYMM"] <= 202001)]
results_ref = results_ref[
    ["author", "title", 'doi', "pub", "issn", "volume", "issue", "page", "YYYYMM", "N_author"]
]
results_ref["students"] = ""
results_ref["researcher_number"] = ""


for i, row in results_ref.iterrows():
    students = ""
    researcher_number = ""
    for _, student in team_df.iterrows():
        student_names = student["altnames"]
        for name in student_names:
            if name in row["author"]:
                students += "{},".format(student["kornames"])
                researcher_number += "{},".format(student["researcher_number"])
    results_ref.at[i, "students"] = students[:-1]
    results_ref.at[i, "researcher_number"] = researcher_number[:-1]
    

del results_ref["author"]
results_ref.to_csv("BKoutput.csv", index=False)

In [13]:
results_ref

Unnamed: 0,title,doi,pub,issn,volume,issue,page,YYYYMM,N_author,students,researcher_number
10,Near-infrared polarimetric study of near-Earth...,10.1051/0004-6361/201935542,Astronomy and Astrophysics,0004-6361,629,[None],A121,201909,10,,
11,Q-type asteroids: Possibility of non-fresh wea...,10.1093/pasj/psz088,Publications of the Astronomical Society of Japan,0004-6264,71,5,103,201910,7,,
12,Polarimetric and photometric observations of N...,10.1016/j.pss.2019.104774,Planetary and Space Science,0032-0633,180,[None],104774,202001,9,,


## Download the PDF Files of the Papers
I will use the ADS web link and try
1. to access to the publisher's PDF if available
  - For Science, the publisher's PDF link is not directed to the full pdf, so I added some conditional clause.
2. if unavailable, I tried something
  - Nature, for example, adding ``.pdf`` seem to direct you to the pdf.
  
As time goes, I will add more exceptions so that it works as perfect as possible.

In [8]:
BASE = "https://ui.adsabs.harvard.edu/link_gateway/"
# helped from https://stackoverflow.com/questions/43165341/python3-requests-connectionerror-connection-aborted-oserror104-econnr/43167631
manual = dict(bib=[], pub_html=[])
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

for i, row in results_ref.iterrows():
    bib = row["bibcode"]
    fpath = Path('{}.pdf'.format(bib))
    print(fpath, end=' ')
    
    if fpath.exists():
        print('already exists!'.format(bib))
        continue
        
    if "PUB_PDF" in row["esources"]:
        url = BASE + row["bibcode"] + "/PUB_PDF"
        print('Downloading...'.format(bib), end=' ')

        response = requests.get(url, headers=headers, stream=True)
        
        if "Science" in row["pub"]:
            if response.url.endswith("/tab-pdf"):
                url = response.url.replace("/tab-pdf", ".full.pdf")
            else:
                url = response.url + ".full.pdf"
            response = requests.get(url, headers=headers, stream=True)

        print("\n\t" + response.url)
        time.sleep(1)
        
        download_pdf(response, fpath)

    else:
        try:
            print("trying to find pdf...", end=' ')
            url = BASE + row["bibcode"] + "/PUB_HTML"
            response = requests.get(url, headers=headers, stream=True)
            if "nature.com" in response.url:
                url = response.url + ".pdf"
            else:
                raise ConnectionError()
            response = requests.get(url, headers=headers, stream=True)
            if response.status_code == 404:
                raise ConnectionError()
            print('I found it! Downloading...'.format(bib), end=' ')
            print("\n\t" + response.url)
            time.sleep(1)
        
            download_pdf(response, fpath)
            
        except ConnectionError:            
            print("\n!!! I couldn't find a valid link. Download from below:".format(bib))
            print("\t" + BASE + bib + "/PUB_HTML")
            manual["bib"].append(bib)
            manual["pub_html"].append(BASE + bib + "/PUB_HTML")

2019A&A...629A.121K.pdf already exists!
2019PASJ...71..103H.pdf Downloading... 
	https://watermark.silverchair.com/psz088.pdf?token=AQECAHi208BE49Ooan9kkhW_Ercy7Dm3ZL_9Cf3qfKAc485ysgAAAncwggJzBgkqhkiG9w0BBwagggJkMIICYAIBADCCAlkGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM-CRUZSReuBdSUAoHAgEQgIICKnjGi4lR2u3XZpA0eSdXpoSSKJ-XByaEhCE5FlNodjn-q6Vh1LF3NVfVFhODUjquNkknew9EUo8zHTaDus2ckKBOlwIUNrCYOzaQEjw8XzrvTmE_hdyGuZLmAymbXJ7DHLoqkEIu3-YV3yKqfsPeUa38LRspSqqBoFPNVaOn4hdAqqvOPfwnJxeaaTZSYDNygygnOpA_6SJNvtnhRrLAzIE3MdocprrJS68zE5F9wl0nzjP6x5Eg4dxWpLol32zfAFX-typF5cyUS3lOAL5x9ESNBYrzFszxh1I03OK0iVEk0F3q5lZCSOp7UOtw5qDiB0rTLwYGN6Ln1hB3hptG4IJXCl8P4Txr3R7DqzqpbxGWxmS_9w7C-66xa4_NcysAo74Ir_9ncCAF7GfLz5bP6LFu06GyHTuRH5ugl0iDa01yNhcbzizcVmRDZMbYx4xBiYoHVy5DBS8m3b_TMRxeOi5m-oLNyOrTNU_9Bw5Z-xwnHwL6zB03h9Yn_8zow9ghLHl2LqNgEqR5PElUA5nOY7WcWPz-Uz-l_FJJU24HvIidK28QY4e5FDcvNC30ngtzW0PY5I4ShI1EXGxAKgZJHLRLn8gaWOiVKRxN0VIYM7QPaj3UktUlyOToSVls6ghlOKywjQwJqKge_IyOcwCfV4eVV_geBULCY6Cg2K1PypaEE68SozYsv_xzAPXPwJsRZZzN_qmk

1.59kkB [00:12, 124kB/s]                           


2020P&SS..18004774O.pdf trying to find pdf... 
!!! I couldn't find a valid link. Download from below:
	https://ui.adsabs.harvard.edu/link_gateway/2020P&SS..18004774O/PUB_HTML


* **WARNING**: You may have some papers that are accepted but not on ADS yet. You **MUST** find those by yourself!!!
* **NOTE**: I didn't put much effort to automatize the "paper download link finding" algorithm. But anyway it gives the link to PDF download, it may save a lot of time.

In [9]:
# Papers you have to download manually...
import pandas as pd
pd.DataFrame(manual)

Unnamed: 0,bib,pub_html
0,2020P&SS..18004774O,https://ui.adsabs.harvard.edu/link_gateway/202...
