In [None]:
from my_scientific_profile.orcid.works import get_put_code_to_doi_map
from my_scientific_profile.crossref.works import get_crossref_work_by_doi
import pandas as pd
from dataclasses import asdict
from fuzzywuzzy.fuzz import ratio

## Query and save papers

- Use ORCID to get the list of DOIs
- Use CrossRef to get paper information for each DOI
- Use Semantic Scholar on each DOI to try to get a TL;DR summary
- Use Unpaywall to get Open Access info / preprint
- Use doi2bib to get the BibTeX entry for each DOI

Fields to keep:
- Title
- Journal name
- Year
- DOI
- Authors
- BibTeX entry
- (preprint)
- (TLDR)



In [None]:
# Collect the unique DOIs returned by the ORCID put-code -> DOI map.
# Sorting gives a stable order: iterating a bare set of strings can differ between
# kernel restarts (string hash randomization), which would make `dois[0]` and
# `dois[:2]` in the cells below pick different papers on each run.
# NOTE(review): assumes every value in the map is a string DOI (no None) — confirm.
dois = sorted(set(get_put_code_to_doi_map().values()))

In [None]:
# Look up the first DOI on CrossRef and display the `message` attribute of the result.
work = get_crossref_work_by_doi(dois[0])
work.message

In [None]:
# Preview: one flattened row per CrossRef record, for the first two DOIs only.
pd.json_normalize(
    [
        asdict(crossref_work.message)
        for crossref_work in map(get_crossref_work_by_doi, dois[:2])
    ]
)

In [None]:
# for doi in dois:
#     for author in get_crossref_work_by_doi(doi).message.author:
#         # print(author)

In [None]:
# Gather every author entry of every work into one flat list
# (an author appears once per paper they are listed on).
authors = [
    person
    for crossref_work in map(get_crossref_work_by_doi, dois)
    for person in crossref_work.message.author
]
authors

In [None]:
# Turn the author dataclasses into a flat table, ordered by family name.
df = pd.json_normalize(list(map(asdict, authors))).sort_values(by="family")
df

In [None]:
# Collapse repeated (given, family) pairs to a single row each.
# `first()` keeps the first non-null ORCID seen for that name, if any.
df_sub = (
    df[["given", "family", "orcid"]]
    .groupby(["given", "family"])
    .first()
    .sort_values(by="family")
)
df_sub

In [None]:
# Authors for whom no ORCID was found, with a combined "given family" name column.
df_wo_orcid = (
    df_sub[df_sub["orcid"].isna()]
    .reset_index()
    .sort_values(by="family")
    .assign(full_name=lambda frame: frame["given"] + " " + frame["family"])
)
df_wo_orcid

In [None]:
# Authors that do have an ORCID, with a combined "given family" name column.
df_w_orcid = (
    df_sub[df_sub["orcid"].notna()]
    .reset_index()
    .sort_values(by="family")
    .assign(full_name=lambda frame: frame["given"] + " " + frame["family"])
)
df_w_orcid

In [None]:
# Fuzzy-match scores for a few spelling variants of the same author name,
# to get a feel for how `ratio` treats initials, punctuation and typos.
tuple(
    ratio(left, right)
    for left, right in (
        ("J. F. Rudzinski", "Joseph F. Ruszinski"),
        ("Joseph F Rudzinski", "Joseph F. Ruszinski"),
        ("J Rudzinski", "Joseph Rudzinski"),
    )
)

In [None]:
# One row per (given, family) pair, without the bookkeeping columns.
# NOTE(review): `head(60)` runs before the sort, so this shows the first 60 groups
# in groupby key order, re-sorted by family name — confirm that is intended
# rather than sort-then-head.
(
    df.groupby(["given", "family"])
    .first()
    .drop(columns=["sequence", "authenticated_orcid"])
    .head(60)
    .sort_values(by="family")
)

In [None]:
# Compact per-paper table for the first two DOIs: identifier, title, journal, citation count.
# Note: this rebinds `df`, which held the author table in the cells above.
df = pd.json_normalize(
    [
        asdict(crossref_work.message)
        for crossref_work in map(get_crossref_work_by_doi, dois[:2])
    ]
)[["doi", "title", "short_container_title", "is_referenced_by_count"]]
df

In [None]:
# Flatten the authors of the first ten works into one table, sorted by family name.
# Uses `get_crossref_work_by_doi`, the lookup helper this notebook imports; the
# previous `get_work_by_doi` is never defined here and raised NameError on a fresh kernel.
pd.json_normalize(
    [
        asdict(author)
        for doi in dois[:10]
        for author in get_crossref_work_by_doi(doi).message.author
    ]
).sort_values("family")