# scoring

In [None]:
import pandas as pd
from time import time, sleep
from typing import Dict, List, Set
from tqdm.notebook import tqdm
from scholar import S2Api, AuthorRelated, CitationRelevance

In [None]:
# Column layout of the score table: one row per (citing paper, cited paper) pair.
COLUMNS = (
    # Basic information about the citing paper
    ["paper_id", "title", "year", "abstract", "contexts"]
    # Basic information about the cited paper
    + ["cited_id", "cited_title", "cited_year", "cited_abstract"]
    # Computed scores
    + [
        "relevance_cosine",
        "relevance_cross",
        "relevance_cross_abstract",
        "author_independence",
    ]
)

# Helpers from the project-local `scholar` module.
api = S2Api()  # fetches paper details and citation lists
related = AuthorRelated()  # computes the author distance score
relevance = CitationRelevance()  # computes relevance scores against an abstract

# Accumulator for every scored citation; starts empty with the layout above.
scoredf = pd.DataFrame(columns=COLUMNS)




In [None]:
# Load the sample spreadsheet; its 'paperId' column supplies the IDs of the
# cited papers that the scoring loop processes.
sample = pd.read_excel('sample.xlsx')

In [None]:
# Score every citation of each sampled paper and checkpoint the accumulated
# table to ./score.xlsx after each cited paper.
#
# `prearr` is consumed as a queue: an ID is removed only after its paper has
# been handled, so if the loop is interrupted `prearr` still starts with the
# paper that was in progress.
prearr = sample['paperId'].tolist()


def _score_citation(citing: Dict, contexts: List, cited: Dict, abstract: str) -> Dict:
    """Build one score-table row for a single citing paper.

    Args:
        citing: Record of the citing paper (the citation's 'citingPaper').
        contexts: Non-empty citation contexts taken from the citation record.
        cited: Record of the cited paper.
        abstract: Non-empty abstract of the cited paper.

    Returns:
        A dict keyed by the names in COLUMNS.
    """
    # A record may carry the 'abstract' key with a None value (presumably
    # when the API has no abstract); fall back to '' in that case as well as
    # when the key is missing.
    citing_abstract = citing.get('abstract') or ''
    cosine = relevance.compute_relevance(contexts, abstract, 'cosine')
    cross = relevance.compute_relevance(contexts, abstract)
    cross_abstract = relevance.compute_relevance(citing_abstract, abstract)
    relatedness = related.compute_author_distance_score(citing, cited)
    return {
        "paper_id": citing.get('paperId'),
        "title": citing.get('title'),
        "year": citing.get('year'),
        "abstract": citing.get('abstract'),
        "contexts": contexts,
        "cited_id": cited.get('paperId'),
        "cited_title": cited.get('title'),
        "cited_year": cited.get('year'),
        "cited_abstract": abstract,
        "relevance_cosine": cosine,
        "relevance_cross": cross,
        "relevance_cross_abstract": cross_abstract,
        "author_independence": relatedness,
    }


# The context managers close both progress bars even if a call raises.
with tqdm(total=len(prearr), desc="Processing cited") as cited_bar:
    while prearr:
        paper_id = prearr[0]
        # Normalise a missing paper to {} so the lookups below cannot fail
        # with AttributeError before the abstract check gets a chance to skip it.
        cited = api.paper_detail(paper_id) or {}
        cited_bar.set_postfix({"cited": cited.get('title')})
        cited_bar.update()
        abstract = cited.get('abstract') or ''
        if abstract:
            citations = api.paper_citations(paper_id) or []
            # Collect rows in a plain list and build the frame once, instead
            # of growing a DataFrame one row at a time.
            rows = []
            with tqdm(
                total=len(citations),
                desc=f"cited: {cited.get('title')}",
                leave=False,
            ) as citing_bar:
                for citation in citations:
                    citing = citation.get('citingPaper') or {}
                    contexts = citation.get('contexts') or []
                    citing_bar.set_postfix({"citing": citing.get('title')})
                    if contexts:
                        rows.append(_score_citation(citing, contexts, cited, abstract))
                        # Pause only after a citation was actually scored; a
                        # skipped citation triggers no scoring calls, so there
                        # is nothing to space out (assumes the pause exists to
                        # throttle those calls -- TODO confirm).
                        sleep(2)
                    citing_bar.update()
            if rows:
                # ignore_index keeps row labels unique across cited papers
                # rather than restarting at 0 for every batch.
                scoredf = pd.concat(
                    [scoredf, pd.DataFrame(rows, columns=COLUMNS)],
                    ignore_index=True,
                )
        prearr.pop(0)
        # Checkpoint after every cited paper so partial results survive a crash.
        scoredf.to_excel('./score.xlsx')
        sleep(2)