In [1]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

import re

# from access_biology_data import annotation
# from access_biology_data import meta

from access_literature_data import medline, wos

# MedLine and Web of Science

Load MedLine and Web of Science data. Note that this might take a few minutes the first time it is run.

In [2]:
# Load one row per publication with the MedLine columns named in `columns_sql`
# plus the matching Web of Science identifier (`wos_id`).
# NOTE(review): the preview of `medline_wos` further down also shows per-year
# `citation_year_<YYYY>` columns, so the loader evidently appends those itself —
# confirm against access_literature_data.medline.
medline_wos = medline.select_medline_wos_records(
    columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors,
            medline.j_name_s,
            ut2pmid.ut AS wos_id''',
    years_range='all',      # load all years
    taxon_id=9606,          # only look at human publications (taxon_id for human is 9606)
    kind='research',        # only look at research publications (not reviews etc.)
    unambiguous=True,       # only load data that can be mapped unambiguously between Web of Science and MedLine
)

In [3]:
medline_wos.head()

Unnamed: 0,pubmed_id,pubdate_year,amount_of_authors,j_name_s,wos_id,index,ut,citation_year_1933,citation_year_1934,citation_year_1935,...,citation_year_2008,citation_year_2009,citation_year_2010,citation_year_2011,citation_year_2012,citation_year_2013,citation_year_2014,citation_year_2015,citation_year_2016,citation_year_2017
0,3,1975,2.0,Biochem. Biophys. Res. Commun.,A1975AT31100030,1,A1975AT31100030,0,0,0,...,0,0,0,0,0,1,1,1,2,0
1,82,1975,2.0,Biochim. Biophys. Acta,A1975AX62800004,3,A1975AX62800004,0,0,0,...,0,1,0,0,1,1,0,0,0,0
2,154,1975,2.0,Circ. Res.,A1975AW03800008,5,A1975AW03800008,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,454,1975,2.0,J. Exp. Zool.,A1975AW34100007,10,A1975AW34100007,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2009,1976,2.0,Am. J. Hum. Genet.,A1976BC88800002,18,A1976BC88800002,0,0,0,...,0,0,1,0,0,1,2,1,0,0


Load disambiguated authorship information

# Disambiguated Authorship Data

In [4]:
# Load disambiguated author records (one row per author position on a paper,
# judging by the `dais_id`, `wos_id`, `position` columns in the preview).
# `allowed_wos` is fed the WoS identifiers of the publications loaded into
# `medline_wos`; presumably this restricts the result to those papers.
# NOTE(review): the exact meaning of subset='gene-linked' and allowed_dais='all'
# is defined in access_literature_data.wos, which is not shown here — confirm.
dais = wos.dais(
    subset='gene-linked',
    allowed_dais='all',
    allowed_wos=medline_wos.loc[:, 'wos_id'])

In [5]:
dais.head()

Unnamed: 0,dais_id,wos_id,position
7,14792572,A1946XY13000001,1
27,1873989,000387159500010,1
28,2006130956,000387159500010,2
29,2006308348,000387159500010,3
30,2006150243,000387159500010,4


# Citations

Support function (copied from `resci` in Thomas' repository)

In [6]:
def count_citations(
        df_query, earliest_year_after_publication=1, years_to_include=3,
        last_complete_year=2015):

    """
    Counts the citations a publication received within a fixed window of
    calendar years following its year of publication.

    Input:
        df_query    DataFrame, containing 'pubdate_year' and one
                        'citation_year_<YYYY>' column per citation year
        earliest_year_after_publication     int First year after publication
                                            that will be included in counting
                                            citations
        years_to_include    int  Amount of years that shall be included
                                    when counting citations (first year will
                                    be the one indicated by
                                    earliest_year_after_publication);
                                    must be positive
        last_complete_year  int  Last citation year that is considered
                                    fully indexed. Defaults to 2015, because
                                    2016 did not appear to be fully indexed
                                    in WoS when this was written.

    Output:
        df_citations_over_span  DataFrame with the index of df_query and a
                                    single float column 'citations'. The
                                    value is NaN whenever the requested
                                    window cannot be evaluated in full:
                                    - the publication year has no matching
                                      'citation_year_' column,
                                    - a year of the window has no
                                      'citation_year_' column,
                                    - the window ends after
                                      last_complete_year, or
                                    - the window ends after the latest
                                      publication year present in df_query.

    Raises:
        ValueError  if years_to_include is not positive
    """

    if years_to_include <= 0:
        raise ValueError(
            'years_to_include must be positive, got {}'.format(
                years_to_include))

    lead_pattern = 'citation_year_'
    citation_columns = [
        c for c in df_query.columns
        if isinstance(c, str) and c.startswith(lead_pattern)]

    # Work on a copy whose columns are labelled by the integer citation year,
    # so that windows can be selected by plain year arithmetic.
    df_citations = df_query.loc[:, citation_columns].copy()
    df_citations.columns = [
        int(c[len(lead_pattern):]) for c in citation_columns]
    all_available_years = set(df_citations.columns)

    pubdate_year = df_query['pubdate_year']
    # Loop-invariant: the most recent publication year in the data set.
    latest_pubdate_year = pubdate_year.max()

    # Positional result buffer; rows that are never filled in stay NaN.
    citations = np.full(len(df_query), np.nan)

    for y in df_citations.columns:
        earliest_year = y + earliest_year_after_publication
        years_to_query = np.arange(
            earliest_year, earliest_year + years_to_include).tolist()
        last_year = years_to_query[-1]

        # Require every year of the window to be present. Summing a
        # truncated window would silently under-count citations, and an
        # empty window used to crash on max([]).
        if not all_available_years.issuperset(years_to_query):
            continue
        if last_year > last_complete_year:
            continue
        # Written as "not <=" so that a NaN maximum (no publication years
        # at all) also skips the window.
        if not last_year <= latest_pubdate_year:
            continue

        f = (pubdate_year == y).values
        if not f.any():
            continue

        citations[f] = df_citations.loc[f, years_to_query].sum(axis=1).values

    return pd.DataFrame({'citations': citations}, index=df_query.index)

In [7]:
# Citations received in the three calendar years following the year of
# publication (publication year +1 to +3). Publications whose window extends
# past the last completely indexed year come back as NaN.
df_citations = count_citations(
        medline_wos, earliest_year_after_publication=1, years_to_include=3)

MedLine and Web of Science records, with citation counts

In [8]:
# Attach the citation counts to the publication metadata. `df_citations` was
# built on the row index of `medline_wos`, so the two frames are aligned
# index-on-index; the inner join keeps only rows present in both.
df_medline_wos_citations = medline_wos.loc[
    :, ['pubmed_id', 'wos_id', 'pubdate_year', 'j_name_s', 'amount_of_authors']
].merge(
    df_citations,
    how='inner',
    left_index=True,
    right_index=True,
)

In [9]:
df_medline_wos_citations.head()

Unnamed: 0,pubmed_id,wos_id,pubdate_year,j_name_s,amount_of_authors,citations
0,3,A1975AT31100030,1975,Biochem. Biophys. Res. Commun.,2.0,3.0
1,82,A1975AX62800004,1975,Biochim. Biophys. Acta,2.0,7.0
2,154,A1975AW03800008,1975,Circ. Res.,2.0,2.0
3,454,A1975AW34100007,1975,J. Exp. Zool.,2.0,3.0
4,2009,A1976BC88800002,1976,Am. J. Hum. Genet.,2.0,5.0
