In [None]:
%matplotlib inline

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'

import glob

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr

from access_biology_data import meta
from access_literature_data import medline
from access_science_shared import standardizer

import sys
sys.path.append('./../src/')
import nar170604f_occurences as nar_attention
import resci_tools as ret

In [None]:
# Output switches: the figure exports further down are guarded by save_images,
# the table exports by save_media.
save_images = True
save_media = True

In [None]:
# Taxon identifier handed to the gene, literature and annotation loaders used
# in this notebook (presumably an NCBI taxonomy ID, 9606 being human -- confirm).
taxon_id = 9606

In [None]:
# Publication-year windows that are compared against each other:
# a "past" block and a "current" block.
first_year_of_past_block = 1
first_year_of_current_block = 2011
# The past block ends in the year right before the current block starts.
last_year_of_past_block = first_year_of_current_block - 1
last_year_of_current_block = 2015

In [None]:
# Reference gene set for the taxon, and the gene-to-publication links requested
# for those genes with paper_kind='research'.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')
ref_gene2pubmed = medline.gene2pubmed(taxon_id, paper_kind='research',ref_genes=ref_genes)

# Counts over all available links (no year filter applied here).
# NOTE(review): `papers` is not read again in the visible part of this
# notebook -- confirm whether it is still needed.
papers = nar_attention.count_papers_and_attention(ref_genes, ref_gene2pubmed)

In [None]:
# Publication metadata (one of the requested columns is the publication year),
# used below to assign papers and GeneRIF statements to the year blocks.
# NOTE(review): only pubmed_id and pubdate_year are read later in the visible
# part of this notebook.
df_m = medline.select_medline_wos_records(
    columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors,
            medline.j_name_s''',
    years_range='all',
    taxon_id=taxon_id,
    kind='research',
    unambiguous=True)

In [None]:
# Papers per gene within the "past" block. np.arange excludes its stop value,
# so +1 keeps last_year_of_past_block inside the window.
past_years = np.arange(first_year_of_past_block, last_year_of_past_block + 1)
past_pubmed_ids = df_m.loc[df_m['pubdate_year'].isin(past_years), 'pubmed_id']
past_links = ref_gene2pubmed[ref_gene2pubmed['pubmed_id'].isin(past_pubmed_ids)]

past_counts = nar_attention.count_papers_and_attention(ref_genes, past_links)
past_block = past_counts[['papers']].rename(columns={'papers': 'past'})

In [None]:
# Papers per gene within the "current" block.
# np.arange excludes its stop value, so the upper bound needs +1 to keep
# last_year_of_current_block itself inside the window -- the same convention
# as used for the past block and for the linguistic analysis further down.
current_years = np.arange(
    first_year_of_current_block, last_year_of_current_block + 1)
current_pubmed_ids = df_m.loc[
    df_m['pubdate_year'].isin(current_years), 'pubmed_id']
current_links = ref_gene2pubmed[
    ref_gene2pubmed['pubmed_id'].isin(current_pubmed_ids)]

current_block = nar_attention.count_papers_and_attention(
    ref_genes,
    current_links)[['papers']].rename(columns={'papers': 'current'})

In [None]:
# Side-by-side table of past and current paper counts per gene.
master = pd.concat([past_block, current_block], axis=1)

total_past = master['past'].sum()
total_current = master['current'].sum()

# Rescale each block to the total of the other one so that both are comparable.
master['current_norm'] = master['current'] / total_current * total_past
master['past_norm'] = master['past'] / total_past * total_current

# log2 of current counts over the rescaled past counts.
master['ratio'] = np.log2(master['current'] / master['past_norm'])

In [None]:
# Rank correlation between the raw past and current counts; the first element
# of the spearmanr result is reported.
spearman_rho = spearmanr(master['past'], master['current'])[0]
print('The spearman correlation of past and current is:', spearman_rho)

In [None]:
# Work on a copy so that the zero replacement and log columns added below
# do not alter `master`.
formatted_master = master.copy()

In [None]:
# Replace exact zeros by a small constant in every count column, so that the
# log10 transform applied in the next step stays finite.
value_for_zero_occurences = 0.3
v = ['current_norm', 'past_norm', 'current', 'past']
for column in v:
    is_zero = formatted_master[column].eq(0)
    formatted_master.loc[is_zero, column] = value_for_zero_occurences

In [None]:
# Add a log10-transformed companion column ('log_<name>') for every count column.
v = ['current_norm', 'past_norm', 'current', 'past']
for c in v:
    n = 'log_{}'.format(c)
    # np.log10 is a ufunc: apply it to the whole column at once instead of
    # calling it element by element through Series.apply.
    formatted_master[n] = np.log10(formatted_master[c])

In [None]:
# Mean of the two log10 values, i.e. the log10 of the geometric mean of the
# normalised past count and the current count.
formatted_master['geometric_mean'] = formatted_master['log_past_norm'].add(
    formatted_master['log_current']).div(2)

In [None]:
# Euclidean distance between the point (log_past_norm, log_current) and the
# point (geometric_mean, geometric_mean) on the diagonal.
delta_current = formatted_master['log_current'] - formatted_master['geometric_mean']
delta_past = formatted_master['log_past_norm'] - formatted_master['geometric_mean']
formatted_master['offset_to_geometric'] = np.sqrt(delta_current**2 + delta_past**2)

In [None]:
# Sliding-window spread of the offsets: for every centre on a 0.1 grid, take
# all genes whose geometric_mean lies strictly within +/- window_size of the
# centre and store the standard deviation of their offsets.
window_size = 0.3
std_in_bin = pd.DataFrame(index=np.arange(0.2, 4, 0.1))
for center in std_in_bin.index:
    in_window = (
        (formatted_master['geometric_mean'] > (center - window_size))
        & (formatted_master['geometric_mean'] < (center + window_size)))

    offsets_in_window = formatted_master.loc[in_window, 'offset_to_geometric']
    std_in_bin.loc[center, 'smoothened_std'] = np.std(offsets_in_window)
    std_in_bin.loc[center, 'center'] = center

In [None]:
# Per-axis step: moving by (-shift, +shift) away from a point on the diagonal
# covers a distance of shift * sqrt(2), i.e. exactly one smoothened_std.
std_in_bin['shift'] = std_in_bin['smoothened_std'] / np.sqrt(2)

In [None]:
# Coordinates of the upper (u_*) and lower (l_*) boundary curves, placed
# num_sigma shifts away from the diagonal point (center, center).
num_sigma = 3
band = num_sigma * std_in_bin['shift']

std_in_bin['u_x'] = std_in_bin['center'] - band
std_in_bin['u_y'] = std_in_bin['center'] + band
std_in_bin['l_x'] = std_in_bin['center'] + band
std_in_bin['l_y'] = std_in_bin['center'] - band

# Drop rows with missing values (e.g. windows for which no spread is available).
std_in_bin = std_in_bin.dropna()

In [None]:
# Scatter of log10 normalised past counts against log10 current counts,
# without a regression fit.
sns.lmplot(
    data=formatted_master,
    x='log_past_norm',
    y='log_current',
    fit_reg=False,
    scatter_kws={"s": 1})

# Overlay the upper and the lower boundary curve as black lines.
for x_column, y_column in (('u_x', 'u_y'), ('l_x', 'l_y')):
    plt.plot(std_in_bin[x_column], std_in_bin[y_column], 'k-')

if save_images:
    ret.export_image('180103_comparision_contemporary_vs_past/unannotated_scatter.pdf')

In [None]:
# Gene symbols indexed by gene identifier; verify_integrity makes set_index
# fail on duplicated identifiers.
gi = meta.gene_info(
    taxon_id,
    usecols=['gene_ncbi', 'symbol_ncbi'],
).set_index('gene_ncbi', verify_integrity=True)

s = 8  # font size of the gene labels
x_l = 'log_past_norm'
y_l = 'log_current'

z = formatted_master[x_l].values
y = formatted_master[y_l].values
n = gi.loc[formatted_master.index, 'symbol_ncbi'].values

# Large canvas so that the per-gene labels remain legible.
fig, ax = plt.subplots(figsize=(20, 20))
ax.scatter(z, y, edgecolors='k', s=1)
ax.set_xlabel(x_l)
ax.set_ylabel(y_l)

# Label every point with its gene symbol, nudged slightly off the marker.
for symbol, x_pos, y_pos in zip(n, z, y):
    ax.annotate(
        symbol,
        xy=(x_pos, y_pos),
        xytext=(x_pos + 0.001, y_pos + 0.001),
        fontsize=s)

ax.set_title('first year of current: {}'.format(first_year_of_current_block))

if save_images:
    ret.export_image('180103_comparision_contemporary_vs_past/annotated_scatter.pdf')

In [None]:
from access_biology_data import annotation
from nltk.corpus import stopwords
from access_biology_data import meta

# get gene RIFs
gene_rif = annotation.generif(taxon_id)

# Normalise the statement text: lower-case it, then remove the
# "(huge navigator)" tag and every period.
# Both patterns are plain text, so they are passed as literals with
# regex=False. This gives the same result regardless of the pandas default for
# `regex` and avoids invalid escape sequences such as '\(' in normal strings.
clean_text = gene_rif['GeneRIF text'].str.lower()
clean_text = clean_text.str.replace('(huge navigator)', '', regex=False)
clean_text = clean_text.str.replace('.', '', regex=False)
gene_rif['clean'] = clean_text

gene_rif = gene_rif.drop_duplicates(['PubMed ID (PMID) list', 'gene_ncbi', 'clean'])

# Trailing blank: the statements are later concatenated with ''.join, and the
# blank keeps the last word of one statement apart from the first of the next.
gene_rif['clean'] = gene_rif['clean'] + ' '

gene_rif = gene_rif.rename(columns={
    'PubMed ID (PMID) list': 'pubmed_id'
})
gene_rif = gene_rif[['gene_ncbi', 'pubmed_id', 'clean']]
# presumably cast to float to match the dtype of df_m['pubmed_id'] in the
# merge that follows -- confirm
gene_rif['pubmed_id'] = gene_rif['pubmed_id'].astype(float)
gene_rif = gene_rif.drop_duplicates()

In [None]:
# Attach the publication year to every statement; the left join keeps
# statements whose publication has no record in df_m (year stays missing).
publication_years = df_m[['pubmed_id', 'pubdate_year']]
gene_rif = gene_rif.merge(publication_years, how='left')

In [None]:
# Drop statements with missing values (e.g. no year could be allocated) and
# keep only statements about reference genes.
gene_rif = (
    gene_rif
    .dropna()
    .loc[lambda frame: frame['gene_ncbi'].isin(ref_genes)]
)

In [None]:
def remove_char(t):
    """Strip digits, punctuation, quotes and blanks from both ends of each token.

    Every token is stripped twice: first with a character set that contains
    the double quote, then with one that contains the single quote. A single
    call can therefore leave a double quote at the end of a token when it was
    hidden behind a single quote; see multi_remove_char for the repeated form.

    Parameters
    ----------
    t : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        Cleaned tokens, in input order (tokens may become empty strings).
    """
    stripped = []
    for token in t:
        token = token.strip('0123456789/-[]{}+-&*,()%;:><" ')
        token = token.strip("0123456789/-[]{}+-&*,()%;:><' ")
        stripped.append(token)
    return stripped


def multi_remove_char(t):
    """Apply remove_char repeatedly until the tokens no longer change.

    Replaces a fixed number of copy-pasted passes: iterating to the fixed
    point yields the same result whenever that fixed number was sufficient,
    and additionally handles tokens that need more passes (deeply nested
    alternating quotes). The loop terminates because stripping can only
    shorten tokens.

    Parameters
    ----------
    t : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        Fully stripped tokens, in input order.
    """
    current = list(t)
    while True:
        cleaned = remove_char(current)
        if cleaned == current:
            return cleaned
        current = cleaned


# define stopwords
stops = stopwords.words('english')  # standard stop words
gi = meta.gene_info(taxon_id)
gene_symbols = list(gi['symbol_ncbi'].str.lower().values)
# Raw string: '\|' in a normal string literal is an invalid escape sequence;
# the value passed on (backslash followed by a pipe) is unchanged.
gene_alternative_symbols = list(ret.split_text_to_multiple_rows(
    gi[['gene_ncbi', 'Synonyms']],
    column='Synonyms',
    separator=r'\|')['Synonyms'].str.lower().unique())
numbers = [str(x) for x in np.arange(1, 10000)]
# Gene descriptions that consist of a single word (contain no blank).
# NOTE(review): unlike the symbols above these are not lower-cased, while the
# analysed text is -- confirm this is intended.
continuous_gene_names = list(gi.loc[~gi['description'].str.contains(' '), 'description'].unique())
alphabet = list('abcdefghijklmnopqrstuvwxyz')
special_characters = ['/']
# NOTE(review): further_forbidden is defined but never added to
# forbidden_names below -- confirm whether number words should be excluded.
further_forbidden = [
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'
]
forbidden_names = (
    stops
    + gene_symbols
    + gene_alternative_symbols
    + numbers
    + continuous_gene_names
    + alphabet
    + special_characters
)

# Also forbid the stripped form of every entry, since the analysed words are
# passed through the same stripping before they are compared.
shortened_forbidden = multi_remove_char(forbidden_names)
forbidden_names = forbidden_names + shortened_forbidden

In [None]:
# Years of the current block, both bounds included.
years_to_consider_for_linguistic_analysis = np.arange(
    first_year_of_current_block, last_year_of_current_block + 1)

# Statements published in those years, combined with the per-gene counts
# (pandas joins on the columns both frames share).
recent_gene_rif = gene_rif[
    gene_rif['pubdate_year'].isin(years_to_consider_for_linguistic_analysis)]
linguist = master.reset_index().merge(recent_gene_rif)


In [None]:
# A gene counts as "transitioned" when its current paper count exceeds its
# past count rescaled to the current total (past_norm).
linguist['has_transitioned'] = linguist['current'] > linguist['past_norm']

In [None]:
def get_words(in_series, label):
    """Relative word frequencies over all statements in in_series.

    The statements are concatenated, split on whitespace, stripped with
    multi_remove_char and reduced to tokens of at least four characters.
    Tokens listed in forbidden_names are discarded, and the remaining counts
    are divided by their sum.

    Parameters
    ----------
    in_series : pandas.Series of str
        Statements; they are joined without a separator, so each one is
        expected to end in whitespace.
    label : str
        Name of the single output column, e.g. 'transitioned'.

    Returns
    -------
    pandas.DataFrame
        One row per word (as index), one column `label` holding the share of
        that word among all retained words. Empty if nothing is retained.
    """
    all_text = ''.join(list(in_series.values))
    tokens = multi_remove_char(all_text.split())

    # A plain list filter (instead of boolean masks on a numpy array) also
    # works when there are no tokens at all; the length check already removes
    # tokens that were stripped down to the empty string.
    tokens = [token for token in tokens if len(token) >= 4]

    # Explicit dtype keeps an empty token list from being read as numeric.
    counts = pd.Series(tokens, dtype=object).value_counts().to_frame(label)
    is_forbidden = counts.index.isin(forbidden_names)
    words = counts.loc[~is_forbidden, :]
    word_frequency = words / words.sum()
    return word_frequency

In [None]:
# Word frequencies for statements about transitioned genes and about the rest.
transitioned_rows = linguist['has_transitioned']
words_transitioned = get_words(
    linguist.loc[transitioned_rows, 'clean'], 'transitioned')
words_not_transitioned = get_words(
    linguist.loc[~transitioned_rows, 'clean'], 'not_transitioned')

# Join the two tables on their shared column(s).
# NOTE(review): merge defaults to an inner join, so words present in only one
# group are dropped and the fillna(0) has nothing to fill -- confirm whether an
# outer join was intended.
comb = words_transitioned.reset_index().merge(
    words_not_transitioned.reset_index()).fillna(0)

# log2 ratio of the two relative frequencies.
comb['ratio'] = np.log2(comb['transitioned'] / comb['not_transitioned'])

In [None]:
# Keep words making up more than 0.1 % of the transitioned vocabulary, order
# them by descending log2 ratio and give the columns their export names.
is_frequent = comb['transitioned'] > 0.001
export_names = {'index': 'word', 'ratio': 'fold enrichment'}
out_table = (
    comb[is_frequent]
    .sort_values('ratio', ascending=False)
    .rename(columns=export_names)
)

In [None]:
# Preview: first rows of the word table (sorted by descending ratio).
out_table.head()

In [None]:
# Export the word table only when table export is enabled via save_media.
if save_media:
    ret.export_full_frame(
        '180103_comparision_contemporary_vs_past/enrichment_words.xlsx',
        out_table)

In [None]:
# Column labels are derived from the configured year blocks so that they stay
# correct when the blocks are changed (2011/2015/2010 give the previous
# hard-coded labels).
current_label = '{}-{}'.format(
    first_year_of_current_block, last_year_of_current_block)
past_label = 'till {} (normalized)'.format(last_year_of_past_block)

# Per-gene table sorted by descending log2 ratio, with gene symbols attached.
# NOTE(review): the 'fold change' column holds the log2 ratio from `master`.
# `gi` must still carry 'gene_ncbi' as a column here, i.e. be the full
# gene_info frame rather than the symbol lookup indexed by gene_ncbi.
genes_by_ratio = (
    master[['past_norm', 'current', 'ratio']]
    .sort_values('ratio', ascending=False)
    .rename(columns={
        'current': current_label,
        'past_norm': past_label,
        'ratio': 'fold change'
    })
    .reset_index()
)
enrichment_to_export = pd.merge(
    genes_by_ratio,
    gi[['gene_ncbi', 'symbol_ncbi']])

In [None]:
# Preview: first rows of the per-gene table.
enrichment_to_export.head()

In [None]:
# Export the per-gene table only when table export is enabled via save_media.
if save_media:
    ret.export_full_frame(
        '180103_comparision_contemporary_vs_past/enrichment_genes.xlsx',
        enrichment_to_export)