In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import natsort
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr


from access_biology_data import meta
from access_economic_data import nih
from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
sys.path.append('./../src/')
import nar170604f_occurences as nar
import ana170508f_human_citations as ana
import resci_inout as inout
import resci_tools as rst

import resci_tools as ret

In [None]:
# Stretch the notebook's `.container` element to 100% width by injecting CSS.
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Taxon passed to every data-access call below
# (9606 should be Homo sapiens in NCBI Taxonomy — confirm).
taxon_id = 9606

In [None]:
# Export switches: figures are written through ret.export_image only when
# save_images is True, tables through ret.export_full_frame only when
# save_tables is True.
save_images = False
save_tables = True

In [None]:
# Reference gene set for the taxon ('rpo' selects the reference definition;
# its meaning is defined in access_science_shared.standardizer), and the
# gene <-> PubMed links requested with paper_kind='research' for those genes.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
gene2pubmed = medline.gene2pubmed(
    taxon_id=taxon_id, paper_kind='research', ref_genes=ref_genes)

In [None]:
# Fetch PubMed ID and publication year of MEDLINE records (kind='research').
df_m = medline.select_medline_records(
    columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year
           ''',
    taxon_id=taxon_id,
    kind='research')

# Keep only papers that are linked to at least one gene, one row per paper/year.
columns_to_use = ['pubmed_id', 'pubdate_year']
is_gene_linked = df_m['pubmed_id'].isin(gene2pubmed['pubmed_id'])
df_m = df_m.loc[is_gene_linked, columns_to_use].drop_duplicates()

# df_m = df_m[df_m['authors']>0]   #                      < ---- have to check with other citation analysis

# Number of gene links per paper, attached as column 'genes'.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes')
df_m = df_m.merge(
    genes_per_paper, left_on='pubmed_id', right_index=True, how='inner')


In [None]:
# One row per (paper, gene) with the paper's year and its number of linked genes.
d = pd.merge(
    df_m[['pubmed_id', 'pubdate_year', 'genes']],
    gene2pubmed[['gene_ncbi', 'pubmed_id']])

# Papers linked to exactly one gene.
is_single_gene_paper = d['genes'] == 1

# Earliest publication year per gene over all papers. `.min()` is used instead
# of `.agg(min)`: passing the builtin triggers a FutureWarning in recent pandas
# while computing the same group-wise minimum.
earliest_any = (
    d.loc[:, ['gene_ncbi', 'pubdate_year']]
    .groupby('gene_ncbi')
    .min()
    .reset_index()
    .rename(columns={'pubdate_year': 'first_year'}))

# Earliest publication year per gene, restricted to single-gene papers.
earliest_solo = (
    d.loc[is_single_gene_paper, ['gene_ncbi', 'pubdate_year']]
    .groupby('gene_ncbi')
    .min()
    .reset_index()
    .rename(columns={'pubdate_year': 'first_solo_year'}))

# Outer join: genes without any single-gene paper keep NaN in 'first_solo_year'.
genes_earliest_years = pd.merge(
    earliest_any,
    earliest_solo,
    on='gene_ncbi',
    how='outer')

In [None]:
# Gene annotation for the taxon, restricted to the reference gene set.
gene_info = meta.gene_info(taxon_id=taxon_id)
in_reference_set = gene_info['gene_ncbi'].isin(ref_genes)
gene_info = gene_info.loc[in_reference_set, :]

In [None]:
# Independent copy of the gene ID plus the three symbol-bearing columns;
# gene_info itself is left untouched and reused further down.
gene_info_symbols = gene_info[['gene_ncbi', 'symbol_ncbi', 'Synonyms', 'symbol_authority']].copy()

In [None]:
import access_science_shared

In [None]:
# Split the 'Synonyms' column on the pattern `\|` via the project helper
# (presumably yields one row per synonym — confirm in resci_tools).
# The pattern is written as a raw string: '\|' in a plain literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on newer Python). The
# resulting string value is unchanged.
gene_info_symbols = rst.split_text_to_multiple_rows(
    gene_info_symbols,
    'Synonyms',
    r'\|')

In [None]:
# Stack the three symbol-bearing columns into one long (gene_ncbi, symbol) table.
agg = [
    gene_info_symbols.loc[:, ['gene_ncbi', c]].rename(columns={c: 'symbol'})
    for c in ['symbol_ncbi', 'Synonyms', 'symbol_authority']
]
gene_info_symbols = pd.concat(agg, axis=0)

# Case-insensitive symbols, one row per distinct (gene, symbol) pair.
gene_info_symbols['symbol'] = gene_info_symbols['symbol'].str.lower()
gene_info_symbols = gene_info_symbols.drop_duplicates()

# Keep only symbols that occur exactly once, i.e. map to a single gene.
u = gene_info_symbols['symbol'].value_counts()
unambiguous_symbols = u[u == 1].index
gene_info_symbols = gene_info_symbols[
    gene_info_symbols['symbol'].isin(unambiguous_symbols)]

In [None]:
# Copy of the symbol table with lower-cased symbols, rows ordered by the
# natural sort of the symbol.
name_frame = gene_info_symbols.copy()
name_frame['symbol'] = name_frame['symbol'].str.lower()
name_frame = name_frame.set_index('symbol')

rix = natsort.natsorted(name_frame.index)
name_frame = name_frame.loc[rix, :].reset_index().rename(
    columns={'index': 'symbol'})

# Symbols ending in a digit are split into a stem and a trailing number.
m = '[0-9]$'
f = name_frame['symbol'].str.contains(m)
numbered_symbols = name_frame.loc[f, 'symbol']

# Trailing digit run becomes the number; symbols without one get '0'.
name_frame.loc[f, 'number'] = numbered_symbols.str.extract(
    '([0-9]*)$', expand=False)
name_frame.loc[~f, 'number'] = '0'

# The non-greedy stem match leaves all trailing digits to the number;
# separator characters ('-', '.', '_') at the end of the stem are stripped.
# Symbols without a trailing digit are their own stem.
name_frame.loc[f, 'stem'] = numbered_symbols.str.extract(
    '^(.+?)[0-9]*$', expand=False).str.rstrip('-._')
name_frame.loc[~f, 'stem'] = name_frame.loc[~f, 'symbol']

name_frame['number'] = name_frame['number'].astype(int)
name_frame = name_frame.drop_duplicates()


In [None]:
# Attach the number of rows sharing each stem as 'genes_in_group'.
# The count is taken before the filters below, so it can include rows that
# are removed afterwards.
stem_sizes = (
    name_frame['stem']
    .value_counts()
    .to_frame('genes_in_group')
    .reset_index()
    .rename(columns={'index': 'stem'}))
name_frame = name_frame.merge(stem_sizes, on='stem', how='left')

# Drop stems matching the pattern 'c[xy0-9]*orf'.
is_orf_like = name_frame['stem'].str.contains('c[xy0-9]*orf')
name_frame = name_frame.loc[~is_orf_like, :]

# Keep only stems that occur for at least one symbol that is a (lower-cased)
# 'symbol_ncbi' of a gene.
is_main_symbol = name_frame['symbol'].isin(gene_info['symbol_ncbi'].str.lower())
stem_in_main_symbol = name_frame.loc[is_main_symbol, 'stem']
name_frame = name_frame[name_frame['stem'].isin(stem_in_main_symbol)]

# Explicitly excluded stems.
forbidden_stems = ['kiaa']
name_frame = name_frame[~name_frame['stem'].isin(forbidden_stems)]

In [None]:
## Filter for number one and two

# name_frame = name_frame[
#     name_frame['number'].isin([1, 2])
# ]

In [None]:
# Restrict to stems with more than one row and keep only the columns needed
# to order genes within a stem.
f = name_frame['genes_in_group'].gt(1)
m = name_frame.loc[f, ['gene_ncbi', 'stem', 'number']]

In [None]:
# Order rows by stem and, within a stem, by ascending number, so that the
# lowest-numbered entry of every stem comes first.
m = m.sort_values(['stem', 'number'])

In [None]:
# A gene can reach the same stem through several symbols; keep only the first
# row (lowest number after the sort above) per (gene, stem) pair.
m = m.drop_duplicates(['gene_ncbi', 'stem'], keep='first')

In [None]:
# Position of each row within its stem group (0 = first row, 1 = second row;
# rows are already ordered by ['stem', 'number']).
# `groupby(...).nth()` is avoided on purpose: before pandas 2.0 it returned a
# frame indexed by the group key, from 2.0 on it acts as a filter that keeps
# the original index and the 'stem' column. The later `reset_index()` + merge
# steps rely on the 'stem'-indexed shape, so it is built explicitly here and
# is identical on both pandas generations.
rank_in_stem = m.groupby('stem').cumcount()
first = m.loc[rank_in_stem == 0, :].set_index('stem')
second = m.loc[rank_in_stem == 1, :].set_index('stem')

In [None]:
# Mark the selected rows so the flags can be merged back onto `m`.
first = first.assign(is_first=True)
second = second.assign(is_second=True)

In [None]:
# Left-join the flags onto `m` using all shared columns as keys; rows that are
# neither first nor second keep NaN in 'is_first' / 'is_second'.
for flagged in (first, second):
    m = m.merge(flagged.reset_index(), how='left')

In [None]:
# Long table of (gene, stem, name_order) for the rows flagged first / second.
# `.assign` builds new frames rather than writing into a `.loc` slice, which
# avoids pandas' SettingWithCopyWarning.
agg = [
    m.loc[m['is_first'] == True, ['gene_ncbi', 'stem']].assign(name_order='first'),
    m.loc[m['is_second'] == True, ['gene_ncbi', 'stem']].assign(name_order='second'),
]
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
df = pd.concat(agg, axis=0)
df = df.drop_duplicates()

# One row per stem with the first and second gene side by side.
df = df.pivot(index='stem', columns='name_order', values='gene_ncbi')
# Drop stems where both slots hold the same gene, then identical gene pairs
# that were reached through different stems.
df = df[df['first'] != df['second']]
df = df.drop_duplicates()

In [None]:
# Per-gene literature metrics from the project helper; it is merged below on
# its index against gene IDs, so it is presumably indexed by gene_ncbi — confirm.
fame = nar.count_papers_and_attention(ref_genes, gene2pubmed)
# Column of `fame` used as the measure compared between first and second gene.
fame_of_interest = 'papers'

In [None]:
# Attach the chosen fame measure for the first and for the second gene of each
# stem, as columns 'fame_first' and 'fame_second'. Left joins keep stems whose
# genes have no fame entry (NaN).
fame_values = fame[[fame_of_interest]]
master = df
for position in ('first', 'second'):
    master = master.merge(
        fame_values,
        left_on=position,
        right_index=True,
        how='left',
    ).rename(columns={fame_of_interest: 'fame_' + position})

In [None]:
# Drop stems with any missing value (a missing first or second gene, or a
# missing fame value).
master = master.dropna()

In [None]:
# Quick look at the (gene_ncbi, symbol) table.
gene_info_symbols.head(3)

In [None]:
# Stem of each gene's 'symbol_authority' symbol (lower-cased), derived with the
# same digit-suffix rule used for name_frame.
ginfo_official = gene_info[['gene_ncbi', 'symbol_authority']].copy()
ginfo_official['symbol_authority'] = ginfo_official['symbol_authority'].str.lower()

m = '[0-9]$'
f = ginfo_official['symbol_authority'].str.contains(m)
numbered_official = ginfo_official.loc[f, 'symbol_authority']

# Trailing digit run becomes the number; symbols without one get '0'.
ginfo_official.loc[f, 'number'] = numbered_official.str.extract(
    '([0-9]*)$', expand=False)
ginfo_official.loc[~f, 'number'] = '0'

# Stem = symbol without its trailing digits and without trailing '-', '.', '_';
# symbols without a trailing digit are their own stem.
ginfo_official.loc[f, 'stem'] = numbered_official.str.extract(
    '^(.+?)[0-9]*$', expand=False).str.rstrip('-._')
ginfo_official.loc[~f, 'stem'] = ginfo_official.loc[~f, 'symbol_authority']

# Only gene and stem are kept; every remaining row is flagged as official.
ginfo_official = ginfo_official[['gene_ncbi', 'stem']].copy()
ginfo_official['is_official'] = True

In [None]:
# Cast every column to float (this includes the gene IDs in 'first'/'second').
master = master.astype(float)
# Absolute difference in the fame measure between first and second gene.
master['fame_diff'] = master['fame_first'] - master['fame_second']

In [None]:
# sns.lmplot(
#     data=master.apply(lambda x: np.log10(x)),
#     x='fame_second',
#     y='fame_first',
#     scatter_kws={'s':3},
#     fit_reg=False)

# if save_images:
#     ret.export_image('170807_ortholog_number_and_knowledge/first_vs_second_{}.pdf'.format(
#         fame_of_interest))

In [None]:
# Turn the 'stem' index into a column, then look up whether the second gene's
# official stem equals this stem ('is_official' stays NaN when it does not).
master = master.reset_index().merge(
    ginfo_official,
    how='left',
    left_on=['stem', 'second'],
    right_on=['stem', 'gene_ncbi'])

In [None]:
# Remove the join key brought in by the merge. `axis` is passed by keyword:
# pandas 2 no longer accepts it positionally.
master = master.drop('gene_ncbi', axis=1)

In [None]:
# Rows without a match in the left join carry NaN; treat them as not official.
master['is_official'] = master['is_official'].fillna(False)

In [None]:
# The flag computed so far refers to the second gene of each stem.
master = master.rename(columns={'is_official': 'second_uses_official'})

In [None]:
# Same lookup for the first gene: does its official stem equal this stem?
# `master` is merged as-is: the previous `master.reset_index()` call only added
# a meaningless 'index' column that then showed up in displays and correlations.
master = pd.merge(
    master,
    ginfo_official,
    left_on=['stem', 'first'],
    right_on=['stem', 'gene_ncbi'],
    how='left')
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
master = master.drop('gene_ncbi', axis=1)
# Unmatched rows carry NaN; treat them as not official.
master['is_official'] = master['is_official'].fillna(False)
master = master.rename(columns={'is_official': 'first_uses_official'})

In [None]:
# Attach the official stem of the first gene as 'canonical_stem_first'.
# `master` is merged as-is: the previous `master.reset_index()` call only added
# a meaningless index-copy column to the frame.
master = pd.merge(
    master,
    ginfo_official[['gene_ncbi', 'stem']].rename(columns={'stem': 'canonical_stem_first'}),
    left_on=['first'],
    right_on=['gene_ncbi'],
    how='left')
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
master = master.drop('gene_ncbi', axis=1)

In [None]:
# Attach the official stem of the second gene as 'canonical_stem_second'.
master = pd.merge(
    master,
    ginfo_official[['gene_ncbi', 'stem']].rename(columns={'stem': 'canonical_stem_second'}),
    left_on=['second'],
    right_on=['gene_ncbi'],
    how='left')
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
master = master.drop('gene_ncbi', axis=1)

In [None]:
# True when both genes of a pair share the same official stem.
# NaN stems compare unequal, so pairs with a missing official stem are False.
master['canonical_stem_are_same'] = master['canonical_stem_first'] == master['canonical_stem_second']

In [None]:
# log10 of the fame measure of each gene, used as plot axes below.
master['log_fame_first'] = np.log10(master['fame_first'])
master['log_fame_second'] = np.log10(master['fame_second'])

In [None]:
# master = master.set_index('stem')

In [None]:
# True when, for both genes, the official stem equals the stem of this pair.
master['both_use_official'] = master['first_uses_official'] & master['second_uses_official']

In [None]:
# Scatter of log fame (first vs. second gene) for pairs whose official stems
# are identical, annotated with Spearman's correlation.
# NOTE(review): `stat_func` is only accepted by older seaborn releases — confirm
# the pinned seaborn version before upgrading.
same_stem_pairs = master[master['canonical_stem_are_same'] == True]
sns.jointplot(
    x='log_fame_second',
    y='log_fame_first',
    data=same_stem_pairs,
    kind='scatter',
    stat_func=spearmanr,
    xlim=(-0.5, 4),
    ylim=(-0.5, 4),
    joint_kws={'s': 7})

if save_images:
    figure_path = '170810_ortholog_number_and_knowledge/among_canonical_stem_is_same_{}.pdf'.format(
        fame_of_interest)
    ret.export_image(figure_path)

In [None]:
# Source data of the figure for pairs with identical official stems:
# stem plus the fame measure of both genes, under publication-oriented names.
has_same_stem = master['canonical_stem_are_same'] == True
out = master.loc[has_same_stem, ['stem', 'fame_first', 'fame_second']].rename(
    columns={'fame_first': 'publications_first', 'fame_second': 'publications_second'})
if save_tables:
    table_path = '170810_ortholog_number_and_knowledge/among_canonical_stem_is_same_{}_data.csv'.format(
        fame_of_interest)
    ret.export_full_frame(table_path, out, save_index=False)

In [None]:
# Inspect the full pair table built so far.
master

In [None]:
# Scatter of log fame (first vs. second gene) for pairs whose official stems
# differ, annotated with Spearman's correlation.
different_stem_pairs = master[master['canonical_stem_are_same'] == False]
sns.jointplot(
    x='log_fame_second',
    y='log_fame_first',
    data=different_stem_pairs,
    kind='scatter',
    stat_func=spearmanr,
    xlim=(-0.5, 4),
    ylim=(-0.5, 4),
    joint_kws={'s': 7})

if save_images:
    figure_path = '170810_ortholog_number_and_knowledge/among_canonical_stem_is_different_{}.pdf'.format(
        fame_of_interest)
    ret.export_image(figure_path)

In [None]:
# Spearman correlations among pairs with identical official stems.
# Numeric/bool columns are selected explicitly: `master` also holds string
# columns (stems), on which `corr` raises in pandas >= 2.0 instead of silently
# skipping them as older versions did.
master[master['canonical_stem_are_same'] == True].select_dtypes(
    include=['number', 'bool']).corr('spearman')

In [None]:
# Spearman correlations among pairs with different official stems.
# Numeric/bool columns are selected explicitly: `master` also holds string
# columns (stems), on which `corr` raises in pandas >= 2.0 instead of silently
# skipping them as older versions did.
master[master['canonical_stem_are_same'] == False].select_dtypes(
    include=['number', 'bool']).corr('spearman')

In [None]:
# Spearman correlations over all pairs.
# Numeric/bool columns are selected explicitly: `master` also holds string
# columns (stems), on which `corr` raises in pandas >= 2.0 instead of silently
# skipping them as older versions did.
master.select_dtypes(include=['number', 'bool']).corr('spearman')

In [None]:
# log2 fold change of the fame measure, first gene over second gene
# (positive values: the first gene has the larger value).
fame_ratio = master['fame_first'] / master['fame_second']
master['fold_first_over_second'] = np.log2(fame_ratio)

In [None]:
# Column of genes_earliest_years to compare between genes:
# 'first_year' (any paper) or 'first_solo_year' (single-gene papers only).
year_category = 'first_year'

In [None]:
# year_category = 'first_solo_year'

In [None]:
# Attach the earliest publication year of the first gene, then of the second.
# The first merge adds 'gene_ncbi' and the year column unsuffixed; the second
# merge collides with both, so pandas renames them to '<name>_x' (first gene)
# and '<name>_y' (second gene).
earliest_year = genes_earliest_years[['gene_ncbi', year_category]]
for position in ('first', 'second'):
    master = master.merge(
        earliest_year,
        left_on=position,
        right_on='gene_ncbi',
        how='left')

In [None]:
# '_x' holds the year of the first gene and '_y' that of the second gene
# (merge suffixes), so positive values mean the second gene appeared later.
master['year_second_minus_year_first'] = master[year_category + '_y'] - master[year_category + '_x']

In [None]:
# sns.boxplot(x='canonical_stem_are_same', y='year_second_minus_year_first', data=master, notch=True)

# if save_images:
#     ret.export_image('170810_ortholog_number_and_knowledge/years_of_{}.pdf'.format(
#         fame_of_interest))

In [None]:
# Number of missing values per column.
master.isnull().sum()

In [None]:
# master = master.dropna()    #    <----------

In [None]:
# Distribution of the log2 fold change for pairs where both genes use the
# official stem.
sns.distplot(master.loc[
    master['both_use_official'] == True,    
    'fold_first_over_second'])

In [None]:
# Distribution of the log2 fold change for pairs where at least one gene does
# not use the official stem.
sns.distplot(master.loc[
    master['both_use_official'] == False,    
    'fold_first_over_second'])

In [None]:
# Notched box plot of the log2 fold change, split by whether both genes share
# the same official stem.
sns.boxplot(x='canonical_stem_are_same', y='fold_first_over_second', data=master, notch=True)

# Optional export of the figure.
if save_images:
    ret.export_image('170810_ortholog_number_and_knowledge/enrichment_of_{}.pdf'.format(
        fame_of_interest))

In [None]:
# Distribution of the log2 fold change over all pairs.
sns.distplot(master['fold_first_over_second'])

In [None]:
# Mean difference in earliest year (second minus first gene).
master['year_second_minus_year_first'].mean()

In [None]:
# Median difference in earliest year (second minus first gene).
master['year_second_minus_year_first'].median()

In [None]:
# Median fold change (back-transformed from log2) where both genes use the
# official stem.
2**master.loc[master['both_use_official']==True,'fold_first_over_second'].median()

In [None]:
# Median fold change (back-transformed from log2) where at least one gene does
# not use the official stem.
2**master.loc[master['both_use_official']==False,'fold_first_over_second'].median()

In [None]:
# Order pairs by the year difference (ascending).
master = master.sort_values('year_second_minus_year_first')

In [None]:
# Histogram (no KDE) of the year difference, missing values removed, with the
# x axis limited to [-50, 50] years.
year_differences = master['year_second_minus_year_first'].dropna()
sns.distplot(year_differences, kde=False)
axes = plt.gca()
axes.set_xlim([-50, 50])

if save_images:
    figure_path = '170810_ortholog_number_and_knowledge/year_difference_{}.pdf'.format(
        year_category)
    ret.export_image(figure_path)

In [None]:
# Fractions of pairs in which the second gene appears later than, in the same
# year as, or earlier than the first gene (printed in that order).
u = master['year_second_minus_year_first'].dropna()
n_pairs = len(u)
for matches in (u > 0, u == 0, u < 0):
    print(np.count_nonzero(matches) / n_pairs)





In [None]:
# Year difference vs. log2 fold change for pairs with identical official stems;
# the x axis is limited to [-50, 50] years.
sns.jointplot(
    x='year_second_minus_year_first',
    y='fold_first_over_second',
    data=master[master['canonical_stem_are_same'] == True],
    joint_kws={'s':2}, stat_func=spearmanr,
    xlim=(-50, 50))




In [None]:
# Year difference vs. log2 fold change for pairs with different official stems;
# the x axis is limited to [-50, 50] years.
sns.jointplot(
    x='year_second_minus_year_first',
    y='fold_first_over_second',
    data=master[master['canonical_stem_are_same'] == False],
    joint_kws={'s':2}, stat_func=spearmanr,
    xlim=(-50, 50))


In [None]:
# Year difference vs. log2 fold change for all pairs.
sns.jointplot(
    x='year_second_minus_year_first',
    y='fold_first_over_second',
    data=master,joint_kws={'s':2},
    stat_func=spearmanr,
    xlim=(-50, 50))    # note that this will exclude one extreme data point (that seems to be wrong in literature)




In [None]:
# Same relationship for all pairs as a kernel density estimate, without
# axis limits.
sns.jointplot(
    x='year_second_minus_year_first',
    y='fold_first_over_second',
    data=master,kind='kde',
    stat_func=spearmanr)

In [None]:
# Spearman correlations over all pairs, now including the year columns.
# Numeric/bool columns are selected explicitly: `master` also holds string
# columns (stems), on which `corr` raises in pandas >= 2.0 instead of silently
# skipping them as older versions did.
master.select_dtypes(include=['number', 'bool']).corr('spearman')