In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Put the directory two levels above the notebook at the front of the import
# path (presumably the project root, so local packages can be imported — confirm).
sys.path.insert(0,'../../')

In [3]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Render floats with 3 decimal places. The fully qualified option key is used
# because the bare 'precision' pattern is ambiguous on newer pandas releases
# (it also matches 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 3)

In [4]:
def get_scores(model):
    """Load the per-class average-precision column for one model.

    Reads the tab-separated file ``../../models/<model>/scores.tsv`` and keeps
    only its ``Class`` and ``AP`` columns.

    Parameters
    ----------
    model : str
        Model directory, relative to ``../../models/``.

    Returns
    -------
    pandas.DataFrame
        A single ``AP`` column indexed by the class label; the index is
        named ``term``.
    """
    scores_path = '../../models/%s/scores.tsv' % model
    per_class = pd.read_table(scores_path)
    return per_class[['Class', 'AP']].set_index('Class').rename_axis('term')

In [5]:
# (column label, model directory relative to ../../models/) pairs, in the
# order the columns should appear in the combined score table.
models = [
    ('DBOW', 'doc2vec/dbow_nn'),
    ('DMm', 'doc2vec/dm_mean_nn'),
    ('DMc', 'doc2vec/dm_concat_nn'),
    ('DMs', 'doc2vec/dm_sum_nn'),
    ('CNN', 'nb_filters/fl=1000_lens=4_act=tanh_pool_s=1_kmax=None_wv_d=0.0_pool_d=0.0_conv_bn=0_pool_bn=0'),
]

In [6]:
# One single-column frame per model; the AP column is renamed to the model's
# short label so the columns stay distinguishable after concatenation.
dfs = [get_scores(path).rename(columns={'AP': label})
       for label, path in models]

# Inner join on the term index: only terms present for every model are kept.
scores = pd.concat(dfs, join='inner', axis=1)

In [7]:
# Preview the first rows of the combined per-term AP table (one column per model).
scores.head()

Unnamed: 0_level_0,DBOW,DMm,DMc,DMs,CNN
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3T3 Cells,0.051,0.03,0.009,0.015,0.278
Abdomen,0.019,0.065,0.022,0.028,0.194
Abdominal Pain,0.037,0.065,0.024,0.039,0.278
"Abnormalities, Multiple",0.208,0.06,0.026,0.072,0.363
Abscess,0.099,0.065,0.028,0.044,0.462


In [8]:
# Column-wise mean of the per-term AP values: one number per model.
scores_mean = scores.mean()
scores_mean

DBOW    0.139
DMm     0.141
DMc     0.056
DMs     0.093
CNN     0.442
dtype: float64

In [9]:
# Write the per-model means as a tab-separated file, floats formatted to three decimals.
scores_mean.to_csv('results/mean.tsv', sep='\t', float_format='%.3f')

In [10]:
# Category names are the second tab-separated field of every line in
# categories.tsv. The file is opened in a `with` block so the handle is
# closed as soon as the list is built instead of being left open.
with open('categories.tsv') as categories_file:
    categories = [line.rstrip('\n').split('\t')[1]
                  for line in categories_file]

# Term table indexed by its first column; later cells select rows by testing
# the category-named columns for the value 1.
df = pd.read_table('term_categories.tsv', index_col=0)

In [11]:
# Mean AP per model for each category, collected as one named Series per category.
groups_scores = []

for category in categories:
    try:
        # Rows whose column for this category equals 1. Grouping by the bare
        # column name (not a one-element list) keeps `get_group(1)` valid on
        # recent pandas, where a list key expects a tuple group name; the
        # KeyError raised otherwise would be caught below and every category
        # would be skipped.
        df_group = df.groupby(category).get_group(1)
    except KeyError:
        # Column missing, or no row flagged with 1: report the category and move on.
        print(category)
        continue

    scores_subset = scores.loc[df_group.index]
    # Deliberately not named `scores_mean`, so the overall per-model mean
    # computed earlier in the notebook is not overwritten by the last category.
    category_mean = scores_subset.mean()
    category_mean.name = category
    groups_scores.append(category_mean)

Publication Characteristics


In [12]:
# Place the per-category mean Series side by side, then flip the table so
# each category becomes a row and each model a column.
groups_scores = (
    pd.concat(groups_scores, axis=1, join='inner')
    .T
    .rename_axis('category')
)
groups_scores

Unnamed: 0_level_0,DBOW,DMm,DMc,DMs,CNN
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anatomy,0.14,0.152,0.052,0.095,0.464
Organisms,0.174,0.171,0.068,0.11,0.571
Diseases,0.158,0.17,0.066,0.117,0.54
Chemicals and Drugs,0.133,0.148,0.049,0.092,0.488
"Analytical), Diagnostic and Therapeutic Techniques and Equipment",0.137,0.12,0.055,0.085,0.37
Psychiatry and Psychology,0.131,0.132,0.06,0.091,0.397
Biological Sciences,0.148,0.142,0.061,0.096,0.385
Physical Sciences,0.098,0.089,0.039,0.061,0.28
"Anthropology), Education), Sociology and Social Phenomena",0.109,0.12,0.05,0.078,0.358
Technology and Food and Beverages,0.148,0.142,0.049,0.092,0.431


In [13]:
# Write the per-category means as a tab-separated file, floats formatted to three decimals.
groups_scores.to_csv('results/categories.tsv', sep='\t', float_format='%.3f')

In [14]:
# One check tag per line; the `with` block closes the file handle once the
# list has been built.
with open('check_tags.txt') as check_tags_file:
    check_tags = [line.rstrip('\n') for line in check_tags_file]

In [15]:
# Build the row mask from `scores`' own index. A mask taken from `df.index`
# selects the right rows only if `df` and `scores` hold the same terms in the
# same order, and fails outright when their lengths differ.
checktags_scores = scores[scores.index.isin(check_tags)]
checktags_scores

Unnamed: 0_level_0,DBOW,DMm,DMc,DMs,CNN
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adolescent,0.316,0.326,0.21,0.215,0.611
Adult,0.513,0.504,0.376,0.375,0.784
Aged,0.467,0.466,0.319,0.318,0.738
"Aged, 80 And Over",0.223,0.235,0.144,0.146,0.472
Animals,0.765,0.758,0.527,0.56,0.966
Cats,0.26,0.309,0.206,0.178,0.864
Cattle,0.297,0.304,0.101,0.2,0.773
Chick Embryo,0.185,0.159,0.065,0.106,0.728
Child,0.35,0.39,0.262,0.265,0.73
"Child, Preschool",0.268,0.299,0.197,0.195,0.65


In [16]:
# Write the check-tag rows as a tab-separated file, floats formatted to three decimals.
checktags_scores.to_csv('results/check_tags.tsv', sep='\t', float_format='%.3f')

In [17]:
from medline.data import read_test_labels

# Summing the test label matrix over axis 0 gives one total per column.
Y = read_test_labels()
# The summed result is converted to an ndarray and its first row taken
# (presumably Y.sum returns a 1 x n matrix — confirm against read_test_labels).
supports = np.asarray(Y.sum(axis=0))[0]

# Positional assignment: assumes the column order of Y matches the row order
# of df — TODO confirm.
df['Support'] = supports

In [18]:
# Keep only the DBOW, DMm and CNN columns for the tables produced below.
# NOTE(review): this rebinds `scores`, so any earlier cell re-run after this
# one sees three columns instead of five.
scores = scores[['DBOW', 'DMm', 'CNN']]

In [19]:
# For every category, print and save the scores of its ten highest-support terms.
for category in categories:
    try:
        # Rows whose column for this category equals 1. Grouping by the bare
        # column name (not a one-element list) keeps `get_group(1)` valid on
        # recent pandas, where a list key expects a tuple group name; the
        # KeyError raised otherwise would be caught below and every category
        # would be silently skipped.
        df_group = df.groupby(category).get_group(1)
    except KeyError:
        # Column missing, or no row flagged with 1: nothing to export.
        continue

    # Ten rows with the largest 'Support' value in this category.
    df_group = df_group.sort_values('Support', ascending=False).head(10)

    scores_subset = scores.loc[df_group.index]
    # Support goes in as the first column; values are aligned on the term index.
    scores_subset.insert(0, 'support', df_group.Support)

    print(scores_subset)
    scores_subset.to_csv('results/categories/%s.tsv' % category, sep='\t', float_format='%.3f')

                       support   DBOW    DMm    CNN
term                                               
Cells, Cultured          87497  0.243  0.251  0.493
Cell Line                67073  0.175  0.168  0.392
Brain                    62719  0.209  0.232  0.500
Liver                    52759  0.294  0.333  0.653
Cell Line, Tumor         41202  0.353  0.304  0.573
Neurons                  39709  0.279  0.294  0.564
Kidney                   30924  0.217  0.247  0.487
Tumor Cells, Cultured    30714  0.160  0.154  0.393
Lung                     26243  0.208  0.271  0.520
Cell Membrane            25618  0.134  0.156  0.337
                      support   DBOW    DMm    CNN
term                                              
Humans                1911332  0.934  0.917  0.989
Animals                885550  0.765  0.758  0.966
Rats                   229068  0.511  0.457  0.929
Mice                   220522  0.483  0.458  0.887
Rats, Sprague-Dawley    53179  0.219  0.189  0.585
Cattle             