In [18]:
import json
import pandas as pd
import numpy as np
from yome import Session
from yome.models import *
from yome.util import to_df, report
import re
from sqlalchemy import or_, and_
from sqlalchemy.orm import aliased
import itertools as it
import seaborn as sns
from tqdm import tqdm
from collections import Counter

In [19]:
from mpl_recipes import mpl_setup
%mpl_setup

Populating the interactive namespace from numpy and matplotlib


In [20]:
# Create the session object from the yome package; every query in the cells
# below is issued through it (presumably a SQLAlchemy session -- confirm in yome).
session = Session()

In [21]:
def calculate_word_freq(series):
    """Placeholder: ignores `series` and always returns None."""
    return

In [22]:
# get all features, ignoring 't' and 'f'
# Load the knowledgebase feature annotations with columns locus_id,
# primary_name, feature_type and feature (to_df comes from yome.util and
# presumably converts the query result to a DataFrame -- confirm).
# Excluded rows: feature values 't', 'f' and the empty string, and every row
# whose feature_type is 'summary_html'.
# The joins carry no explicit ON clause, so their order matters; the join
# conditions presumably come from relationships declared in yome.models
# (TODO confirm).
features = to_df(
    session.query(
        Gene.locus_id,
        KnowledgebaseGene.primary_name,
        KnowledgebaseFeature.feature_type,
        KnowledgebaseFeature.feature,
    )
    .join(KnowledgebaseGene)
    .join(Knowledgebase)
    .join(KnowledgebaseFeature)
    .filter(KnowledgebaseFeature.feature.notin_(['t', 'f', '']))
    .filter(KnowledgebaseFeature.feature_type.notin_(['summary_html']))
)

In [23]:
# Preview the first rows of the feature table as a sanity check.
features.head()

Unnamed: 0,locus_id,primary_name,feature_type,feature
0,b2331,smrB,description,putative endonuclease SmrB
1,b2331,smrB,product_type,polypeptide
2,b2038,rfbC,description,"dTDP-4-dehydrorhamnose 3,5-epimerase"
3,b2038,rfbC,product_type,enzyme
4,b2038,rfbC,ec_number,5.1.3.13


In [37]:
# get y-ome list
# Get the Y-ome list: locus_id and annotation_quality for the gene entries
# joined to the knowledgebase whose name is 'Y-ome'.
# The following cell splits this table on the annotation_quality values
# 'high', 'low' and 'excluded'.
# As above, the joins have no explicit ON clause; the conditions presumably
# come from yome.models (TODO confirm).
yome = to_df(
    session.query(
        Gene.locus_id,
        KnowledgebaseGene.annotation_quality
    )
    .join(KnowledgebaseGene)
    .join(Knowledgebase)
    .filter(Knowledgebase.name == 'Y-ome')
)

In [39]:
# Partition the Y-ome table into one frame per annotation_quality label.
yome_high = yome.loc[yome['annotation_quality'].eq('high')]
yome_low = yome.loc[yome['annotation_quality'].eq('low')]
yome_excluded = yome.loc[yome['annotation_quality'].eq('excluded')]

In [25]:
def word_map(word):
    """Return `word` with every character outside a-z / A-Z removed."""
    letters_only = re.sub(r'[^a-zA-Z]', '', word)
    return letters_only

def word_filter(word, min_length=5):
    """Return True when `word` is long enough to keep.

    Parameters
    ----------
    word : str
        Candidate word (any object supporting ``len`` works).
    min_length : int, default 5
        Minimum number of characters required. The default reproduces the
        previously hard-coded threshold, so ``filter(word_filter, words)``
        behaves exactly as before.

    Returns
    -------
    bool
    """
    return len(word) >= min_length

In [26]:
# get all unique words from query
all_words = [item for sublist in features.feature.values for item in sublist.split()]
# filter out short and non-ASCII words
filtered_words = list(filter(word_filter, map(word_map, all_words)))

In [27]:
# only check common words
most_common = [word for word, count in Counter(filtered_words).most_common() if count > 10]

In [28]:
# One row per common word; the count columns are added in a later cell.
words_df_common = pd.DataFrame({'word': most_common})

In [30]:
# NOTE(review): this cell used to rebind `yome` to this one-element list, which
# on a top-to-bottom run replaces the Y-ome DataFrame of the same name built
# earlier in the notebook. The list now has its own name so `yome` stays a
# DataFrame. It is presumably a leftover single-locus spot check -- confirm,
# or delete the cell.
yome_example_loci = ['b1779']

In [32]:
# count unique genes matching words
features_agg = features.loc[:, ['locus_id', 'feature']].groupby('locus_id').agg(lambda x: ' '.join(x)).reset_index()

In [41]:
# Restrict the per-gene feature text to the genes in each Y-ome quality group.
features_agg_yome_high = features_agg.loc[
    features_agg['locus_id'].isin(yome_high['locus_id'])
]
features_agg_yome_low = features_agg.loc[
    features_agg['locus_id'].isin(yome_low['locus_id'])
]
features_agg_yome_excluded = features_agg.loc[
    features_agg['locus_id'].isin(yome_excluded['locus_id'])
]

In [42]:
# progress bar
tqdm.pandas()

In [43]:
def count_genes_matching(word, gene_features):
    """Count the rows of `gene_features` whose text contains `word` as a whole word.

    Parameters
    ----------
    word : str
        Word to look for. It is regex-escaped, so it is always matched
        literally between `\\b` word boundaries.
    gene_features : pandas.DataFrame
        Frame with a string column named `feature`.

    Returns
    -------
    Number of rows whose `feature` text matches (case-sensitive).
    """
    pattern = r'\b' + re.escape(word) + r'\b'
    return gene_features.feature.str.contains(pattern).sum()


# One regex scan of the group's feature text per word; this is slow (the
# recorded run took several minutes for the 'high' group), hence the progress bar.
for count_column, gene_features in [
    ('count_high', features_agg_yome_high),
    ('count_low', features_agg_yome_low),
    ('count_excluded', features_agg_yome_excluded),
]:
    words_df_common.loc[:, count_column] = words_df_common.loc[:, 'word'].progress_apply(
        lambda word: count_genes_matching(word, gene_features)
    )

100%|██████████| 2945/2945 [06:31<00:00,  7.25it/s]
100%|██████████| 2945/2945 [01:40<00:00, 29.22it/s]
100%|██████████| 2945/2945 [00:17<00:00, 172.67it/s]


In [46]:
# Rank the words by how many low-quality Y-ome genes mention them, breaking
# ties by the high and then the excluded counts (all descending).
# The sorted table is the cell's last expression so that it is displayed.
# The former trailing line `words_df_common[words_df_common]` was dropped: it
# indexed the frame with itself, which is not a boolean mask and cannot
# produce the table recorded below.
(
    words_df_common
    .set_index('word')
    .sort_values(['count_low', 'count_high', 'count_excluded'], ascending=False)
)

Unnamed: 0_level_0,count_high,count_low,count_excluded
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chain,2614,1506,167
polypeptide,1102,1487,168
protein,2276,1266,184
putative,463,1005,116
function,436,830,30
unknown,117,787,17
Uncharacterized,22,698,11
family,262,656,89
Putative,377,620,148
domain,604,536,33
