# Species comparison: preparation

**Purpose**: To prepare datasets for a comparison of cleavage sites between yeast species and human.


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

## IMPORTS AND SETUP

In [4]:
from paper_utilities import cleavage

In [5]:
PROJECT   = "/projects/b1080/eks/polyadenylation/yeast"
DATADIR   = os.path.join(PROJECT, 'external_data', 'conservation')
OUTDIR    = os.path.join(PROJECT, 'manuscript', 'analysis', 'species_comparison')
RESOURCES = os.path.join(os.path.dirname(OUTDIR), 'resources')
os.makedirs(OUTDIR, exist_ok = True)


In [6]:
## Load motif family definitions for S. cerevisiae

scer_definitions = {'patterns' : {}, 'distance' : {}}

# By pattern - we only look at pattern families for 6mers
with open(os.path.join(RESOURCES, f'motif_definitions.scer.6mers.patterns.pickle'), mode = 'rb') as handle:
    scer_definitions['patterns'][6] = pickle.load(handle)
    
scer_definitions['patterns'][5] = {'family':{}}
scer_definitions['patterns'][4] = {'family':{}}

# By Hamming distance
for kmer in [4,5,6]:
    with open(os.path.join(RESOURCES, f'motif_definitions.scer.{kmer}mers.distance.pickle'), mode = 'rb') as handle:
        scer_definitions['distance'][kmer] = pickle.load(handle)

        
## Load motif family definitions for S. pombe

spom_definitions = {'patterns' : {}, 'distance' : {}}

# By Hamming distance
for kmer in [4,5,6]:
    with open(os.path.join(RESOURCES, f'motif_definitions.spom.{kmer}mers.distance.pickle'), mode = 'rb') as handle:
        spom_definitions['distance'][kmer] = pickle.load(handle)


In [7]:
## Load significant motifs for each species

with open(os.path.join(RESOURCES, 'polyaclassifier_motifs.significant_motifs.pickle'), mode = 'rb') as handle:
    significant_motifs = pickle.load(handle)
    
sigmots_scer = significant_motifs['saccharomyces_cerevisiae']['polyaclassifier']['polyaclassifier_bagging3_kmers-6']
sigmots_spom = significant_motifs['schizosaccharomyces_pombe']['polyaclassifier']['polyaclassifier_bagging3_kmers-6']

print(f"Significant motifs: scer={len(sigmots_scer)}, spom={len(sigmots_spom)}")


Significant motifs: scer=137, spom=230


## HELPER FUNCTIONS

In [8]:
def calculate_entropy_cutoffs(data, species, percentiles):
    return data.loc[data['species'] == species, 'observed_entropy'].quantile(percentiles).tolist()


In [9]:
def calculate_relative_position(data):
    
    ##
    
    sites = {}

    for i,row in tqdm.tqdm(data.iterrows()):

        rowkey = (row['gene'],row['chrom'],row['strand'])

        if not (rowkey in sites):
            sites[rowkey] = [row['start']]
        else:
            sites[rowkey].append(row['start'])
            
    ## 
    
    sites_labels = {}

    for rk,pl in sites.items():

        sites_labels[rk] = {}

        if (len(pl) == 1):
            sites_labels[rk][pl[0]] = 'single'

        else:
            spl = sorted(pl)
            g,c,s = rk

            sites_labels[rk][spl[0]] = 'first' if (s == '+') else 'last'
            sites_labels[rk][spl[-1]] = 'last' if (s == '+') else 'first'

            for p in spl[1:-1]:
                sites_labels[rk][p] = 'middle'
    
    return data.apply(lambda row : sites_labels.get((row['gene'],row['chrom'],row['strand']), {}).get(row['start'], 'NA'), axis = 1)


# ANALYSIS

In [10]:
species_order = ['S.cerevisiae','S.pombe','H.sapiens']

species_palette = {
    'S.cerevisiae' : sns.color_palette("Set2")[0],
    'S.pombe'      : sns.color_palette("Set2")[1],
    'H.sapiens'    : sns.color_palette("Set2")[2],
}

## Prepare gene homology information

#### Compile a list of gene names and Ensembl IDs for *H. sapiens* from annotation GTF

In [11]:
hsap_gene_data  = pd.read_csv(os.path.join(PROJECT, 'homo_sapiens', 'reference', 'annotation', 'annotation.biotype.info'), sep = "\t")
hsap_gene_names = hsap_gene_data[['geneId','geneName']].drop_duplicates()
hsap_gene_map   = dict(zip(hsap_gene_names['geneId'], hsap_gene_names['geneName']))


#### Compile a list of gene names and aliases for *S. cerevisiae* from SGD

In [12]:
scer_gene_data = pd.read_csv(os.path.join(DATADIR, 'homologous_genes', 'sgd', 'SGD_features.txt'), sep = '\t')
scer_gene_data = scer_gene_data.loc[scer_gene_data['ParentFeatureName'].str.contains('chromosome') == True]

print(scer_gene_data.shape)
print(scer_gene_data['StandardGeneName'].str.contains("|").sum())

scer_synonyms = {}

for fname, sname, aliases in zip(scer_gene_data['FeatureName'], scer_gene_data['StandardGeneName'], scer_gene_data['Alias']):
    
    scer_synonyms[fname] = fname
    
    if (str(sname) != "nan"):
        for _ in sname.split("|"):
            scer_synonyms[_] = fname
            
    if (str(aliases) != "nan"):
        for _ in aliases.split("|"):
            scer_synonyms[_] = fname
    
print(len(list(scer_synonyms.keys())))
print(list(scer_synonyms.items())[:5])


(8012, 16)
5424
20181
[('YAL069W', 'YAL069W'), ('YAL068W-A', 'YAL068W-A'), ('ARS102', 'ARS102'), ('ARSI-1', 'ARS102'), ('TEL01L', 'TEL01L')]


#### Compile a list of homologous genes according from PomBase

In [13]:
gene_homologs = pd.read_csv(os.path.join(DATADIR, 'homologous_genes', 'pombase', 'pombe-cerevisiae-human-orthologs.txt'), sep = "\t", dtype = str)
print("Input shape     :", gene_homologs.shape)

gene_homologs['cerevisiaeOrthologs'] = gene_homologs['cerevisiaeOrthologs'].apply(lambda x : x.split("|"))
gene_homologs['humanOrthologs']      = gene_homologs['humanOrthologs'].apply(lambda x : x.split("|"))

gene_homologs = gene_homologs.explode('cerevisiaeOrthologs')
gene_homologs = gene_homologs.explode('humanOrthologs')
print("Exploded shape  :", gene_homologs.shape)

## Remove genes where there is not both a S.cer and H.sap ortholog

condC1 = ~(gene_homologs['cerevisiaeOrthologs'].isna())
condC2 = ~(gene_homologs['cerevisiaeOrthologs'] == 'NONE')
condH1 = ~(gene_homologs['humanOrthologs'].isna())
condH2 = ~(gene_homologs['humanOrthologs'] == 'NONE')

gene_homologs = gene_homologs.loc[condC1 & condC2 & condH1 & condH2].copy()
print("No missing shape:", gene_homologs.shape)

with open(os.path.join(RESOURCES, 'homologous_genes.df.pickle'), mode = 'wb') as handle:
    pickle.dump(gene_homologs, handle)


Input shape     : (5121, 3)
Exploded shape  : (22859, 3)
No missing shape: (17068, 3)


In [14]:
homologs_scer = sorted(list([scer_synonyms[x] for x in gene_homologs['cerevisiaeOrthologs'].unique().tolist() if (x in scer_synonyms)]))
homologs_spom = sorted(gene_homologs['pombeGene'].unique().tolist())
homologs_hsap = sorted(gene_homologs['humanOrthologs'].unique().tolist())

print("S. cer:", len(homologs_scer), gene_homologs['cerevisiaeOrthologs'].nunique())
print("S. pom:", len(homologs_spom))
print("H. sap:", len(homologs_hsap))

homologs_lists = {
    'scer' : homologs_scer,
    'spom' : homologs_spom,
    'hsap' : homologs_hsap,
}

with open(os.path.join(RESOURCES, 'homologous_genes.lists.pickle'), mode = 'wb') as handle:
    pickle.dump(homologs_lists, handle)


S. cer: 3406 3410
S. pom: 3262
H. sap: 4086


#### Compile dictionary mapping homologous genes across species

In [15]:
homologs_map = {}

for i,row in gene_homologs.iterrows():
    
    pOrtholog = row['pombeGene']
    cOrtholog = row['cerevisiaeOrthologs']
    hOrtholog = row['humanOrthologs']
            
    if (cOrtholog not in scer_synonyms):
        continue
        
    cOrtholog = scer_synonyms[cOrtholog]

    homologs_map[f'scer_{cOrtholog}'] = {'scer': cOrtholog, 'spom': pOrtholog, 'hsap': hOrtholog}
    homologs_map[f'spom_{pOrtholog}'] = {'scer': cOrtholog, 'spom': pOrtholog, 'hsap': hOrtholog}
    homologs_map[f'hsap_{hOrtholog}'] = {'scer': cOrtholog, 'spom': pOrtholog, 'hsap': hOrtholog}
    
print(len(list(homologs_map.keys())))

with open(os.path.join(RESOURCES, 'homologous_genes.map.pickle'), mode = 'wb') as handle:
    pickle.dump(homologs_map, handle)


10751


#### Calculate conservation in coding regions

In [16]:
cds_scer = pd.read_csv(os.path.join(PROJECT, "saccharomyces_cerevisiae/analysis/conservation/site_conservation.annotation_coding.exons.window_-1.txt"), sep = "\t")
cds_spom = pd.read_csv(os.path.join(PROJECT, "schizosaccharomyces_pombe/analysis/conservation/site_conservation.annotation_coding.exons.window_-1.txt"), sep = "\t")
cds_hsap = pd.read_csv(os.path.join(PROJECT, "homo_sapiens/analysis/conservation/site_conservation.annotation_coding.exons.window_-1.txt"), sep = "\t")

cds_scer['conservation'] = cds_scer['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(", ")]))
cds_spom['conservation'] = cds_spom['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(", ")]))
cds_hsap['conservation'] = cds_hsap['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(", ")]))

cds_score_scer = np.nan_to_num(np.concatenate(cds_scer['conservation'].tolist()), nan=0)
cds_mean_scer  = np.mean(cds_score_scer)
cds_stdv_scer  = np.std(cds_score_scer)

cds_score_spom = np.nan_to_num(np.concatenate(cds_spom['conservation'].tolist()), nan=0)
cds_mean_spom  = np.mean(cds_score_spom)
cds_stdv_spom  = np.std(cds_score_spom)

cds_score_hsap = np.nan_to_num(np.concatenate(cds_hsap['conservation'].tolist()), nan=0)
cds_mean_hsap  = np.mean(cds_score_hsap)
cds_stdv_hsap  = np.std(cds_score_hsap)


  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
print("S.cerevisiae:", np.shape(cds_score_scer), cds_mean_scer, cds_stdv_scer)
print("S.pombe     :", np.shape(cds_score_spom), cds_mean_spom, cds_stdv_spom)
print("H.sapiens   :", np.shape(cds_score_hsap), cds_mean_hsap, cds_stdv_hsap)


S.cerevisiae: (8719615,) 0.7136361543485581 0.3848678468477208
S.pombe     : (7169805,) 2.161310019881989 1.1171600102972692
H.sapiens   : (35861309,) 0.6695558514609706 0.44467385293130146


In [18]:
cds_scores = {
    'scer' : {'mean': cds_mean_scer, 'stdv': cds_stdv_scer},
    'spom' : {'mean': cds_mean_spom, 'stdv': cds_stdv_spom},
    'hsap' : {'mean': cds_mean_hsap, 'stdv': cds_stdv_hsap},
}

with open(os.path.join(RESOURCES, 'conservation.cds_scores.pickle'), mode = 'wb') as handle:
    pickle.dump(cds_scores, handle)


## Conservation meta-analysis around polyA sites

### Prepare conservation data surrounding top polyA sites

#### Identify polyA sites in homologous genes

In [19]:
top1_scer = pd.read_csv(os.path.join(PROJECT, "saccharomyces_cerevisiae/analysis/conservation/site_conservation.golden_sites.top-1.window_500.txt"), sep = "\t")
top1_spom = pd.read_csv(os.path.join(PROJECT, "schizosaccharomyces_pombe/analysis/conservation/site_conservation.golden_sites.top-1.window_500.txt"), sep = "\t")
top1_hsap = pd.read_csv(os.path.join(PROJECT, "homo_sapiens/analysis/conservation/site_conservation.golden_sites.top-1.window_500.txt"), sep = "\t")

top1_scer['species'] = 'S.cerevisiae'
top1_spom['species'] = 'S.pombe'
top1_hsap['species'] = 'H.sapiens'

top1_scer['featureName'] = top1_scer['gene'].apply(lambda x : scer_synonyms.get(x,np.nan))
top1_spom['featureName'] = top1_spom['gene']
top1_hsap['featureName'] = top1_hsap['gene'].apply(lambda x : hsap_gene_map.get(x,np.nan))

top1_scer['conserved_yeast_to_human'] = top1_scer['featureName'].isin(homologs_scer)
top1_spom['conserved_yeast_to_human'] = top1_spom['featureName'].isin(homologs_spom)
top1_hsap['conserved_yeast_to_human'] = top1_hsap['featureName'].isin(homologs_hsap)

print(f"S.cerevisiae: conserved={(top1_scer['conserved_yeast_to_human'] == True).sum()} not_conserved={(top1_scer['conserved_yeast_to_human'] == False).sum()}")
print(f"S.pombe     : conserved={(top1_spom['conserved_yeast_to_human'] == True).sum()} not_conserved={(top1_spom['conserved_yeast_to_human'] == False).sum()}")
print(f"H.sapiens   : conserved={(top1_hsap['conserved_yeast_to_human'] == True).sum()} not_conserved={(top1_hsap['conserved_yeast_to_human'] == False).sum()}")


S.cerevisiae: conserved=3296 not_conserved=2225
S.pombe     : conserved=2720 not_conserved=1260
H.sapiens   : conserved=3777 not_conserved=12325


#### Calculate conservation and entropy surrounding polyA sites

In [20]:
top1_data = pd.concat([top1_scer, top1_spom, top1_hsap], ignore_index = True, sort = False)

top1_data['conservation']     = top1_data['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
top1_data['readvec']          = top1_data['readvec'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
top1_data['observed_norm']    = top1_data['readvec'].apply(lambda x: x / np.sum(x))
top1_data['observed_entropy'] = top1_data['observed_norm'].apply(lambda x : cleavage.calculate_entropy_from_vector(x))

print(top1_data.shape)


(25603, 19)


#### Sort polyA sites into entropy groups

In [21]:
entropy_cutoffs_scer = [0] + calculate_entropy_cutoffs(top1_data, 'S.cerevisiae', [0.2,0.8]) + [4]
entropy_cutoffs_spom = [0] + calculate_entropy_cutoffs(top1_data, 'S.pombe',      [0.2,0.8]) + [4]
entropy_cutoffs_hsap = [0] + calculate_entropy_cutoffs(top1_data, 'H.sapiens',    [0.2,0.8]) + [4]

print(f"Entropy group cutoffs for S.cer: {entropy_cutoffs_scer}")
print(f"Entropy group cutoffs for S.pom: {entropy_cutoffs_spom}")
print(f"Entropy group cutoffs for H.sap: {entropy_cutoffs_hsap}")

top1_data.loc[top1_data['species'] == 'S.cerevisiae', 'observed_entropy_bin'] = pd.cut(top1_data['observed_entropy'], bins = entropy_cutoffs_scer, labels = ['L','M','H'])
top1_data.loc[top1_data['species'] == 'S.pombe',      'observed_entropy_bin'] = pd.cut(top1_data['observed_entropy'], bins = entropy_cutoffs_spom, labels = ['L','M','H'])
top1_data.loc[top1_data['species'] == 'H.sapiens',    'observed_entropy_bin'] = pd.cut(top1_data['observed_entropy'], bins = entropy_cutoffs_hsap, labels = ['L','M','H'])

print("\nEntropy group counts by species:")
print(top1_data.groupby('species')['observed_entropy_bin'].value_counts(sort = False))


Entropy group cutoffs for S.cer: [0, 2.0427309708591617, 2.761882046212095, 4]
Entropy group cutoffs for S.pom: [0, 1.4555478464625482, 2.272820318029907, 4]
Entropy group cutoffs for H.sap: [0, 1.1828515661854353, 2.0065811389742, 4]

Entropy group counts by species:
species       observed_entropy_bin
H.sapiens     H                       3221
              L                       3221
              M                       9660
S.cerevisiae  H                       1104
              L                       1105
              M                       3312
S.pombe       H                        796
              L                        796
              M                       2388
Name: observed_entropy_bin, dtype: int64


#### Save data for downstream analysis

In [22]:
with open(os.path.join(RESOURCES, 'conservation.top1_sites.pickle'), mode = 'wb') as handle:
    pickle.dump(top1_data, handle)


### Prepare conservation data surrounding golden polyA sites

#### Identify polyA sites in homologous genes

In [23]:
gold_scer = pd.read_csv(os.path.join(PROJECT, "saccharomyces_cerevisiae/analysis/conservation/site_conservation.golden_sites.subset_reads.window_500.txt"), sep = "\t")
gold_spom = pd.read_csv(os.path.join(PROJECT, "schizosaccharomyces_pombe/analysis/conservation/site_conservation.golden_sites.subset_reads.window_500.txt"), sep = "\t")
gold_hsap = pd.read_csv(os.path.join(PROJECT, "homo_sapiens/analysis/conservation/site_conservation.golden_sites.subset_reads.window_500.txt"), sep = "\t")

gold_scer = gold_scer.loc[gold_scer['feature'].str.contains('utr3')].copy()
gold_spom = gold_spom.loc[gold_spom['feature'].str.contains('utr3')].copy()
gold_hsap = gold_hsap.loc[gold_hsap['feature'].str.contains('terminal_exon')].copy()

gold_scer['species'] = 'S.cerevisiae'
gold_spom['species'] = 'S.pombe'
gold_hsap['species'] = 'H.sapiens'

gold_scer['featureName'] = gold_scer['gene'].apply(lambda x : scer_synonyms.get(x,np.nan))
gold_spom['featureName'] = gold_spom['gene']
gold_hsap['featureName'] = gold_hsap['gene'].apply(lambda x : hsap_gene_map.get(x,np.nan))

gold_scer['conserved_yeast_to_human'] = gold_scer['featureName'].isin(homologs_scer)
gold_spom['conserved_yeast_to_human'] = gold_spom['featureName'].isin(homologs_spom)
gold_hsap['conserved_yeast_to_human'] = gold_hsap['featureName'].isin(homologs_hsap)

print("S.cerevisiae:", gold_scer.shape, (gold_scer['conserved_yeast_to_human'] == True).sum(), (gold_scer['conserved_yeast_to_human'] == False).sum())
print("S.pombe     :", gold_spom.shape, (gold_spom['conserved_yeast_to_human'] == True).sum(), (gold_spom['conserved_yeast_to_human'] == False).sum())
print("H.sapiens   :", gold_hsap.shape, (gold_hsap['conserved_yeast_to_human'] == True).sum(), (gold_hsap['conserved_yeast_to_human'] == False).sum())


S.cerevisiae: (11033, 17) 7619 3414
S.pombe     : (2387, 17) 1848 539
H.sapiens   : (19377, 17) 5563 13814


#### Category polyA sites based on relative position in the gene

In [24]:
gold_scer['position'] = calculate_relative_position(gold_scer)
print(gold_scer['position'].value_counts())

gold_spom['position'] = calculate_relative_position(gold_spom)
print(gold_spom['position'].value_counts())

gold_hsap['position'] = calculate_relative_position(gold_hsap)
print(gold_hsap['position'].value_counts())


11033it [00:01, 9578.07it/s]


middle    4393
first     2740
last      2740
single    1160
Name: position, dtype: int64


2387it [00:00, 9543.20it/s]


single    825
last      553
first     553
middle    456
Name: position, dtype: int64


19377it [00:02, 9612.33it/s]


single    7436
first     4479
last      4479
middle    2983
Name: position, dtype: int64


#### Calculate conservation and entropy surrounding polyA sites

In [25]:
gold_data = pd.concat([gold_scer, gold_spom, gold_hsap], ignore_index = True, sort = False)

gold_data['conservation']     = gold_data['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
gold_data['readvec']          = gold_data['readvec'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
gold_data['observed_norm']    = gold_data['readvec'].apply(lambda x: x / np.sum(x))
gold_data['observed_entropy'] = gold_data['observed_norm'].apply(lambda x : cleavage.calculate_entropy_from_vector(x))

print(gold_data.shape)


(32797, 20)


#### Save data for downstream analysis

In [26]:
with open(os.path.join(RESOURCES, 'conservation.gold_sites.pickle'), mode = 'wb') as handle:
    pickle.dump(gold_data, handle)


## Conservation of motifs important from PolyaClassifier

### Prepare conservation data surrounding top polyA sites

#### Identify motifs surrounding top sites in homologous genes

In [27]:
mot_top1_scer = pd.read_csv(os.path.join(PROJECT, "saccharomyces_cerevisiae/analysis/conservation/site_conservation.golden_motifs.top-1.window_-1.txt"), sep = "\t").rename(columns = {'siteKey' : 'posKey'})
mot_top1_spom = pd.read_csv(os.path.join(PROJECT, "schizosaccharomyces_pombe/analysis/conservation/site_conservation.golden_motifs.top-1.window_-1.txt"), sep = "\t").rename(columns = {'siteKey' : 'posKey'})

mot_top1_scer['motif'] = mot_top1_scer['label'].str.split("|").str[0]
mot_top1_spom['motif'] = mot_top1_spom['label'].str.split("|").str[0]

mot_top1_scer['motifRelPos'] = mot_top1_scer['label'].str.split("|").str[2].astype(int)
mot_top1_spom['motifRelPos'] = mot_top1_spom['label'].str.split("|").str[2].astype(int)

mot_top1_scer['siteKey'] = mot_top1_scer['label'].apply(lambda x : ":".join(x.split("|")[1].split(":")[1:4]))
mot_top1_spom['siteKey'] = mot_top1_spom['label'].apply(lambda x : ":".join(x.split("|")[1].split(":")[1:4]))

mot_top1_scer['species'] = 'S.cerevisiae'
mot_top1_spom['species'] = 'S.pombe'

gene_dict_scer = dict(zip(top1_scer['siteKey'], top1_scer['featureName']))
gene_dict_spom = dict(zip(top1_spom['siteKey'], top1_spom['featureName']))

mot_top1_scer['featureName'] = mot_top1_scer['siteKey'].apply(lambda x : gene_dict_scer.get(x,np.nan))
mot_top1_spom['featureName'] = mot_top1_spom['siteKey'].apply(lambda x : gene_dict_spom.get(x,np.nan))

mot_top1_scer['conserved_yeast_to_human'] = mot_top1_scer['featureName'].isin(homologs_scer)
mot_top1_spom['conserved_yeast_to_human'] = mot_top1_spom['featureName'].isin(homologs_spom)

print("S.cerevisiae:", mot_top1_scer.shape, (mot_top1_scer['conserved_yeast_to_human'] == True).sum(), (mot_top1_scer['conserved_yeast_to_human'] == False).sum())
print("S.pombe     :", mot_top1_spom.shape, (mot_top1_spom['conserved_yeast_to_human'] == True).sum(), (mot_top1_spom['conserved_yeast_to_human'] == False).sum())


S.cerevisiae: (2732895, 14) 1631520 1101375
S.pombe     : (1970100, 14) 1346400 623700


#### Categorize motifs by family and significance

In [28]:
mot_top1_scer['motifOverallFamily'] = mot_top1_scer['motif'].apply(lambda x : scer_definitions['distance'][len(x)]['family'].get(x,'Other'))
mot_top1_spom['motifOverallFamily'] = mot_top1_spom['motif'].apply(lambda x : spom_definitions['distance'][len(x)]['family'].get(x,'Other'))

mot_top1_scer['motifHammingFamily'] = mot_top1_scer['motif'].apply(lambda x : scer_definitions['distance'][len(x)]['hamming'].get(x,'Other'))
mot_top1_spom['motifHammingFamily'] = mot_top1_spom['motif'].apply(lambda x : spom_definitions['distance'][len(x)]['hamming'].get(x,'Other'))

mot_top1_scer['motifSignificance'] = mot_top1_scer['motif'].isin(sigmots_scer)
mot_top1_spom['motifSignificance'] = mot_top1_spom['motif'].isin(sigmots_spom)


#### Calculate mean conservation of each motif

In [29]:
mot_top1_data = pd.concat([mot_top1_scer, mot_top1_spom], ignore_index = True, sort = False)

mot_top1_data['conservation']         = mot_top1_data['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
mot_top1_data['conservation_mean']    = mot_top1_data['conservation'].apply(lambda x : np.mean(np.nan_to_num(x)))
mot_top1_data['conservation_missing'] = mot_top1_data['conservation'].apply(lambda x : np.isnan(x).any())

mot_top1_data.drop(columns = ['conservation'], inplace = True)

print(mot_top1_data.shape)


(4702995, 18)


#### Save data for downstream analysis

In [30]:
with open(os.path.join(RESOURCES, 'conservation.top1_motifs.pickle'), mode = 'wb') as handle:
    pickle.dump(mot_top1_data, handle)


### Prepare conservation data surrounding golden polyA sites

#### Identify motifs surrounding top sites in homologous genes

In [31]:
mot_gold_scer = pd.read_csv(os.path.join(PROJECT, "saccharomyces_cerevisiae/analysis/conservation/site_conservation.golden_motifs.subset_reads.window_-1.txt"), sep = "\t").rename(columns = {'siteKey' : 'posKey'})
mot_gold_spom = pd.read_csv(os.path.join(PROJECT, "schizosaccharomyces_pombe/analysis/conservation/site_conservation.golden_motifs.subset_reads.window_-1.txt"), sep = "\t").rename(columns = {'siteKey' : 'posKey'})

mot_gold_scer['motif'] = mot_gold_scer['label'].str.split("|").str[0]
mot_gold_spom['motif'] = mot_gold_spom['label'].str.split("|").str[0]

mot_gold_scer['motifRelPos'] = mot_gold_scer['label'].str.split("|").str[2].astype(int)
mot_gold_spom['motifRelPos'] = mot_gold_spom['label'].str.split("|").str[2].astype(int)

mot_gold_scer['siteKey'] = mot_gold_scer['label'].apply(lambda x : ":".join(x.split("|")[1].split(":")[1:4]))
mot_gold_spom['siteKey'] = mot_gold_spom['label'].apply(lambda x : ":".join(x.split("|")[1].split(":")[1:4]))

mot_gold_scer['species'] = 'S.cerevisiae'
mot_gold_spom['species'] = 'S.pombe'

gene_dict_scer = dict(zip(gold_scer['siteKey'], gold_scer['featureName']))
gene_dict_spom = dict(zip(gold_spom['siteKey'], gold_spom['featureName']))

mot_gold_scer['featureName'] = mot_gold_scer['siteKey'].apply(lambda x : gene_dict_scer.get(x,np.nan))
mot_gold_spom['featureName'] = mot_gold_spom['siteKey'].apply(lambda x : gene_dict_spom.get(x,np.nan))

mot_gold_scer['conserved_yeast_to_human'] = mot_gold_scer['featureName'].isin(homologs_scer)
mot_gold_spom['conserved_yeast_to_human'] = mot_gold_spom['featureName'].isin(homologs_spom)

print("S.cerevisiae:", mot_gold_scer.shape, (mot_gold_scer['conserved_yeast_to_human'] == True).sum(), (mot_gold_scer['conserved_yeast_to_human'] == False).sum())
print("S.pombe     :", mot_gold_spom.shape, (mot_gold_spom['conserved_yeast_to_human'] == True).sum(), (mot_gold_spom['conserved_yeast_to_human'] == False).sum())


S.cerevisiae: (5778135, 14) 3771405 2006730
S.pombe     : (1233540, 14) 914760 318780


#### Categorize motifs by family and significance

In [32]:
mot_gold_scer['motifOverallFamily'] = mot_gold_scer['motif'].apply(lambda x : scer_definitions['distance'][len(x)]['family'].get(x,'Other'))
mot_gold_spom['motifOverallFamily'] = mot_gold_spom['motif'].apply(lambda x : spom_definitions['distance'][len(x)]['family'].get(x,'Other'))

mot_gold_scer['motifHammingFamily'] = mot_gold_scer['motif'].apply(lambda x : scer_definitions['distance'][len(x)]['hamming'].get(x,'Other'))
mot_gold_spom['motifHammingFamily'] = mot_gold_spom['motif'].apply(lambda x : spom_definitions['distance'][len(x)]['hamming'].get(x,'Other'))

mot_gold_scer['motifSignificance'] = mot_gold_scer['motif'].isin(sigmots_scer)
mot_gold_spom['motifSignificance'] = mot_gold_spom['motif'].isin(sigmots_spom)


#### Calculate mean conservation of each motif

In [33]:
mot_gold_data = pd.concat([mot_gold_scer, mot_gold_spom], ignore_index = True, sort = False)

mot_gold_data['conservation']         = mot_gold_data['conservation'].apply(lambda x : np.asarray([float(_) for _ in x.strip("][").split(",")]))
mot_gold_data['conservation_mean']    = mot_gold_data['conservation'].apply(lambda x : np.mean(np.nan_to_num(x)))
mot_gold_data['conservation_missing'] = mot_gold_data['conservation'].apply(lambda x : np.isnan(x).any())

print(mot_gold_data.shape)


(7011675, 19)


#### Save data for downstream analysis

In [34]:
with open(os.path.join(RESOURCES, 'conservation.gold_motifs.pickle'), mode = 'wb') as handle:
    pickle.dump(mot_gold_data, handle)
