# Tables of polyA sites for each species

**Purpose**: To compile a reference table of clustered cleavage sites, referred to as polyA sites, for each species.


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

## IMPORTS AND SETUP

In [4]:
from paper_utilities import motifs

In [5]:
## Root of the project tree; every input and output path below is built from it
PROJECT = "/projects/b1080/eks/polyadenylation/yeast/"

## Directory receiving the tables written by this notebook (created if it does not exist yet)
OUTDIR = os.path.join(PROJECT, 'manuscript', 'analysis', 'polya_sites_tables')
os.makedirs(OUTDIR, exist_ok = True)

## 'resources' directory located next to OUTDIR, i.e. under the same parent directory
RESOURCES = os.path.join(os.path.dirname(OUTDIR), 'resources')


## HELPER FUNCTIONS

In [6]:
def label_site_relative_position(sites_data):
    """Label every polyA site with its relative position within its gene.

    Sites whose ``feature`` is ``utr3`` or ``utr3_extended`` are pooled per
    ``(gene, chrom, strand)`` and ranked by their ``start`` coordinate:

    * ``'single'`` -- the gene has exactly one such site;
    * ``'first'`` / ``'last'`` -- the lowest / highest coordinate when the
      strand is ``'+'``, and the reverse for any other strand value;
    * ``'middle'`` -- every coordinate strictly between the two extremes.

    Labels are looked up by ``(gene, chrom, strand, start)``. Afterwards,
    sites whose ``feature`` is ``coding_exon``, ``intron``, ``utr5`` or
    ``intergenic`` are labelled ``'CDS'``, ``'intron'``, ``'utr5'`` and
    ``'intergenic'`` respectively. Sites matching none of the above keep NaN.

    Parameters
    ----------
    sites_data : pandas.DataFrame
        Table of sites with the columns ``gene``, ``chrom``, ``strand``,
        ``start`` and ``feature``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``position`` column is added
        (or overwritten) in place.
    """

    utr3_features = ['utr3', 'utr3_extended']
    feature_labels = {
        'coding_exon' : 'CDS',
        'intron'      : 'intron',
        'utr5'        : 'utr5',
        'intergenic'  : 'intergenic',
    }

    ## Compile a list of polyA site locations based on their assigned gene, chromosome, and strand
    ## (column-wise iteration avoids building one Series per row as iterrows does)

    utr3_sites = sites_data.loc[sites_data['feature'].isin(utr3_features)]
    starts_by_gene = {}

    for gene, chrom, strand, start in zip(utr3_sites['gene'], utr3_sites['chrom'], utr3_sites['strand'], utr3_sites['start']):
        starts_by_gene.setdefault((gene, chrom, strand), []).append(start)

    ## Label each included site with the relative position based on the assigned gene and strand

    labels_by_gene = {}

    for gene_key, starts in starts_by_gene.items():

        if (len(starts) == 1):
            labels_by_gene[gene_key] = {starts[0] : 'single'}
            continue

        ordered = sorted(starts)
        strand = gene_key[2]
        low_label, high_label = ('first', 'last') if (strand == '+') else ('last', 'first')

        ## Interior coordinates are labelled before the extremes so that a coordinate
        ## repeated on several rows can never replace 'first'/'last' with 'middle'
        gene_labels = {coordinate : 'middle' for coordinate in ordered[1:-1]}
        gene_labels[ordered[0]] = low_label
        gene_labels[ordered[-1]] = high_label

        labels_by_gene[gene_key] = gene_labels

    ## Map these relative position labels back to individual sites

    site_labels = [
        labels_by_gene.get((gene, chrom, strand), {}).get(start, np.nan)
        for gene, chrom, strand, start in zip(sites_data['gene'], sites_data['chrom'], sites_data['strand'], sites_data['start'])
    ]

    ## Object dtype keeps the column able to hold the string labels assigned below,
    ## even when no site received a 3'UTR label
    sites_data['position'] = np.array(site_labels, dtype = object)

    for feature, label in feature_labels.items():
        sites_data.loc[sites_data['feature'] == feature, 'position'] = label

    return sites_data


## ANALYSIS


### Prepare polyA site data

#### Load information for polyA sites

In [7]:
def _load_clustered_sites(species, reads):
    """Read the `fwd` and `rev` clustered-site tables of one species into one frame.

    Parameters
    ----------
    species : str
        Name of the species directory under PROJECT.
    reads : int
        Value used in the `reads_<n>` part of the file names
        (presumably the read-count threshold applied upstream -- confirm).

    Returns
    -------
    pandas.DataFrame
        The `fwd` table followed by the `rev` table, with a fresh integer index.
    """
    data_dir = os.path.join(PROJECT, species, 'data', 'data_strength')

    strand_tables = [
        pd.read_csv(
            os.path.join(data_dir, f'redistA.clustered.wild_type.{direction}.tf_0.75.ru_0.02.reads_{reads}.annotated.scored_cluster_utr3-utr3_extended.txt'),
            sep = '\t',
        )
        for direction in ('fwd', 'rev')
    ]

    return pd.concat(strand_tables, ignore_index = True, sort = False)


sites_scer = _load_clustered_sites('saccharomyces_cerevisiae', 10)
sites_spom = _load_clustered_sites('schizosaccharomyces_pombe', 5)
sites_atha = _load_clustered_sites('arabidopsis_thaliana', 10)

print(f"PolyA sites: S.cer={sites_scer.shape}, S.pom={sites_spom.shape}, A.tha={sites_atha.shape}")


PolyA sites: S.cer=(30931, 32), S.pom=(10588, 32), A.tha=(35443, 32)


#### Categorize sites based on relative position in the 3'UTR

In [8]:
## Add the 'position' column to the table of each species
sites_scer, sites_spom, sites_atha = [
    label_site_relative_position(species_sites)
    for species_sites in (sites_scer, sites_spom, sites_atha)
]

print(f"With relative position information: S.cer={sites_scer.shape}, S.pom={sites_spom.shape}, A.tha={sites_atha.shape}")


With relative position information: S.cer=(30931, 33), S.pom=(10588, 33), A.tha=(35443, 33)


In [9]:
## Number of S. cerevisiae sites per annotated feature and relative-position label.
## NOTE: groupby leaves out rows whose 'position' is NaN (pandas default dropna=True),
## so the counts only add up to the table length when every site received a label.
sites_scer.groupby(['feature','position']).size()


feature        position
coding_exon    CDS         8338
intron         intron        28
utr3           first       2642
               last         612
               middle      3434
               single       196
utr3_extended  first       2268
               last        4298
               middle      7693
               single       352
utr5           utr5        1070
dtype: int64

In [10]:
## Number of S. pombe sites per annotated feature and relative-position label.
## NOTE: groupby leaves out rows whose 'position' is NaN (pandas default dropna=True),
## so the counts only add up to the table length when every site received a label.
sites_spom.groupby(['feature','position']).size()


feature        position
coding_exon    CDS          827
intron         intron       127
utr3           first       2424
               last        1804
               middle      2513
               single      1175
utr3_extended  first        106
               last         726
               middle       313
               single       215
utr5           utr5         358
dtype: int64

In [11]:
## Number of A. thaliana sites per annotated feature and relative-position label.
## NOTE: groupby leaves out rows whose 'position' is NaN (pandas default dropna=True),
## so the counts only add up to the table length when every site received a label.
sites_atha.groupby(['feature','position']).size()


feature        position
coding_exon    CDS           105
intron         intron        194
utr3           first        9468
               last         8483
               middle      11523
               single       4034
utr3_extended  first          64
               last         1049
               middle        219
               single        260
utr5           utr5           44
dtype: int64

#### Assign usage score of 1 for single sites in 3'UTRs

Single sites initially have no usage score, because scores were only computed for genes with at least two sites in the 3'UTR or extended 3'UTR region. We therefore assign them a usage score of 1 below.


In [12]:
## Before filling: count how many 'single' sites have a missing usage score, per species
for species_label, species_sites in (("S.cer", sites_scer), ("S.pom", sites_spom), ("A.tha", sites_atha)):
    single_scores = species_sites.loc[species_sites['position'] == 'single', 'raw_den_score']
    print(f"{species_label}: {single_scores.isna().value_counts().to_dict()}")


S.cer: {True: 548}
S.pom: {True: 1390}
A.tha: {True: 4294}


In [13]:
## Give every 'single' site a usage score of 1 (written into each species table in place)
for species_sites in (sites_scer, sites_spom, sites_atha):
    is_single = species_sites['position'] == 'single'
    species_sites.loc[is_single, 'raw_den_score'] = 1


In [14]:
## After filling: tabulate the usage scores now carried by the 'single' sites, per species
for species_label, species_sites in (("S.cer", sites_scer), ("S.pom", sites_spom), ("A.tha", sites_atha)):
    single_scores = species_sites.loc[species_sites['position'] == 'single', 'raw_den_score']
    print(f"{species_label}: {single_scores.value_counts().to_dict()}")


S.cer: {1.0: 548}
S.pom: {1.0: 1390}
A.tha: {1.0: 4294}


In [15]:
## After filling: confirm that no 'single' site is left with a missing usage score
for species_label, species_sites in (("S.cer", sites_scer), ("S.pom", sites_spom), ("A.tha", sites_atha)):
    single_scores = species_sites.loc[species_sites['position'] == 'single', 'raw_den_score']
    print(f"{species_label}: {single_scores.isna().value_counts().to_dict()}")


S.cer: {False: 548}
S.pom: {False: 1390}
A.tha: {False: 4294}


#### Format for output table

In [16]:
## Input column -> output column; the order of the entries is the column order of the output tables
renamed_cols = {
    'chrom'         : 'Chrom',
    'cluster_start' : 'ClusterStart',
    'cluster_end'   : 'ClusterEnd',
    'start'         : 'RepresentativeSite',
    'strand'        : 'Strand',
    'gene'          : 'Gene',
    'feature'       : 'Feature',
    'position'      : 'RelativePosition',
    'cluster_reads' : "Supporting3'READS",
    'raw_den_score' : 'Usage',
}

## Keep only the listed columns, in that order, then apply the output names
source_cols = list(renamed_cols)

output_scer, output_spom, output_atha = [
    species_sites[source_cols].rename(columns = renamed_cols)
    for species_sites in (sites_scer, sites_spom, sites_atha)
]


#### Record table

In [17]:
## Write one tab-separated table per species into OUTDIR; missing values are written as 'NA'
for output_table, table_name in (
    (output_scer, 'polya_sites_table.scer.txt'),
    (output_spom, 'polya_sites_table.spom.txt'),
    (output_atha, 'polya_sites_table.atha.txt'),
):
    output_table.to_csv(os.path.join(OUTDIR, table_name), sep = '\t', index = False, na_rep = 'NA')
