In [1]:
import pandas as pd
import pybedtools as pbt
from glob import glob
import os
import pyranges as pr
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
import numpy as np
import scipy.stats as stat
sns.set_style("whitegrid")
sns.set_context("paper")

In [2]:
# Annotation
gtf = pr.read_gtf('../../data/genomes/GRCm38/gencode.vM22.primary_assembly.annotation.gtf', as_df=True)
gtf = gtf.loc[gtf.Feature == 'transcript']

In [3]:
geneAnnot = gtf[['gene_id', 'gene_name']].drop_duplicates()
geneAnnot.rename(columns={'gene_id': 'stable_gene_id'}, inplace=True)
geneAnnot.stable_gene_id = geneAnnot.stable_gene_id.apply(lambda x: x.split('.')[0])
geneAnnot.head()

Unnamed: 0,stable_gene_id,gene_name
1,ENSMUSG00000102693,4933401J01Rik
4,ENSMUSG00000064842,Gm26206
7,ENSMUSG00000051951,Xkr4
25,ENSMUSG00000102851,Gm18956
28,ENSMUSG00000103377,Gm37180


In [4]:
# Import files
# Crosslink files - merged replicates
BedFilesRaw = glob('../../data/LIN28_220626_results/Crosslinks/mergedXls/*.bed.gz') + glob('../../data/Pabpc1Pabpc4Iclip_2022/Crosslinks/mergedXls/*.bed.gz')
BedFilesRaw = [f for f in BedFilesRaw if 'AllSamples_merged.bed.gz' not in f]

# Quantseq data
quantfiles = [f for f in glob('../../data/MihaDeseq/salmon_quantfiles/S200*') if 'bulk' not in f] + glob('../../data/MihaDeseq/salmon_quantfiles/KO*')

# 300 nt upstream of 3P end for slamseq regions
ThreePrimeRegion = pbt.BedTool('../../data/3PrimeEnds/Quantseq/Btquantseq3PEndsExtended300Upstream.bed.gz').sort()

# All quantseq UTRs
minLen = 800
minTPM = 1
minXls = 5

SlamseqBedFull = pr.read_bed('../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed', as_df=True)
SlamseqBedFull['length'] = SlamseqBedFull.End - SlamseqBedFull.Start
SlamseqBedFull = SlamseqBedFull.loc[SlamseqBedFull.length >= minLen]
SlamseqBedFull = pbt.BedTool.from_dataframe(SlamseqBedFull).sort()


# Save to
outpath = f'../../results/CharacterizationOfQuantseqGroups/QuantificationOfBinding3PendsMergedReplicates'
os.makedirs(outpath, exist_ok=True)

In [5]:
# Keep only 3prime regions that are entirely within 3'UTRs
SlamseqBed = ThreePrimeRegion.intersect(SlamseqBedFull, s=True, u=True, f=1)

In [6]:
print(len(SlamseqBed), len(SlamseqBedFull), len(ThreePrimeRegion))

2340 2338 4876


In [7]:
TranscriptIds = gtf[['gene_id', 'transcript_id']]
TranscriptIds['stable_gene_id'] = TranscriptIds.gene_id.apply(lambda x: x.split('.')[0])
# map  TPM values from salmon quantseq files to respective transcript IDs
for f in quantfiles:
    name = f.split('/')[-1].split('.')[0]
    dfQ = pd.read_csv(f, sep='\t').rename(columns={'Name': 'transcript_id', 'TPM': f'{name}.TPM'})
    TranscriptIds = TranscriptIds.merge(dfQ[['transcript_id', f'{name}.TPM']], on='transcript_id', how='left')
# Drop nan rows
TranscriptIds = TranscriptIds.loc[~TranscriptIds.isna().any(axis='columns')]
# Sum TPMs for each gene
GeneTpms =TranscriptIds.groupby('stable_gene_id', as_index=False)[[c for c in TranscriptIds.columns if '.TPM' in c]].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
# Quantseq conditions map to dict
quantseqCond = {}
for c in GeneTpms.columns.tolist()[1:]:
    k = c.split('_')[0] + '_' + c.split('_')[2]
    if k not in quantseqCond.keys():
        quantseqCond[k] = [c]
    else:
        quantseqCond[k].append(c)

# Compute Mean TPM for each condition
for c, cols in quantseqCond.items():
    GeneTpms[f'Mean TPM {c}'] = GeneTpms[cols].mean(axis='columns')

In [9]:
# Make library size table for each sample
ls = []
for f in BedFilesRaw:
    df = pr.read_bed(f, as_df=True)
    libsize = df.Score.sum()
    ls.append(libsize)
DfLibsize = pd.DataFrame().from_dict({'sample': BedFilesRaw, 'librarysize': ls})

In [10]:
# Map xl counts to slamseq regions
SampleNames = [f.split('/')[-1].split('.')[0] for f in BedFilesRaw]
if os.path.exists(f'{outpath}/RawXlCounts.tsv'):
    DfQuantified = pd.read_csv(f'{outpath}/RawXlCounts.tsv', sep='\t', index_col=0)
else:
    for f in BedFilesRaw:
        print(f)
        SlamseqBed = SlamseqBed.map(pbt.BedTool(f).sort(), s=True, c=5, o='sum', null=0)
    # Convert to df
    DfQuantified = SlamseqBed.to_dataframe(header=None, names=['chrom', 'start', 'end', 'name', 'score', 'strand', 'annotation'] + SampleNames)
    # # Add one crosslink to all counts to mitigate zero-division error
    # DfQuantified[SampleNames] = DfQuantified[SampleNames] + 1
    DfQuantified['stable_gene_id'] = DfQuantified.annotation.apply(lambda x: x.split('_')[-1])
    DfQuantifiedRaw = DfQuantified.copy()
    DfQuantified.to_csv(f'{outpath}/RawXlCounts.tsv', sep='\t')

In [11]:
# Keep only entries with mean TPM greater than 1 in all conditions
print(GeneTpms.shape)
GeneTpms = GeneTpms.loc[(GeneTpms[[ c for c in GeneTpms.columns if 'Mean TPM' in c]] >= minTPM).all(axis='columns')]
print(GeneTpms.shape)

(54431, 31)
(8400, 31)


In [12]:
print(DfQuantified.groupby('name').count()['chrom'])
DfQuantified['minXlCount'] = DfQuantified[SampleNames].min(axis='columns')

name
CONTROL    1310
DOWN        559
UP          471
Name: chrom, dtype: int64


In [13]:
# Normalize counts in each sample to its libsize
DfLibsize['SampleName'] = DfLibsize['sample'].apply(lambda x: x.split('/')[-1].split('.')[0])
for i, row in DfLibsize.iterrows():
    s = row['SampleName']
    libs = row['librarysize']
    DfQuantified[s] = round(DfQuantified[s] * 10**6 / libs, 4)
# Compute length of each 3'UTR
DfQuantified['UtrLength'] = DfQuantified['end'] - DfQuantified['start']
# Add TPM sums to gene_ids
DfQuantified = DfQuantified.merge(GeneTpms[['stable_gene_id'] + [c for c in GeneTpms.columns if 'Mean TPM' in c]], on='stable_gene_id', how='left')
# Convert to long form df
DfQuantified = DfQuantified.melt(id_vars=[c for c in DfQuantified.columns if c not in SampleNames], value_vars=SampleNames, value_name='CPM', var_name='Sample')
# Merge with information on librarysize
DfQuantified = DfQuantified.merge(DfLibsize[['librarysize', 'SampleName']], left_on='Sample', right_on='SampleName', how='left')
DfQuantified.drop(columns=['SampleName'], inplace=True)
#  Assign protein to each sample
proteinDict = {}
for s in DfQuantified.Sample.unique():
    if 'LIN28A' in s:
        proteinDict[s] = 'LIN28A'
    elif '_C1_Lj' in s:
        proteinDict[s] = 'PABPC1_Lj'
    elif '_C1_Crick' in s:
        proteinDict[s] = 'PABPC1_Crick'
    elif '_C4_Proteintech' in s:
        proteinDict[s] = 'PABPC4_Proteintech'
    elif '_C4_Benthyl' in s:
        proteinDict[s] = 'PABPC4_Benthyl'
    else:
        pass
DfQuantified['Protein'] = DfQuantified.Sample.map(proteinDict)
# For each sample select the right Mean TPM value for normalization
conditionDict = {
    'S200WT_2iL' : ['LIN28A-WT_ESCiLIF_merged'],
    'S200WT_FCL' : [ 'LIN28A-WT_ESC_LIF-CHIR_merged', 'DOX_C1_Crick_merged', 'DOX_C1_Lj_merged', 'DOX_C4_Proteintech_merged', 'DOX_C4_Benthyl_merged',],
    'S200A_FCL' : ['LIN28A-S200A_ESC_LIF-CHIR-FGF0220626_MM_1_merged'],
    'KO_FCL' : [i for i in DfQuantified.Sample.unique() if 'KO' in i],
}
condDictRev = {}
for k, v in conditionDict.items():
    for el in v:
        condDictRev[el] = k
DfQuantified['Condition'] = DfQuantified.Sample.map(condDictRev)
# Based on sample condition, select the relevant Mean TPM value for each sample
DfQuantified['Mean TPM'] = DfQuantified.apply(lambda row: row.loc[f'Mean TPM {row.Condition}'], axis='columns')
DfQuantified['CPM per TPM'] = DfQuantified['CPM'].div(DfQuantified['Mean TPM'], axis='index')

In [14]:
DfQuantified = DfQuantified.merge(geneAnnot, on='stable_gene_id', how='left')
DfQuantified.drop(columns=['UtrLength', 'annotation'], inplace=True)
DfQuantified['normalizationCoefficient'] = DfQuantified[['librarysize', 'Mean TPM']].apply(lambda row: 10**6 / (row['librarysize'] * row['Mean TPM']), axis='columns')
DfQuantified.to_csv(f'{outpath}/QuantifiedClipSignalInSlamseq3UtrsUnfilteredXlCount.tsv', sep='\t', index=False)
DfQuantified = DfQuantified.loc[DfQuantified.minXlCount >= minXls]
DfQuantified.to_csv(f'{outpath}/QuantifiedClipSignalInSlamseq3Utrs.tsv', sep='\t', index=False)


In [15]:
# Save normalization coefficients for clipplotR
genesOfInterest = ['Tfcp2l1', 'Zfp281', 'Esrrb']
DfQuantified.loc[DfQuantified.gene_name.isin(genesOfInterest), ['gene_name', 'stable_gene_id', 'Sample', 'Protein', 'Condition', 'Mean TPM', 'normalizationCoefficient']].to_csv(f'../ClipPlotR_v1/GenesOfInterest_normdata.csv', index=False)