In [1]:
import pandas as pd
from glob import glob

In [2]:
# Input discovery
# Crosslink BED files from the LIN28 and Pabpc1/Pabpc4 result folders, followed by
# the files in their mergedXls subfolders. Matches are sorted per pattern, so the
# overall order follows the order of the patterns listed here.
rawBed = [
    bedPath
    for pattern in (
        '../../data/LIN28_220626_results/Crosslinks/*bed*',
        '../../data/Pabpc1Pabpc4Iclip_2022/Crosslinks/*.bed.gz',
        '../../data/Pabpc1Pabpc4Iclip_2022/Crosslinks/mergedXls/*merged*.bed*',
        '../../data/LIN28_220626_results/Crosslinks/mergedXls/*merged*bed*',
    )
    for bedPath in sorted(glob(pattern))
]

# Bare file names (everything after the last '/') identify samples downstream.
fnames = [bedPath.rsplit('/', 1)[-1] for bedPath in rawBed]

# PABPC1 DKO crosslink files kept separate from rawBed (full paths, sorted).
naivePabpc1 = glob('../../data/Pabpc1Crosslinks/PABPC1_ESC_DKO*.bed.gz')
naivePabpc1.sort()

# Directory that receives the annotation table.
outpath = '../../data/general'

In [3]:
# Map each sample file name to its protein. Markers are tried in the order listed
# and the first one contained in the name wins; names carrying none of the markers
# get no entry at all.
proteinMarkers = (
    ('LIN28A', 'LIN28A'),
    ('_C1_', 'PABPC1'),
    ('_C4_', 'PABPC4'),
)

proteinDict = {}
for sampleName in fnames:
    for marker, proteinLabel in proteinMarkers:
        if marker in sampleName:
            proteinDict[sampleName] = proteinLabel
            break

In [4]:
# Group sample file names by condition, selected by substrings of the file name.
conditionDict = {
    'S200WT_2iL': [name for name in fnames if 'LIN28A-WT_ESCiLIF' in name],
    'S200WT_FCL': [
        name for name in fnames
        if any(tag in name for tag in ('LIN28A-WT_ESC_LIF-CHIR', 'DOX_'))
    ],
    'S200A_FCL': [name for name in fnames if 'LIN28A-S200A_ESC_LIF-CHIR' in name],
    'KO_FCL': [name for name in fnames if 'KO' in name],
}

# Invert to sample -> condition. Conditions are visited in the order declared
# above, so a name listed under several conditions ends up with the last one.
revDict = {
    sample: condition
    for condition, samples in conditionDict.items()
    for sample in samples
}

In [5]:
# Add other experimental details
expDict = {}
for f in fnames:
    if 'LIN28A' in f:
        expDict[f] = 'FLAG-IP LIN28A'
    elif 'Lj' in f:
        expDict[f] = 'Lj'
    elif 'Crick' in f:
        expDict[f] = 'Crick'
    elif 'Proteintech' in f:
        expDict[f] = 'Proteintech'
    elif 'Benthyl' in f:
        expDict[f] = 'Benthyl'
    else:
        pass

In [6]:
# Assemble the per-sample annotation table: one row per crosslink file name, with
# Protein / Condition / Experiment looked up from proteinDict, revDict and expDict.
# Samples missing from a dictionary get a missing value in that column.
SampleAnnotation = pd.DataFrame({'Sample': fnames})

# File names of merged tracks (recognised by '_merged.' in the name).
merged_files = [s for s in SampleAnnotation.Sample.unique() if '_merged.' in s]

SampleAnnotation['Protein'] = SampleAnnotation.Sample.map(proteinDict)
SampleAnnotation['Condition'] = SampleAnnotation.Sample.map(revDict)
SampleAnnotation['Experiment'] = SampleAnnotation.Sample.map(expDict)

# Tag merged tracks in the Experiment label. Rows without an Experiment entry are
# skipped on purpose: formatting a missing value would store the literal text
# 'nan - merged' and hide the gap from isna()/dropna().
needsSuffix = SampleAnnotation.Sample.isin(merged_files) & SampleAnnotation.Experiment.notna()
SampleAnnotation.loc[needsSuffix, 'Experiment'] = (
    SampleAnnotation.loc[needsSuffix, 'Experiment'].map(lambda label: f'{label} - merged')
)
SampleAnnotation

Unnamed: 0,Sample,Protein,Condition,Experiment
0,LIN28A-S200A_ESC_LIF-CHIR-FGF0220626_MM_1.bed.gz,LIN28A,S200A_FCL,FLAG-IP LIN28A
1,LIN28A-WT_ESC_LIF-CHIR-FGF0220626_MM_1.bed.gz,LIN28A,S200WT_FCL,FLAG-IP LIN28A
2,LIN28A-WT_ESC_LIF-CHIR-FGF0220626_MM_2.bed.gz,LIN28A,S200WT_FCL,FLAG-IP LIN28A
3,LIN28A-WT_ESCiLIF-OLD0220626_MM.bed.gz,LIN28A,S200WT_2iL,FLAG-IP LIN28A
4,LIN28A-WT_ESCiLIF0220626_MM_1.bed.gz,LIN28A,S200WT_2iL,FLAG-IP LIN28A
5,LIN28A-WT_ESCiLIF0220626_MM_2.bed.gz,LIN28A,S200WT_2iL,FLAG-IP LIN28A
6,DOX_C1_Crick1.bed.gz,PABPC1,S200WT_FCL,Crick
7,DOX_C1_Crick2.bed.gz,PABPC1,S200WT_FCL,Crick
8,DOX_C4_Benthyl_1.bed.gz,PABPC4,S200WT_FCL,Benthyl
9,DOX_C4_Benthyl_2.bed.gz,PABPC4,S200WT_FCL,Benthyl


In [7]:
# Append one annotation row per file in naivePabpc1. Rows are collected first and
# concatenated in a single call; concatenating inside the loop would copy the whole
# table once per file.
# NOTE: this cell extends SampleAnnotation itself, so running it twice appends the
# rows twice; re-run the cell that builds SampleAnnotation first.
naiveRows = []
for f in naivePabpc1:
    fname = f.split('/')[-1]
    naiveRows.append({
        'Sample': fname,
        'Protein': 'PABPC1',
        # File names containing 'nodox' are annotated KO_2iL, all others S200WT_2iL.
        'Condition': 'KO_2iL' if 'nodox' in fname else 'S200WT_2iL',
        'Experiment': 'PABPC1 CLIP LIN28 overexpression',
    })

# With no matching files the table is left untouched.
if naiveRows:
    SampleAnnotation = pd.concat(
        [
            SampleAnnotation,
            pd.DataFrame(naiveRows, columns=['Sample', 'Protein', 'Condition', 'Experiment']),
        ],
        ignore_index=True,
    )

In [8]:
# Show the last five rows of the annotation table.
SampleAnnotation.tail(5)

Unnamed: 0,Sample,Protein,Condition,Experiment
27,LIN28A-S200A_ESC_LIF-CHIR-FGF0220626_MM_1_merg...,LIN28A,S200A_FCL,FLAG-IP LIN28A - merged
28,LIN28A-WT_ESC_LIF-CHIR_merged.bed.gz,LIN28A,S200WT_FCL,FLAG-IP LIN28A - merged
29,LIN28A-WT_ESCiLIF_merged.bed.gz,LIN28A,S200WT_2iL,FLAG-IP LIN28A - merged
30,PABPC1_ESC_DKO_doxGFPLin28A_grouped_cdna_mm39....,PABPC1,S200WT_2iL,PABPC1 CLIP LIN28 overexpression
31,PABPC1_ESC_DKO_nodox_grouped_cdna_mm39.bed.gz,PABPC1,KO_2iL,PABPC1 CLIP LIN28 overexpression


In [9]:
# Save the annotation table as CSV under outpath (to_csv writes the row index as
# the first column by default).
SampleAnnotation.to_csv(path_or_buf=f'{outpath}/SampleAnnotation.csv')