In [2]:
import pandas as pd
from glob import glob
import pyranges as pr

In [3]:
# Import relevant quantfiles
quantfiles = sorted(glob('../../data/MihaDeseq/salmon_quantfiles/KO*FCL*') + glob('../../data/MihaDeseq/salmon_quantfiles/*S200*FCL*'))
quantfiles = [f for f in quantfiles if "bulk" not in f]

In [4]:
# Annotation
gtf = pr.read_gtf('../../data/genomes/GRCm38/gencode.vM22.primary_assembly.annotation.gtf', as_df=True)
gtf = gtf.loc[gtf.Feature == 'transcript']

In [5]:
TranscriptIds = gtf[['gene_id', 'transcript_id']]
TranscriptIds['stable_gene_id'] = TranscriptIds.gene_id.apply(lambda x: x.split('.')[0])
TranscriptIds['transcript_stable_id'] = TranscriptIds.transcript_id.apply(lambda x: x.split('.')[0])
TranscriptIds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,gene_id,transcript_id,stable_gene_id,transcript_stable_id
1,ENSMUSG00000102693.1,ENSMUST00000193812.1,ENSMUSG00000102693,ENSMUST00000193812
4,ENSMUSG00000064842.1,ENSMUST00000082908.1,ENSMUSG00000064842,ENSMUST00000082908
7,ENSMUSG00000051951.5,ENSMUST00000162897.1,ENSMUSG00000051951,ENSMUST00000162897
10,ENSMUSG00000051951.5,ENSMUST00000159265.1,ENSMUSG00000051951,ENSMUST00000159265
13,ENSMUSG00000051951.5,ENSMUST00000070533.4,ENSMUSG00000051951,ENSMUST00000070533


In [6]:
TranscriptIds[['transcript_stable_id', 'stable_gene_id']].to_csv('transcript_to_gene_mapping_mm10_V22.csv', index=False)

In [7]:
quantfiles

['../../data/MihaDeseq/salmon_quantfiles/KO_E12_FCL_1.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/KO_E12_FCL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/KO_E12_FCL_3.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200A_3_FCL_1.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200A_3_FCL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200A_3_FCL_3.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200A_5_FCL_1.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200A_5_FCL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_1_FCL_1.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_1_FCL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_2_FCL_3.quant.sf']

In [8]:
DfCounts = pd.DataFrame()
for i, f in enumerate(quantfiles):
    s = f.split('/')[-1].split('.')[0]
    df = pd.read_csv(f, sep='\t', index_col=0).rename(columns={'NumReads': s})
    df = df[s]
    # print(df.head())
    DfCounts[s] = df
# Convert transcript counts to gene counts
DfCounts = DfCounts.merge(TranscriptIds[['transcript_id', 'stable_gene_id']], left_on='Name', right_on='transcript_id', how='inner').set_index('transcript_id')
# Aggregate counts for the same gene_id
dfGeneCounts = DfCounts.groupby('stable_gene_id').sum().astype(int)
dfGeneCounts.to_csv('ReadCounts.csv')

In [9]:
DfCountsTPM = pd.DataFrame()
for i, f in enumerate(quantfiles):
    s = f.split('/')[-1].split('.')[0]
    df = pd.read_csv(f, sep='\t', index_col=0).rename(columns={'TPM': s})
    df = df[s]
    # print(df.head())
    DfCountsTPM[s] = df
# Convert transcript counts to gene counts
DfCountsTPM = DfCountsTPM.merge(TranscriptIds[['transcript_id', 'stable_gene_id']], left_on='Name', right_on='transcript_id', how='inner').set_index('transcript_id')
# Aggregate counts for the same gene_id
dfGeneCountsTPM = DfCountsTPM.groupby('stable_gene_id').sum().astype(float)
dfGeneCountsTPM.to_csv('TPMSum.csv')

In [12]:
dfGeneCountsTPM.head()

Unnamed: 0_level_0,KO_E12_FCL_1,KO_E12_FCL_2,KO_E12_FCL_3,S200A_3_FCL_1,S200A_3_FCL_2,S200A_3_FCL_3,S200A_5_FCL_1,S200A_5_FCL_2,S200WT_1_FCL_1,S200WT_1_FCL_2,S200WT_2_FCL_3
stable_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSMUSG00000000001,6.530924,9.36352,5.442524,2.005899,2.375329,8.920072,2.403916,9.464212,0.307615,2.139486,2.946704
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,7.710276,9.091545,7.334555,3.027887,5.469507,9.357579,4.251148,8.563081,2.505082,3.344192,4.11925
ENSMUSG00000000031,0.0,0.0,0.0,1.556927,0.58399,0.706341,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000037,0.016093,0.057227,0.041397,0.237239,0.064356,0.163073,0.117013,0.108123,0.454818,0.096248,0.19645


In [10]:
rows = []
for c in dfGeneCounts.columns:
    genotype = c.split('_')[0]
    condition = c.split('_')[2]
    row = [c, genotype, condition]
    rows.append(row)
DfMeta = pd.DataFrame(rows, columns=['sample', 'genotype', 'condition'])
DfMeta

Unnamed: 0,sample,genotype,condition
0,KO_E12_FCL_1,KO,FCL
1,KO_E12_FCL_2,KO,FCL
2,KO_E12_FCL_3,KO,FCL
3,S200A_3_FCL_1,S200A,FCL
4,S200A_3_FCL_2,S200A,FCL
5,S200A_3_FCL_3,S200A,FCL
6,S200A_5_FCL_1,S200A,FCL
7,S200A_5_FCL_2,S200A,FCL
8,S200WT_1_FCL_1,S200WT,FCL
9,S200WT_1_FCL_2,S200WT,FCL


In [11]:
dfGeneCounts.to_csv('ReadCounts.csv')
DfMeta.to_csv('Metadata.csv', index=False)