In [1]:
# Purpose of this notebook is the analysis of Direct RNA-seq datasets
# Author: Yuri Malina
# Date: 4/13/2023

### Requires:
# wget https://downloads.wormbase.org/releases/WS265/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS265.mRNA_transcripts.fa.gz
# gunzip c_elegans.PRJNA13758.WS265.mRNA_transcripts.fa.gz
# wget https://downloads.wormbase.org/releases/WS265/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS265.canonical_geneset.gtf.gz
# gunzip c_elegans.PRJNA13758.WS265.canonical_geneset.gtf.gz

In [7]:
# Import necessary packages
import pandas
import numpy
import plotly.express as px
import plotly.graph_objects as go
import pysam

In [17]:
# import csv file into pandas dataframe
polya_tsv_df = pandas.read_csv("/Data1/emmab/data/s_cer/polya_results.tsv", sep="\t")
bamfile = pysam.AlignmentFile('/Data1/emmab/data/s_cer/xol1_null_scer.sorted.bam', 'rb')


In [20]:
# Create an empty DataFrame to store the gene counts
gene_counts = pandas.DataFrame(columns=['Gene', 'Transcript Count'])

for i, read in enumerate(bamfile):
    if i >= 10:
        break

    # Print out the tags for this read
    print(read.tags)


# Iterate over each read in the .bam file
for read in bamfile:
    # Get the gene name from the read's tags
    gene_name = read.get_tag('SA')
    print(gene_name)

    # Increment the transcript count for this gene
    if gene_name in gene_counts['Gene'].values:
        gene_counts.loc[gene_counts['Gene'] == gene_name, 'Transcript Count'] += 1
    else:
        #Append new gene to the DataFrame
        gene_counts = gene_counts.concat([pandas.DataFrame({'Gene': gene_name, 'Transcript Count': 1})], ignore_index=True)
        #gene_counts = gene_counts.append({'Gene': gene_name, 'Transcript Count': 1}, ignore_index=True)

# Close the .bam file
bamfile.close()

# Print the gene counts
print(gene_counts)

[('NM', 80), ('ms', 1087), ('AS', 1078), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 228), ('s1', 1007), ('s2', 695), ('de', 0.04490000009536743), ('rl', 0)]
[('NM', 105), ('ms', 1015), ('AS', 1001), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 187), ('s1', 883), ('s2', 597), ('de', 0.05979999899864197), ('rl', 0)]
[('NM', 79), ('ms', 1087), ('AS', 1079), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 230), ('s1', 1015), ('s2', 662), ('de', 0.046799998730421066), ('rl', 0)]
[('NM', 81), ('ms', 1068), ('AS', 1061), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 235), ('s1', 1027), ('s2', 716), ('de', 0.04960000142455101), ('rl', 0)]
[('NM', 120), ('ms', 977), ('AS', 964), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 184), ('s1', 843), ('s2', 626), ('de', 0.06859999895095825), ('rl', 0)]
[('NM', 89), ('ms', 1071), ('AS', 1061), ('nn', 0), ('ts', '+'), ('tp', 'P'), ('cm', 227), ('s1', 1004), ('s2', 678), ('de', 0.050700001418590546), ('rl', 0)]
[('NM', 132), ('ms', 946), ('AS', 933), ('nn', 0), (

KeyError: "tag 'SA' not present"

In [None]:

#pandasgui.show(polya_tsv_df)
print(polya_tsv_df.head())

# import .gtf reference file into pandas dataframe
gtf_df = pandas.read_csv("/Data1/emmab/reference/c_elegans.PRJNA13758.WS265.canonical_geneset.gtf", sep="\t",skiprows=1,header=None,names=["seqname","source","feature","start","end","score","strand","frame","attribute"])

#Convert the attribute column into seperate columnes
gtf_df = gtf_df.join(gtf_df["attribute"].str.split(";",expand=True).add_prefix("attribute_"))
print(gtf_df.head())