## Count features in a single GenBank file using Python

- Libraries
  - Biopython   (*pip install biopython --user*)
  - Pandas      (*pip install pandas --user*)
- A publicly available data will be used

In [16]:
from Bio import SeqIO
import pandas as pd
from collections import Counter

In [17]:
file_path='/home/kobina/Desktop/M48.gb'

In [18]:
genbank_object=SeqIO.read(file_path,'gb')

In [19]:
all_feature_types=[feature.type for feature in genbank_object.features]

In [20]:
len(all_feature_types)

6513

In [23]:
feature_types=set(all_feature_types)
print(feature_types)

{'ncRNA', 'regulatory', 'rRNA', 'tmRNA', 'source', 'tRNA', 'gene', 'CDS'}


In [24]:
feature_counts=Counter(all_feature_types)

In [26]:
feature_counts.keys()

dict_keys(['source', 'gene', 'CDS', 'regulatory', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [27]:
feature_counts['gene']

3251

In [28]:
for key,value in feature_counts.items():
    print(key,value)

source 1
gene 3251
CDS 3171
regulatory 10
tRNA 60
ncRNA 3
rRNA 16
tmRNA 1


In [31]:
del feature_counts['source']
del feature_counts['regulatory']

In [32]:
feature_counts.keys()

dict_keys(['gene', 'CDS', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [33]:
dataframe=pd.DataFrame(feature_counts.items(),columns=['Feature','Count'])

In [34]:
dataframe.shape

(6, 2)

In [35]:
dataframe

Unnamed: 0,Feature,Count
0,gene,3251
1,CDS,3171
2,tRNA,60
3,ncRNA,3
4,rRNA,16
5,tmRNA,1


In [36]:
outputfile='/home/kobina/Desktop/feature_count.csv'

In [37]:
dataframe.to_csv(outputfile,index=False)

## Count features in multiple genbank files (Part1)


**Activities**
- Download genbank files
- Put all files in the same folder/directory
- Count features using biopython
- Save the result for individual files to their respective output files

**Libraries**
  - Biopython   (*pip install biopython --user*)
  - Pandas      (*pip install pandas --user*)

In [5]:
import glob
from Bio import SeqIO
from collections import Counter
import pandas as pd
import os

In [6]:
file_directory="/home/kobina/Desktop/sequences"

In [9]:
gfiles=glob.glob("%s/*.gb"%file_directory)

In [10]:
gfiles

['/home/kobina/Desktop/sequences/V521.gb',
 '/home/kobina/Desktop/sequences/AR465.gb',
 '/home/kobina/Desktop/sequences/R50.gb',
 '/home/kobina/Desktop/sequences/P10.gb',
 '/home/kobina/Desktop/sequences/M48.gb']

In [11]:
print(len(gfiles))

5


In [12]:
gfiles[0]

'/home/kobina/Desktop/sequences/V521.gb'

In [19]:
def count_features(gfile):
    genbank_object=SeqIO.read(gfile,"gb")
    features=genbank_object.features
    feature_types=[feature.type for feature in features]
    feature_count=Counter(feature_types)
    print('features have been counted')
    
    dataframe=pd.DataFrame(feature_count.items(),columns=['Feature','Count'])
    
    directory,filename=os.path.split(gfile)
    filename=filename.strip('.gb')
    
    basedir='/home/kobina/Desktop'
    
    outputfile='%s/%s.csv'%(basedir,filename)
    
    dataframe.to_csv(outputfile,index=False)
    
    print('Count data has been saved')

In [20]:
for gfile in gfiles:
    count_features(gfile)

features have been counted
Count data has been saved
features have been counted
Count data has been saved
features have been counted
Count data has been saved
features have been counted
Count data has been saved
features have been counted
Count data has been saved


### Count features in multiple genbank files (Part 2)

 **Activities**
- Download genbank files
- Put all files in the same folder/directory
- Count features using biopython
- Combine all results into a single dataframe and save to an single output file

**Libraries**
- Biopython   (*pip install biopython --user*)
- Pandas      (*pip install pandas --user*)

In [23]:
import glob
from Bio import SeqIO
from collections import Counter
import pandas as pd
import os

In [24]:
file_directory="/home/kobina/Desktop/sequences"

In [27]:
gfiles=glob.glob("%s/*.gb"%file_directory)

In [28]:
gfiles

['/home/kobina/Desktop/sequences/V521.gb',
 '/home/kobina/Desktop/sequences/AR465.gb',
 '/home/kobina/Desktop/sequences/R50.gb',
 '/home/kobina/Desktop/sequences/P10.gb',
 '/home/kobina/Desktop/sequences/M48.gb']

In [29]:
print(len(gfiles))

5


In [30]:
gfiles[0]

'/home/kobina/Desktop/sequences/V521.gb'

In [31]:
def read_file(gfile):
    genbank_object=SeqIO.read(gfile,'gb')
    features=genbank_object.features
    feature_types=[feature.type for feature in features]
    return feature_types

In [50]:
def count_features(feature_types):
    feature_count=Counter(feature_types)
    print('features have been counted')
    return feature_count

In [32]:
def scan_all_features(files):
    allfeatures=[]
    for gfile in gfiles:
        feature_types=read_file(gfile)
        allfeatures.extend(feature_types)

    allfeatures=set(allfeatures)
    allfeatures=list(allfeatures)
    print('all features have been identified')
    return allfeatures

In [35]:
allfeatures=scan_all_features(gfiles)

all features have been identified


In [36]:
print(allfeatures)

['source', 'CDS', 'gene', 'ncRNA', 'tmRNA', 'regulatory', 'tRNA', 'misc_binding', 'rRNA']


In [61]:
allfeatures

['source',
 'CDS',
 'gene',
 'ncRNA',
 'tmRNA',
 'regulatory',
 'tRNA',
 'misc_binding',
 'rRNA']

In [56]:
allfeature_count=[]

In [57]:
for gfile in gfiles:
    directory,filename=os.path.split(gfile)
    filename=filename.strip('.gb')
    feature_types=read_file(gfile)
    feature_count=count_features(feature_types)
    temp_count=[]

    temp_count.append(filename)
   
    for feature in allfeatures:
        if feature in feature_count.keys():
            temp_count.append(feature_count[feature])
        else:
            temp_count.append(0)
    allfeature_count.append(temp_count)

features have been counted
features have been counted
features have been counted
features have been counted
features have been counted


In [58]:
len(allfeature_count)


5

In [59]:
allfeature_count[0]

['V521', 1, 3114, 3194, 3, 1, 0, 60, 10, 16]

In [62]:
print(allfeatures)

['source', 'CDS', 'gene', 'ncRNA', 'tmRNA', 'regulatory', 'tRNA', 'misc_binding', 'rRNA']


In [63]:
allfeature_count

[['V521', 1, 3114, 3194, 3, 1, 0, 60, 10, 16],
 ['AR465', 1, 2740, 2819, 0, 0, 0, 60, 0, 19],
 ['R50', 1, 2935, 3008, 3, 1, 10, 59, 0, 10],
 ['P10', 1, 3179, 3250, 3, 1, 10, 59, 0, 8],
 ['M48', 1, 3171, 3251, 3, 1, 10, 60, 0, 16]]

In [64]:
columns=[]
columns.append('File')
columns.extend(allfeatures)

In [65]:
columns

['File',
 'source',
 'CDS',
 'gene',
 'ncRNA',
 'tmRNA',
 'regulatory',
 'tRNA',
 'misc_binding',
 'rRNA']

In [66]:
dataframe=pd.DataFrame(allfeature_count,columns=columns)

In [67]:
print(dataframe)

    File  source   CDS  gene  ncRNA  tmRNA  regulatory  tRNA  misc_binding  \
0   V521       1  3114  3194      3      1           0    60            10   
1  AR465       1  2740  2819      0      0           0    60             0   
2    R50       1  2935  3008      3      1          10    59             0   
3    P10       1  3179  3250      3      1          10    59             0   
4    M48       1  3171  3251      3      1          10    60             0   

   rRNA  
0    16  
1    19  
2    10  
3     8  
4    16  


In [68]:
del dataframe['regulatory']

In [69]:
del dataframe['misc_binding']

In [70]:
print(dataframe)

    File  source   CDS  gene  ncRNA  tmRNA  tRNA  rRNA
0   V521       1  3114  3194      3      1    60    16
1  AR465       1  2740  2819      0      0    60    19
2    R50       1  2935  3008      3      1    59    10
3    P10       1  3179  3250      3      1    59     8
4    M48       1  3171  3251      3      1    60    16


In [71]:
dataframe.set_index('File',inplace=True)

In [72]:
dataframe

Unnamed: 0_level_0,source,CDS,gene,ncRNA,tmRNA,tRNA,rRNA
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
V521,1,3114,3194,3,1,60,16
AR465,1,2740,2819,0,0,60,19
R50,1,2935,3008,3,1,59,10
P10,1,3179,3250,3,1,59,8
M48,1,3171,3251,3,1,60,16


In [73]:
dataframe.loc['V521',:]

source       1
CDS       3114
gene      3194
ncRNA        3
tmRNA        1
tRNA        60
rRNA        16
Name: V521, dtype: int64

In [74]:
dataframe.loc['V521','CDS']

3114

In [75]:
outputfile='/home/kobina/Desktop/featurecount.csv'

In [76]:
dataframe.to_csv(outputfile,index=True)