# Solution to exercise 2.3.1

- Read the FASTA file named `data/sample.fa` using Biopython and answer the following questions:
  - How many sequences are in the file?
  - What are the IDs and the lengths of the longest and the shortest sequences?
  - Select sequences longer than 500bp. What is the average length of these sequences?
  - Calculate and print the percentage of GC in each of the sequences.
  - Write the selected sequences (those longer than 500 bp) into a FASTA file named `long_sequences.fa`

In [None]:
from Bio import SeqIO

# Load every record of the FASTA file into a list so that the records can be
# counted here and reused afterwards
fasta_path = '../data/sample.fa'
seq_records = list(SeqIO.parse(fasta_path, 'fasta'))

# The number of sequences in the file is the number of parsed records
num_seq = len(seq_records)
print('Total number of sequences:', num_seq)

In [None]:
# find IDs and lengths of the longest and the shortest sequences

# Create a Pandas dataframe for storing the SeqRecord objects, their IDs and
# their sequences (as plain strings)
import pandas

seq_ids = [record.id for record in seq_records]
seq_seqs = [str(record.seq) for record in seq_records]
seq_objs = list(seq_records)

seq_df = pandas.DataFrame({"id": seq_ids, "seq": seq_seqs, 'seqobj': seq_objs})

# Calculate the length of each sequence
seq_df['len'] = seq_df['seq'].apply(len)

# Find shortest and longest sequence ids
if seq_df.empty:
    # An empty FASTA file has no shortest/longest record to report
    print('No sequences found, so there is no longest or shortest sequence')
else:
    # idxmin/idxmax pick the row label of the extreme value directly (first
    # occurrence on ties), so the frame does not have to be sorted twice
    shortest = seq_df.loc[seq_df['len'].idxmin()]
    longest = seq_df.loc[seq_df['len'].idxmax()]
    print('Longest sequence is', longest['id'], 'with length', longest['len'], 'bp')
    print('Shortest sequence is', shortest['id'], 'with length', shortest['len'], 'bp')

In [None]:
# Preview the first five rows of the sequence table
print(seq_df.head(n=5))

In [None]:
# Calculate the average length of sequences longer than 500 bp
# Calculate and print the percentage of GC content

# Bio.SeqUtils.GC was deprecated in Biopython 1.80 and removed in 1.82;
# gc_fraction is its replacement. Fall back to GC on older installations.
try:
    from Bio.SeqUtils import gc_fraction
except ImportError:
    from Bio.SeqUtils import GC as gc_percent
else:
    def gc_percent(sequence):
        """Return the GC content of *sequence* as a percentage (0-100)."""
        # ambiguous="ignore" counts G, C and S over the full sequence length,
        # like the old GC() function did; gc_fraction returns a fraction
        # between 0 and 1, hence the factor 100.
        return 100 * gc_fraction(sequence, ambiguous="ignore")

# Calculate GC content (in percent) of every sequence
seq_df['gc'] = seq_df['seq'].apply(gc_percent)

# Filter sequences longer than 500 bp
long_seq_df = seq_df[seq_df['len'] > 500]

print('Average length for sequences longer than 500bp is {}'.format(long_seq_df['len'].mean()))
print(long_seq_df[['id', 'gc']])

In [None]:
# Write the SeqRecord objects of the sequences longer than 500 bp (the
# 'seqobj' column of long_seq_df) to a file in FASTA format.
# The column is converted to a plain list because SeqIO.write expects a list
# or iterator of SeqRecord objects.
SeqIO.write(long_seq_df['seqobj'].tolist(), 'long_sequences.fa', 'fasta')