# Retrieve a sequence from UniProt

In [None]:
import requests

# UniProt accession that the annotation and sequence requests below are built from.
accession = 'Q9BV79'

## Annotations

In [None]:
# Download the UniProt ".txt" entry for `accession`.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error so that later cells never
# parse an error page as if it were an entry.
req_data = requests.get('http://www.uniprot.org/uniprot/' + accession + '.txt', timeout=30)
req_data.raise_for_status()

In [None]:
# Display the body of the downloaded entry as one string.
req_data.text

In [None]:
# Display the same body as a list with one element per '\n'-separated line.
req_data.text.split('\n')

## Structures available?

In [None]:
# Print every line of the entry that contains the marker 'PDB;'
# (presumably the cross-references to PDB structures).
pdb_lines = [entry_line for entry_line in req_data.text.split('\n') if 'PDB;' in entry_line]
for pdb_line in pdb_lines:
    print(pdb_line)

In [None]:
# From each line containing 'PDB;', keep the second ';'-separated field
# (whitespace stripped) and show the resulting list.
structdata = [
    entry_line.split(';')[1].strip()
    for entry_line in req_data.text.split('\n')
    if 'PDB;' in entry_line
]
print(structdata)

## Sequence

In [None]:
# Download the FASTA record for `accession` and print it.
# The timeout avoids an indefinite hang; raise_for_status() makes an HTTP
# error fail here instead of letting an error body be used as a sequence.
req_seq = requests.get('http://www.uniprot.org/uniprot/' + accession + '.fasta', timeout=30)
req_seq.raise_for_status()
print(req_seq.text)

## Multiple proteins

In [None]:
# Accessions that the next cell checks one by one for 'PDB;' lines.
accessions = [
    'P97584', 'P00328', 'P19096', 'S0DRI1', 'A2R6H1', 'Q29073',
    'P34055', 'Q24K16', 'W7LKX1', 'P38230', 'Q4W4Z2', 'W7MT31',
    'Q64413', 'Q9Z2M2', 'P00327', 'P49327', 'P12785', 'Q9SLN8',
    'A0A0D2YG10', 'P79896', 'P26646', 'P0DN30', 'F2Z678', 'Q9P6C8',
    'O57380', 'P22797',
]

In [None]:
# For every accession in `accessions`, download its ".txt" entry and print the
# accession followed by a comma-separated list of the codes found on its
# 'PDB;' lines. Accessions without such lines print nothing.
for accession in accessions:
    req_data = requests.get('http://www.uniprot.org/uniprot/' + accession + '.txt', timeout=30)
    if not req_data.ok:
        # Without this check a failed download would be indistinguishable
        # from an entry that simply has no structures.
        print('Could not retrieve ' + accession + ' (HTTP ' + str(req_data.status_code) + ')')
        print()
        continue
    structdata = [line for line in req_data.text.split('\n') if 'PDB;' in line]
    codes = [column.split(';')[1].strip() for column in structdata]
    if codes:
        print(accession)
        print(','.join(codes))
        print()

# BLAST

In [None]:
# Use the body of the `req_seq` response as the BLAST query and show it.
# NOTE(review): `req_seq` is reassigned in a later cell, so re-running this
# cell out of order would pick up a different record.
query_fasta = req_seq.text
print(query_fasta)

In [None]:
from Bio.Blast import NCBIWWW, NCBIXML

# Run a "blastp" search of `query_fasta` against "swissprot" through
# NCBIWWW.qblast, then parse the returned handle into a single record.
result_handle = NCBIWWW.qblast("blastp", "swissprot", query_fasta)
blast_record = NCBIXML.read(result_handle)

In [None]:
# Print title, length and e-value once for every HSP of every alignment.
for hit in blast_record.alignments:
    hit_title = hit.title
    hit_length = hit.length
    for hsp in hit.hsps:
        print('****Alignment****')
        print('sequence:', hit_title)
        print('length:', hit_length)
        print('e value:', hsp.expect)

In [None]:
# Print the alignment title for each HSP whose e-value is below 10**-20.
for hit in blast_record.alignments:
    for hsp in hit.hsps:
        if not (hsp.expect < 10**-20):
            continue
        print(hit.title)

In [None]:
# Same filter as before, but print only the fourth '|'-separated field of the title.
for hit in blast_record.alignments:
    title_fields = hit.title.split('|')
    for hsp in hit.hsps:
        if hsp.expect < 10**-20:
            print(title_fields[3])

In [None]:
# Same filter again; print the fourth title field cut off at its first '.'.
for hit in blast_record.alignments:
    for hsp in hit.hsps:
        if hsp.expect < 10**-20:
            versioned_id = hit.title.split('|')[3]
            dot_at = versioned_id.index('.')
            print(versioned_id[:dot_at])

In [None]:
# Collect the accession (fourth '|'-separated title field, cut at the first
# '.') of every alignment that has at least one HSP with e-value < 10**-20.
blast_accessions = []

for alignment in blast_record.alignments:
    # An alignment can carry several HSPs under the cutoff. Testing with
    # any() records it once instead of once per HSP, so later cells do not
    # fetch and write the same sequence repeatedly.
    if any(hsp.expect < 10**-20 for hsp in alignment.hsps):
        raw_accession = alignment.title.split('|')[3]
        # partition() also copes with an identifier that has no '.' suffix,
        # where index('.') would raise ValueError.
        accession_id = raw_accession.partition('.')[0]
        if accession_id not in blast_accessions:
            blast_accessions.append(accession_id)

print(blast_accessions)

In [None]:
# For every accession in `blast_accessions`, download its ".txt" entry and
# print the accession followed by a comma-separated list of the codes found
# on its 'PDB;' lines. Accessions without such lines print nothing.
for accession in blast_accessions:
    req_data = requests.get('http://www.uniprot.org/uniprot/' + accession + '.txt', timeout=30)
    if not req_data.ok:
        # Without this check a failed download would be indistinguishable
        # from an entry that simply has no structures.
        print('Could not retrieve ' + accession + ' (HTTP ' + str(req_data.status_code) + ')')
        print()
        continue
    structdata = [line for line in req_data.text.split('\n') if 'PDB;' in line]
    codes = [column.split(';')[1].strip() for column in structdata]
    if codes:
        print(accession)
        print(','.join(codes))
        print()

# Multiple sequence alignment

## Get sequences

In [None]:
# Download the FASTA record of every accession in `blast_accessions`, keep the
# response bodies in `fasta_sequences`, then print them.
fasta_sequences = []
for accession in blast_accessions:
    req_seq = requests.get('http://www.uniprot.org/uniprot/' + accession + '.fasta', timeout=30)
    # Fail loudly on an HTTP error: an error body appended here would end up
    # in the FASTA file that is written from this list.
    req_seq.raise_for_status()
    fasta_sequences.append(req_seq.text)

for sequence in fasta_sequences:
    print(sequence)

In [None]:
# Write all downloaded FASTA records to blast_hits.fasta.
# The with-block closes (and therefore flushes) the file before the next cell
# reads it; the original left the handle open, so buffered data might not yet
# have been on disk.
with open('blast_hits.fasta', 'w') as outfile:
    outfile.writelines(fasta_sequences)

### Make multiple sequence alignment in Clustal Omega

In [None]:
!clustalo -i blast_hits.fasta -o blast_hits.cali

### Continue with analysis

In [None]:
from Bio import AlignIO
# Load the alignment file written by the clustalo cell, parsing it as "fasta".
alignment = AlignIO.read("blast_hits.cali", "fasta")

In [None]:
# Show the first 100 alignment columns of every row.
for record in alignment:
    leading_columns = record.seq[:100]
    print(leading_columns)

In [None]:
from Bio.Align import AlignInfo

# Wrap the alignment in a SummaryInfo object; the consensus and PSSM cells
# below are computed from it.
summary_align = AlignInfo.SummaryInfo(alignment)

## Consensus sequence

In [None]:
# Display the consensus computed by SummaryInfo with its default settings.
# NOTE(review): dumb_consensus() appears to be deprecated in recent Biopython
# releases — confirm against the installed version.
summary_align.dumb_consensus()

## PSSM

In [None]:
# Build the position-specific score matrix from the alignment summary and
# print it.
pssm = summary_align.pos_specific_score_matrix()

print(pssm)

In [None]:
# For each PSSM position, print the largest value divided by the number of
# rows in the alignment.
n_sequences = len(alignment)
for column_counts in pssm:
    top_count = max(column_counts.values())
    print(top_count / n_sequences)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


# Per-position maximum PSSM value divided by the number of alignment rows,
# plotted along the alignment.
n_sequences = len(alignment)
values = [max(column_counts.values()) / n_sequences for column_counts in pssm]

plt.plot(values)

# Assignment 1

## Handle sequences and perform alignment

In [None]:
import requests

# Download the reference FASTA record P08686 (named CYP21 in this notebook).
# The timeout avoids an indefinite hang and raise_for_status() prevents an
# HTTP error body from being written out as if it were a sequence.
cyp21_response = requests.get('http://www.uniprot.org/uniprot/P08686.fasta', timeout=30)
cyp21_response.raise_for_status()
cyp21_seq = cyp21_response.text

# Sequence to compare against the reference, as a FASTA record.
mut_seq = '''>cyp21a2_translated_mut
MLLLGLLLLLPLLAGARLLWNWWKLRSLHLPPLAPGFLHLLQPDLPIYLLGLTQKFGPIY
RLHLGLQDVVVLNSKRTIEEAMVKKWADFAGRPEPLTYRLVSKNYPDLSLGDYSLLWKAH
KKLTRSALLLGIRDSMEPVVEQLTQEFCERMRAQPGTPVAIEEEFSLLTCSIICYLTFGD
KIKDDNLMPAYYKCIQEVLKTWSHWSIQIVDVIPFLRFFPNPGLRRLKQAIEKRDHIVEM
QLRQHKESLVAGQWRDMMDYMLQGVAQPSMEEGSGQLLEGHVHMAAVDLLISGTETTANT
LSWAVVFLLHHPEIQQRLQEELDHELGPGASSSRVPYKDRARLPLLNATIAEVLRLRPVV
PLALPHRTTRPSSISGYDIPEGTVIIPNLQGAHLDETVWERPHEFWPDRFLEPGKNSRAL
AFGCGARVRLGEPLARLELFVVLTRLLQAFTLLPSGDALPSLQPLPHCSVILKMQPFQVR
LQPRGMGAHSPGQNQ'''

# Write both records into one file for the aligner.
with open('cyp21_comparison.fasta', 'w') as outfile:
    outfile.write(cyp21_seq)
    # Guarantee the second record's '>' header starts on its own line even if
    # the downloaded text does not end with a newline.
    if not cyp21_seq.endswith('\n'):
        outfile.write('\n')
    outfile.write(mut_seq)
    # Terminate the last sequence line.
    outfile.write('\n')

In [None]:
! kalign cyp21_comparison.fasta > cyp21_comparison.kali 2> /dev/null

## Find mutations

In [None]:
from Bio import AlignIO
# Read the pairwise alignment and walk through it column by column, recording
# every column where the two rows differ. Positions are numbered along the
# first row, counting only its non-gap characters.
alignment = AlignIO.read("cyp21_comparison.kali", "fasta")
reference_row = str(alignment[0].seq)
mutant_row = str(alignment[1].seq)

print('Pos\tCYP21\tMut')
muts = []
prot_pos = 0  # non-gap characters of the first row consumed so far
for ref_residue, mut_residue in zip(reference_row, mutant_row):
    if ref_residue != mut_residue:
        difference = [str(prot_pos + 1), ref_residue, mut_residue]
        muts.append(difference)
        print('\t'.join(difference))
    if ref_residue != '-':
        prot_pos += 1


## Look for information in UniProt

In [None]:
# Download the ".txt" entry for P08686 and print the feature-table (FT) records
# whose start or end coordinate equals one of the positions found in `muts`.
cyp21_response = requests.get('http://www.uniprot.org/uniprot/P08686.txt', timeout=30)
cyp21_response.raise_for_status()
cyp21_data = cyp21_response.text

positions = [mut[0] for mut in muts]
# Set lookup also prevents a line from being printed once per duplicate position.
wanted_positions = set(positions)


def _feature_endpoints(ft_line):
    """Return the coordinates named on an FT header line as a set of strings.

    Handles a location written as one token ("31", "1..494", optionally with
    an "ISOFORM:" prefix or '<', '>', '?' markers) as well as separate
    from/to columns ("31     31   description").
    """
    endpoints = set()
    # Token 0 is the feature key; the next one or two tokens are the location.
    for token in ft_line[5:].split()[1:3]:
        location = token.rsplit(':', 1)[-1]
        for part in location.split('..'):
            endpoints.add(part.strip('<>?'))
    return endpoints


show_feature = False
for line in cyp21_data.split('\n'):
    if line[:2] != 'FT':  # Feature Table lines only
        continue
    # A header line has the feature key starting right after the "FT   "
    # prefix; lines that are blank there continue the previous feature.
    if line[5:6].strip():
        show_feature = bool(wanted_positions & _feature_endpoints(line))
    # The description may sit on continuation lines, so print them together
    # with a matching header.
    if show_feature:
        print(line)