# Sequence Alignment and Splitting the Data into Training and Testing Sets

In [1]:
## 1. Perform the sequence alignment
## 2. Check the sequence alignment against the alignment from Clustal Omega -> skipped in this file
## 3. Randomly split the set into training and testing sets with a ratio of 80:20

In [2]:
## Input (taken from Data Processing):
gene_original_file = "./OXTR_bonyFish_edited.fasta"

## Outputs:
# every sequence scaled to the same length
scale_file = "./OXTR_bonyFish_scaled.fasta"
# the aligned sequences
aligned_sequence_file = "./OXTR_bonyFish_aligned.fasta"
# Clustal Omega alignment used for checking (disabled in this file)
# aligned_sequence_clustal_omega_file = "./OXTR_primate_co.fasta" #Aligned sequence from Clustal Omega for checking
# training data and testing data
train_file, test_file = "./OXTR_bonyFish_train.fasta", "./OXTR_bonyFish_test.fasta"

## The default original file is the input file
original_file = gene_original_file

In [3]:
#libs
import pandas as pd
from Bio import AlignIO
from Bio import SeqIO
from Bio import Phylo
from sklearn.model_selection import train_test_split
from random import randrange

## Scale the data prior to sequence alignment
## All sequences are scaled to the maximum sequence length, and the empty positions are filled with "-"

In [4]:
# Scale every sequence of the input FASTA file to the length of the longest
# one by appending "-" characters, and write the result to scale_file.

# Select the input/output explicitly *before* the first read, so that this
# cell does not depend on whatever value a later cell left in original_file.
original_file = gene_original_file
corrected_file = scale_file

# Pass 1: length of the longest sequence (0 when the file holds no records).
with open(original_file) as handle:
    maxlen = max((len(record.seq) for record in SeqIO.parse(handle, "fasta")), default=0)
print("Max sequence length: ", maxlen)

# Pass 2: pad each record up to maxlen and write it out.
# The already-open handle is parsed instead of re-opening the file by name.
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    for record in SeqIO.parse(original, 'fasta'):
        shortfall = maxlen - len(record.seq)
        if shortfall > 0:
            # Append all missing "-" characters in one step
            record.seq = record.seq + "-" * shortfall
        SeqIO.write(record, corrected, 'fasta')

Max sequence length:  393


In [5]:
## Sequence Alignment
# Read the scaled FASTA file as a single alignment. The file is opened in a
# `with` block so the handle is closed right after reading; the cleanup cell
# later deletes scale_file, which can fail on some platforms while a handle
# is still open.
with open(scale_file) as handle:
    alignment = AlignIO.read(handle, "fasta")

In [6]:
## Output the alignment file

# Sequences of the alignment, in alignment order
sequences = [record.seq for record in alignment]

# Count the sequences of the original input file and find the longest one
maxlen = 0
total_sequence = 0
with open(gene_original_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        total_sequence += 1
        maxlen = max(maxlen, len(record.seq))

original_file = scale_file
corrected_file = aligned_sequence_file

# Map id -> aligned sequence once instead of scanning the whole alignment for
# every record; for repeated ids the last alignment entry wins, exactly as
# with the former nested loop.
aligned_seq_by_id = {align.id: align.seq for align in alignment}

# Copy every record of the scaled file, replacing its sequence by the aligned
# one when the id is part of the alignment. The opened handle is parsed
# directly instead of re-opening the file by name.
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    for record in SeqIO.parse(original, 'fasta'):
        if record.id in aligned_seq_by_id:
            record.seq = aligned_seq_by_id[record.id]
        SeqIO.write(record, corrected, 'fasta')

### Check the alignment tool to make sure the program functions correctly
### by comparing the alignment of gene_original_file with the Clustal Omega file

In [7]:
## Skipped in this file since this was already checked in the previous work
# count_correct = total_sequence
# control = 0
# print("Number of total sequence check:")
# print(count_correct)
# with open(aligned_sequence_clustal_omega_file) as handle:
#     for record in SeqIO.parse(handle, "fasta"):
#         print("Sequence id check:")
#         print(record.id)
#         for alig in alignment:
#             if ((record.seq == alig.seq) and (record.id == alig.id)):
#                 count_correct = count_correct - 1
#                 print(count_correct)
#                 control = 1
#                 break
#         print("Result:")
#         if (control == 1):
#             print("Matching!")
#         else:
#             print("Not Matching!")
#         control = 0
        
# if (count_correct == 0):
#     print("All of the alignment matched!")
# else:
#     print(count_correct, " of the alignment does not match")

In [8]:
# Split the sequence ids into a training part and a testing part (20 % testing)
random = randrange(100)  # seed for the split, drawn anew on every run
sequences_edit = [entry.id for entry in alignment]

X = sequences_edit
X_train, X_test = train_test_split(X, random_state=random, test_size=0.2)
print("random seed: ", random)

random seed:  38


In [9]:
# Write the records whose id is in X_train to the training FASTA file
print("training data:")
check = []
original_file = aligned_sequence_file
corrected_file = train_file
# Set membership replaces the linear scan over X_train for every record
train_ids = set(X_train)
# The opened handle is parsed directly instead of re-opening the file by name
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    for record in SeqIO.parse(original, 'fasta'):
        if record.id in train_ids:
            SeqIO.write(record, corrected, 'fasta')
            check.append(record)
            print(record)

training data:
ID: XP_031613004.1
Name: XP_031613004.1
Description: XP_031613004.1 oxytocin receptor [Oreochromis aureus]
Number of features: 0
Seq('MESISNESDIWQFNESWRNSSLINGTGGLNQTNPLKRNEEVARVEVTVLALVLF...STT')
ID: XP_017271415.1
Name: XP_017271415.1
Description: XP_017271415.1 oxytocin receptor [Kryptolebias marmoratus]
Number of features: 0
Seq('MESVSGDGDMWPFNESWRNSSLLNGTADWNQTNPLKRNEEVAKVEVTVLALVLF...STT')
ID: XP_012719163.1
Name: XP_012719163.1
Description: XP_012719163.1 oxytocin receptor [Fundulus heteroclitus]
Number of features: 0
Seq('METISNESDIWQLNEPWRNSTLLNGTIGVNQTNPLKRNEEVAKVEVTVLALVLF...---')
ID: XP_031143641.1
Name: XP_031143641.1
Description: XP_031143641.1 oxytocin receptor [Sander lucioperca]
Number of features: 0
Seq('MESISNESEFWQVNESWQNPSLVNGTGLLNQTNPLKRNEEVAKVEVTVLALVLF...STT')
ID: XP_023136471.1
Name: XP_023136471.1
Description: XP_023136471.1 oxytocin receptor [Amphiprion ocellaris]
Number of features: 0
Seq('MESTSSLINGTGGLNQTNPLKRNEEVAKVEVTVLALVLFLALAGNLCVLLAIHT

In [10]:
# Write the records whose id is in X_test to the testing FASTA file
print("testing data:")
check = []
original_file = aligned_sequence_file
corrected_file = test_file
# Set membership replaces the linear scan over X_test for every record
test_ids = set(X_test)
# The opened handle is parsed directly instead of re-opening the file by name
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    for record in SeqIO.parse(original, 'fasta'):
        if record.id in test_ids:
            SeqIO.write(record, corrected, 'fasta')
            check.append(record)
            print(record)

testing data:
ID: NP_001186299.1
Name: NP_001186299.1
Description: NP_001186299.1 oxytocin receptor [Danio rerio]
Number of features: 0
Seq('MEDIFKDQDFWSFNESSRNATNETYGVNQTVNPLKRNEEVAKVEVTVLALVLFL...---')
ID: NP_001243561.1
Name: NP_001243561.1
Description: NP_001243561.1 oxytocin receptor [Oryzias latipes]
Number of features: 0
Seq('MEIISNESEIWQFNGSWRNSSLGNGTGALNQTNPLKRNEEVAKVEVTVLALVLF...STT')
ID: XP_033482864.1
Name: XP_033482864.1
Description: XP_033482864.1 oxytocin receptor [Epinephelus lanceolatus]
Number of features: 0
Seq('MESISNDSDFWQFNESWRNSSLGNGTNWLNQTNPLKRNEEVAKVEVTVLALVLF...STT')
ID: XP_030277370.1
Name: XP_030277370.1
Description: XP_030277370.1 oxytocin receptor [Sparus aurata]
Number of features: 0
Seq('MESVSNESDFWQSNESWRNSSLVNGTGWLNQTNPLKRNEEVAKVEVTVLALVLF...STT')
ID: XP_028431436.1
Name: XP_028431436.1
Description: XP_028431436.1 oxytocin receptor [Perca flavescens]
Number of features: 0
Seq('MESISNKSEFWQFNESWQNPSLINGTGLLNQTNPLKRNEEVAKVEVTVLALVLF...STT')


In [11]:
## Remove unwanted files
## Comment out this cell to keep the scaled file
import os
# Only delete the file when it is present, so that re-running this cell
# (after the file is already gone) does not raise FileNotFoundError.
if os.path.exists(scale_file):
    os.remove(scale_file)