# Data processing

#### 1. Check that all sequences are unique, and output a file containing only the unique sequences
#### 2. Keep only sequences whose description is "oxytocin receptor", and output a file containing only those sequences
#### 3. If there are more than 25 sequences in the file, output a file with only 25 sequences

In [1]:
## Input:
gene_original_file = "./OXTR_bonyFish.fasta"  # FASTA file the pipeline starts from
gene_name = "oxytocin receptor"  # text a record description must contain to be kept
max_number_sequences = 25  # upper bound on the number of records in the final file
max_wanted_length = 400  # records are kept only when strictly shorter than this
## Output:
unique_id_file = "./OXTR_bonyFish_unique_id.fasta"
unique_file = "./OXTR_bonyFish_unique.fasta"
gene_name_only_file = "./OXTR_bonyFish_only.fasta"
# Spelling fixed ("boneyFish" -> "bonyFish") so it matches the other file names
under_max_length_sequence_file = "./OXTR_bonyFish_under_max.fasta"
max_number_sequences_file = "./OXTR_bonyFish_edited.fasta" #Final output file
## Set default original file
original_file = gene_original_file

#### Every block of code sets its own file names to help the reader keep track of the files more easily
#### The code tries to parse each FASTA file without having to hold it all in memory
#### For fast input and output, please use the code from "clade process"

### Support library

In [2]:
#libs
from Bio import SeqIO

### 1. Sequence uniqueness check

In [3]:
## Scan the input file once for the longest sequence and the record count
maxlen = 0  ## Longest sequence length seen so far
total_sequence = 0  ## Number of records read
with open(original_file) as handle:
    for total_sequence, record in enumerate(SeqIO.parse(handle, "fasta"), start=1):
        maxlen = max(maxlen, len(record.seq))
print("The maximum length of the sequence: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

The maximum length of the sequence:  947
The numbers of the sequence:  216


In [4]:
## Duplicate detection helper
def duplicate_check(a):
    """Return True when `a` holds at least one repeated item, otherwise False."""
    distinct_count = len(set(a))
    return distinct_count != len(a)

In [5]:
## Collect every record ID, then report whether any ID occurs more than once
sequences = []  ## IDs of all records, in file order
total_sequence = 0  ## Number of records read
with open(original_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequences += [record.id]
        total_sequence += 1

print("There are duplicates" if duplicate_check(sequences)
      else "No duplicate, all sequences in the file are unique")

No duplicate, all sequences in the file are unique


In [6]:
## Here is the code to get the duplicate sequence
def duplicate_get(input_list, duplicate_list):
    """Append to `duplicate_list` every item of `input_list` that occurs more than once.

    Each repeated item is appended once per occurrence, in the order it
    appears in `input_list`. `duplicate_list` is modified in place and
    nothing is returned.
    """
    from collections import Counter

    try:
        # Count all items in one pass instead of calling list.count per item
        occurrences = Counter(input_list)
    except TypeError:
        # Unhashable items cannot be counted with Counter; fall back to list.count
        repeated = [item for item in input_list if input_list.count(item) > 1]
    else:
        repeated = [item for item in input_list if occurrences[item] > 1]
    # Build the result first and extend afterwards, so the call also terminates
    # when both arguments are the same list
    duplicate_list.extend(repeated)

In [7]:
## Gather the repeated IDs, then collapse them to one entry per ID
duplicate_sequences = list()
duplicate_get(sequences, duplicate_sequences)
duplicate_sequences = [*set(duplicate_sequences)]
print("ID of Sequence that has duplicates: ", duplicate_sequences, sep="\n")

ID of Sequence that has duplicates: 
[]


In [8]:
## Take out duplicates and keep the first occurrence of each ID, in file order
## dict.fromkeys preserves insertion order and drops repeats in a single pass,
## replacing a side-effect list comprehension with a quadratic "not in" test
unique_sequences = list(dict.fromkeys(sequences))
print("ID of unique sequence: ")
print(unique_sequences)

ID of unique sequence: 
['NP_001186299.1', 'XP_031613004.1', 'XP_017271415.1', 'NP_001186298.1', 'XP_021466000.1', 'XP_021465666.1', 'XP_021424878.1', 'XP_021424474.1', 'XP_012719163.1', 'XP_031143641.1', 'XP_023136471.1', 'XP_006806769.1', 'XP_026853410.1', 'NP_001243561.1', 'XP_034169927.1', 'XP_026774982.1', 'XP_026774972.1', 'XP_019910791.1', 'XP_010896257.1', 'XP_010879624.1', 'XP_034079709.1', 'XP_034022503.1', 'XP_033939362.1', 'XP_033939302.1', 'XP_033994521.1', 'XP_033994465.1', 'XP_033822329.1', 'sp|Q90334.1|ITR_CATCO', 'XP_033482864.1', 'XP_033482863.1', 'XP_033481840.1', 'TMS19138.1', 'KAF4112490.1', 'KAF4112433.1', 'KAF4079174.1', 'XP_026205203.1', 'NP_001281111.1', 'XP_026163763.1', 'XP_026163035.1', 'KAF3856231.1', 'KAF3689821.1', 'XP_032406385.1', 'XP_032406286.1', 'XP_032370324.1', 'KAF0034964.1', 'XP_031711301.1', 'XP_020337837.1', 'XP_020337813.1', 'XP_020356088.1', 'XP_020352525.1', 'XP_031424296.1', 'XP_012680256.2', 'XP_012688667.1', 'KAE8298606.1', 'KAE8298476.1'

In [9]:
### Script to take out all duplicate sequences and write a new file with all unique sequences by ID
### The input is read in a single pass: the first record carrying each wanted ID
### is written, later records with the same ID are skipped.
original_file = gene_original_file
corrected_file = unique_id_file
pending_ids = set(unique_sequences)  # IDs that still have to be written
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the "with" block closes the file
    for record in SeqIO.parse(original, 'fasta'):
        if record.id in pending_ids:
            pending_ids.remove(record.id)
            SeqIO.write(record, corrected, 'fasta')

In [10]:
## Description line of every record in the unique-ID file, in file order
sequences_unique_id = []
with open(unique_id_file) as handle:
    sequences_unique_id.extend(entry.description for entry in SeqIO.parse(handle, "fasta"))

In [11]:
def get_name(listI, listO):
    """Append to `listO` the text between the first "[" and the next "]" of each entry.

    Entries of `listI` that contain no "[" are skipped. When no "]" follows
    the "[", everything after the "[" is appended.
    """
    for entry in listI:
        _, bracket, after_open = entry.partition("[")
        if not bracket:
            continue
        listO.append(after_open.partition("]")[0])

In [12]:
## Bracketed text of every description that contains a "["
sequence_species = list()
get_name(listI=sequences_unique_id, listO=sequence_species)

In [13]:
## Keep the first occurrence of each species name, in order of appearance
## dict.fromkeys preserves insertion order and drops repeats in a single pass,
## replacing a side-effect list comprehension with a quadratic "not in" test
unique_sequences_species = list(dict.fromkeys(sequence_species))
print("name of unique sequence: ")
print(unique_sequences_species)

name of unique sequence: 
['Danio rerio', 'Oreochromis aureus', 'Kryptolebias marmoratus', 'Oncorhynchus mykiss', 'Fundulus heteroclitus', 'Sander lucioperca', 'Amphiprion ocellaris', 'Neolamprologus brichardi', 'Electrophorus electricus', 'Oryzias latipes', 'Pangasianodon hypophthalmus', 'Esox lucius', 'Gymnodraco acuticeps', 'Thalassophryne amazonica', 'Pseudochaenichthys georgianus', 'Trematomus bernacchii', 'Periophthalmus magnuspinnatus', 'Epinephelus lanceolatus', 'Larimichthys crocea', 'Onychostoma macrolepis', 'Ameiurus melas', 'Anabas testudineus', 'Stegastes partitus', 'Mastacembelus armatus', 'Dissostichus mawsoni', 'Channa argus', 'Xiphophorus hellerii', 'Etheostoma spectabile', 'Scophthalmus maximus', 'Anarrhichthys ocellatus', 'Oncorhynchus kisutch', 'Clupea harengus', 'Chanos chanos', 'Triplophysa tibetana', 'Archocentrus centrarchus', 'Sparus aurata', 'Gadus morhua', 'Takifugu flavidus', 'Sphaeramia orbicularis', 'Bagarius yarrelli', 'Salarias fasciatus', 'Myripristis m

In [14]:
### Script to take out all duplicate sequences and write a new file with all unique sequences by name
### The input is read in a single pass: the first record of each wanted species
### name (the text inside the first "[...]" of the description) is written.
original_file = unique_id_file
corrected_file = unique_file
pending_species = set(unique_sequences_species)  # names that still have to be written
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the "with" block closes the file
    for record in SeqIO.parse(original, 'fasta'):
        if "[" not in record.description:
            # No bracketed name: skip the record instead of comparing against a
            # name left over from an earlier record (or an undefined variable)
            continue
        species = record.description.split("[", 1)[1].split("]", 1)[0]
        if species in pending_species:
            pending_species.remove(species)
            SeqIO.write(record, corrected, 'fasta')

In [15]:
## Summarise "unique_file": longest sequence, record count and ID uniqueness
file = unique_file
maxlen = 0  ## Longest sequence length seen so far
sequence_check = []  ## IDs of all records in the file
total_sequence = 0  ## Number of records read
with open(file) as handle:
    for total_sequence, record in enumerate(SeqIO.parse(handle, "fasta"), start=1):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)
print("There are duplicates" if duplicate_check(sequence_check)
      else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  947
The numbers of the sequence:  103
No duplicate, all sequences in the file are unique


In [16]:
## Description line of every record in "unique_file", in file order
sequences_from_unique_file = []
with open(unique_file) as handle:
    sequences_from_unique_file.extend(entry.description for entry in SeqIO.parse(handle, "fasta"))

In [17]:
## Skip descriptions marked "PREDICTED"; of the rest, keep those that contain
## gene_name and whose text between the first space and the first "[" is exactly
## one character longer than gene_name
## NOTE(review): only the length of that text is compared, not the text itself — confirm this is intended
sequences_gene_name = []
for i in sequences_from_unique_file:
    if "PREDICTED" in i or gene_name not in i:
        continue
    after_id = i.split(" ", 1)[1]
    before_bracket = after_id.split("[", 1)[0]
    if len(before_bracket) == len(gene_name) + 1:
        sequences_gene_name.append(i)

In [18]:
### Script to get the gene_name only file
### The input is read in a single pass: a record is written when its description
### is one of the selected ones, and each selected description is written once.
original_file = unique_file
corrected_file = gene_name_only_file
pending_descriptions = set(sequences_gene_name)  # descriptions that still have to be written
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the "with" block closes the file
    for record in SeqIO.parse(original, 'fasta'):
        if record.description in pending_descriptions:
            pending_descriptions.remove(record.description)
            SeqIO.write(record, corrected, 'fasta')

In [19]:
## Summarise "gene_name_only_file": longest sequence, record count and ID uniqueness
file = gene_name_only_file
maxlen = 0  ## Longest sequence length seen so far
sequence_check = []  ## IDs of all records in the file
total_sequence = 0  ## Number of records read
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check += [record.id]
        total_sequence += 1
        current_length = len(record.seq)
        if current_length > maxlen:
            maxlen = current_length
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

if not duplicate_check(sequence_check):
    print("No duplicate, all sequences in the file are unique")
else:
    print("There are duplicates")

The maximum length of the unique sequence by id:  432
The numbers of the sequence:  34
No duplicate, all sequences in the file are unique


In [20]:
## Take out sequences that are not strictly shorter than the desired max length
max_length = max_wanted_length
original_file = gene_name_only_file
corrected_file = under_max_length_sequence_file
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above instead of reopening the file by name,
    # so the "with" block is what opens and closes the input
    for record in SeqIO.parse(original, 'fasta'):
        if len(record.seq) < max_length:
            SeqIO.write(record, corrected, 'fasta')

In [21]:
## Summarise "under_max_length_sequence_file": longest sequence, record count and ID uniqueness
file = under_max_length_sequence_file
maxlen = 0  ## Longest sequence length seen so far
sequence_check = []  ## IDs of all records in the file
total_sequence = 0  ## Number of records read
with open(file) as handle:
    for total_sequence, record in enumerate(SeqIO.parse(handle, "fasta"), start=1):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

print("There are duplicates" if duplicate_check(sequence_check)
      else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  393
The numbers of the sequence:  33
No duplicate, all sequences in the file are unique


In [22]:
## Take only the wanted number of sequences:
## the first max_gene_count records of the input are copied to the final file
max_gene_count = max_number_sequences
original_file = under_max_length_sequence_file
corrected_file = max_number_sequences_file
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above: the loop stops early, and a parser
    # opened from the file name would not be closed by this "with" block
    for written_count, record in enumerate(SeqIO.parse(original, 'fasta')):
        if written_count >= max_gene_count:
            break
        SeqIO.write(record, corrected, 'fasta')

In [23]:
## Summarise the final output file: longest sequence, record count and ID uniqueness
file = max_number_sequences_file
maxlen = 0  ## Longest sequence length seen so far
sequence_check = []  ## IDs of all records in the file
total_sequence = 0  ## Number of records read
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check.append(record.id)
        total_sequence += 1
        maxlen = max(maxlen, len(record.seq))
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

if not duplicate_check(sequence_check):
    print("No duplicate, all sequences in the file are unique")
else:
    print("There are duplicates")

The maximum length of the unique sequence by id:  393
The numbers of the sequence:  25
No duplicate, all sequences in the file are unique


In [24]:
##Remove unwanted files
##Can comment to get the file
import os
for intermediate_file in (unique_id_file, unique_file, gene_name_only_file,
                          under_max_length_sequence_file):
    try:
        os.remove(intermediate_file)
    except FileNotFoundError:
        # Already removed (e.g. this cell was run twice); nothing left to delete
        pass