# Data processing

#### 1. Check whether all sequences are unique, and output a file containing only the unique sequences
#### 2. Keep only the sequences whose description is "oxytocin receptor", and output a file containing only those sequences
#### 3. If there are more than 25 sequences in the file, output a file with only the first 25 sequences

In [1]:
## Input:
gene_original_file = "./OXTR_primate.fasta"  # FASTA file the pipeline starts from
gene_name = "oxytocin receptor"  # gene name looked for in the record descriptions
max_number_sequences = 25  # number of records copied into the final output file
max_wanted_length = 400  # only records with a sequence shorter than this are kept
## Output:
unique_id_file = "./OXTR_primate_unique_id.fasta"  # records de-duplicated by record ID
unique_file = "./OXTR_primate_unique.fasta"  # records de-duplicated by the name found inside "[...]"
gene_name_only_file = "./OXTR_primate_only.fasta"  # records selected by the gene_name filter
under_max_length_sequence_file = "./OXTR_primate_under_max.fasta"  # records that passed the length filter
max_number_sequences_file = "./OXTR_primate_edited.fasta" # Final output file
## Set default original file (later cells reassign original_file to the file they read)
original_file = gene_original_file

#### Every block of code sets its own file names to help the reader keep track of the files more easily
#### The code parses each FASTA file record by record, without having to load the whole file into memory
#### For faster input and output, please use the code from "clade process"

### Support library

In [2]:
#libs
from collections import Counter
from itertools import islice

from Bio import SeqIO

### 1. Sequence uniqueness check

In [3]:
## Scan the input file once to find the longest sequence and count the records
total_sequence = 0  ## Number of records read so far
maxlen = 0  ## Longest sequence length seen so far
with open(original_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        total_sequence += 1
        maxlen = max(maxlen, len(record.seq))
print("The maximum length of the sequence: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

The maximum length of the sequence:  598
The numbers of the sequence:  95


In [4]:
## Check duplicate helper
def duplicate_check(a):
    """Return True when the list ``a`` holds at least one repeated item, else False."""
    # A set drops repeats, so any size difference means something occurred twice.
    return len(a) != len(set(a))

In [5]:
## Collect every record ID and report whether any ID occurs more than once
with open(original_file) as handle:
    sequences = [record.id for record in SeqIO.parse(handle, "fasta")]  ## Record IDs, in file order
total_sequence = len(sequences)  ## Number of records read

print("There are duplicates" if duplicate_check(sequences) else "No duplicate, all sequences in the file are unique")

No duplicate, all sequences in the file are unique


In [6]:
## Here is the code to get the duplicate sequence
def duplicate_get(input_list, duplicate_list):
    """Append every item of ``input_list`` that occurs more than once to ``duplicate_list``.

    Items are appended in their original order, once per occurrence (an item
    that appears three times is appended three times).

    Parameters:
        input_list: list of items to inspect.
        duplicate_list: list that receives the repeated items; modified in place.

    Returns:
        None.
    """
    try:
        # One tally pass instead of calling list.count() for every item.
        occurrences = Counter(input_list)
    except TypeError:
        # Unhashable items cannot be tallied by Counter; fall back to list.count().
        repeated = [item for item in input_list if input_list.count(item) > 1]
    else:
        repeated = [item for item in input_list if occurrences[item] > 1]
    duplicate_list.extend(repeated)

In [7]:
## Gather the IDs that occur more than once, then keep a single copy of each
duplicate_sequences = []
duplicate_get(sequences, duplicate_sequences)
duplicate_sequences = list(set(duplicate_sequences))
print("ID of Sequence that has duplicates: ", duplicate_sequences, sep="\n")

ID of Sequence that has duplicates: 
[]


In [8]:
## take out duplicate and get unique sequences
## dict.fromkeys keeps the first occurrence of every ID in its original order,
## without building a throwaway list or re-scanning the result for each item
unique_sequences = list(dict.fromkeys(sequences))
print("ID of unique sequence: ")
print(unique_sequences)

ID of unique sequence: 
['NP_001341583.1', 'NP_001341584.1', 'NP_001341582.1', 'NP_001341585.1', 'NP_000907.2', 'XP_003927162.1', 'sp|P30559.2|OXYR_HUMAN', 'sp|P56494.1|OXYR_MACMU', 'XP_017357089.1', 'NP_001038197.1', 'XP_017820536.1', 'NP_001306474.1', 'XP_033066419.1', 'XP_033066418.1', 'XP_033066417.1', 'XP_033066416.1', 'XP_033066415.1', 'XP_032005459.1', 'XP_032005458.1', 'XP_032005457.1', 'XP_032005456.1', 'XP_032146435.1', 'XP_023074158.2', 'XP_023074157.2', 'XP_023074156.2', 'XP_023074155.2', 'XP_023074154.2', 'XP_021789912.2', 'XP_030865378.1', 'XP_030865377.1', 'XP_030865376.1', 'XP_030865375.1', 'XP_030783150.1', 'XP_030783149.1', 'XP_030783146.1', 'XP_030783144.1', 'XP_010368206.1', 'XP_030657673.1', 'XP_028699019.1', 'XP_025233450.1', 'XP_025233449.1', 'XP_025233448.1', 'XP_025233447.1', 'XP_025233446.1', 'XP_024647262.1', 'XP_011732511.1', 'XP_016795834.1', 'XP_001144020.1', 'XP_002813528.1', 'PNJ67985.1', 'PNI53102.1', 'XP_003785518.1', 'XP_012326503.1', 'XP_012326495.1'

In [9]:
### Script to take out all duplicate sequences and write a new file that keeps
### only the first record of every unique ID
original_file = gene_original_file
corrected_file = unique_id_file
wanted_ids = set(unique_sequences)  ## IDs that may appear in the output
written_ids = set()  ## IDs already copied to the output
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the input is opened only once and
    # is closed by the with-block.
    for record in SeqIO.parse(original, 'fasta'):
        if record.id in wanted_ids and record.id not in written_ids:
            SeqIO.write(record, corrected, 'fasta')
            written_ids.add(record.id)

In [10]:
## Read back the ID-unique file and keep the description of every record
with open(unique_id_file) as handle:
    sequences_unique_id = [record.description for record in SeqIO.parse(handle, "fasta")]

In [11]:
def get_name(listI, listO):
    """Extract the text between the first "[" and the following "]" of each string.

    Every string of ``listI`` that contains "[" contributes one entry to
    ``listO``; strings without "[" are skipped. When no "]" follows the "[",
    the rest of the string is appended. ``listO`` is modified in place.
    """
    for description in listI:
        if "[" not in description:
            continue
        after_bracket = description.partition("[")[2]
        listO.append(after_bracket.partition("]")[0])

In [12]:
## Pull the bracketed name out of every description of the ID-unique file
sequence_species = list()
get_name(listI=sequences_unique_id, listO=sequence_species)

In [13]:
## Keep the first occurrence of every bracketed name, in original order.
## dict.fromkeys does this in one pass, without a comprehension used only for
## its side effects.
unique_sequences_species = list(dict.fromkeys(sequence_species))
print("name of unique sequence: ")
print(unique_sequences_species)

name of unique sequence: 
['Homo sapiens', 'Saimiri boliviensis boliviensis', 'Cebus imitator', 'Macaca mulatta', 'Callithrix jacchus', 'Macaca fascicularis', 'Trachypithecus francoisi', 'Hylobates moloch', 'Sapajus apella', 'Piliocolobus tephrosceles', 'Papio anubis', 'Gorilla gorilla gorilla', 'Rhinopithecus roxellana', 'Nomascus leucogenys', 'Theropithecus gelada', 'Macaca nemestrina', 'Pan troglodytes', 'Pongo abelii', 'Otolemur garnettii', 'Aotus nancymaae', 'Microcebus murinus', 'Rhinopithecus bieti', 'Saimiri sciureus', 'Aotus azarai', 'Saguinus midas', 'Leontopithecus rosalia', 'Callimico goeldii', 'Callithrix penicillata', 'Callithrix kuhlii', 'Callithrix geoffroyi', 'Callithrix sp.', 'Callithrix pygmaea', 'Cacajao calvus', 'Chiropotes chiropotes', 'Pithecia pithecia', 'Plecturocebus cupreus', 'Ateles belzebuth', 'Ateles geoffroyi', 'Alouatta caraya', 'Lagothrix poeppigii', 'Lagothrix lagotricha', 'Brachyteles hypoxanthus', 'Propithecus coquereli', 'Cercocebus atys', 'Colobus 

In [14]:
### Script to take out all duplicate sequences and write a new file that keeps
### only the first record of every unique bracketed name
original_file = unique_id_file
corrected_file = unique_file
wanted_species = set(unique_sequences_species)  ## Names that may appear in the output
written_species = set()  ## Names already copied to the output
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the input is opened only once and
    # is closed by the with-block.
    for record in SeqIO.parse(original, 'fasta'):
        # A record without "[" has no bracketed name: skip it instead of
        # comparing against a name left over from a previous record.
        if "[" not in record.description:
            continue
        species = record.description.partition("[")[2].partition("]")[0]
        if species in wanted_species and species not in written_species:
            SeqIO.write(record, corrected, 'fasta')
            written_species.add(species)

In [15]:
## Verify that the records written to "unique_file" all have distinct IDs
file = unique_file
sequence_check = []  ## Record IDs found in the file
maxlen = 0  ## Longest sequence length seen so far
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
total_sequence = len(sequence_check)  ## Number of records read
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)
print("There are duplicates" if duplicate_check(sequence_check) else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  598
The numbers of the sequence:  46
No duplicate, all sequences in the file are unique


In [16]:
## Read back "unique_file" and keep the description of every record
with open(unique_file) as handle:
    sequences_from_unique_file = [record.description for record in SeqIO.parse(handle, "fasta")]

In [17]:
#Take out predicted
## Keep the descriptions that are not marked PREDICTED and whose name field
## (the text between the first space and the first "[") is exactly gene_name.
## The text itself is compared, not just its length, so an unrelated name of
## the same length is no longer accepted.
sequences_gene_name = []
for description in sequences_from_unique_file:
    if "PREDICTED" in description:
        continue
    # partition never raises, even when the description contains no space
    name_field = description.partition(" ")[2].partition("[")[0]
    if name_field.strip() == gene_name:
        sequences_gene_name.append(description)

In [18]:
### Script to get the gene_name only file
original_file = unique_file
corrected_file = gene_name_only_file
wanted_descriptions = set(sequences_gene_name)  ## Descriptions selected by the gene-name filter
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the input is opened only once and
    # is closed by the with-block. Records are written in file order.
    for record in SeqIO.parse(original, 'fasta'):
        if record.description in wanted_descriptions:
            SeqIO.write(record, corrected, 'fasta')

In [19]:
## Verify that the records written to "gene_name_only_file" all have distinct IDs
file = gene_name_only_file
sequence_check = []  ## Record IDs found in the file
maxlen = 0  ## Longest sequence length seen so far
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
total_sequence = len(sequence_check)  ## Number of records read
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

print("There are duplicates" if duplicate_check(sequence_check) else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  598
The numbers of the sequence:  40
No duplicate, all sequences in the file are unique


In [20]:
## Take out the sequences that are not shorter than the desired max length
max_length = max_wanted_length
original_file = gene_name_only_file
corrected_file = under_max_length_sequence_file
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the input is opened only once and
    # is closed by the with-block.
    short_records = (
        record
        for record in SeqIO.parse(original, 'fasta')
        if len(record.seq) < max_length  # strictly shorter: a sequence of exactly max_length is dropped
    )
    SeqIO.write(short_records, corrected, 'fasta')

In [21]:
## Verify that the records written to "under_max_length_sequence_file" all have distinct IDs
file = under_max_length_sequence_file
sequence_check = []  ## Record IDs found in the file
maxlen = 0  ## Longest sequence length seen so far
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
total_sequence = len(sequence_check)  ## Number of records read
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

print("There are duplicates" if duplicate_check(sequence_check) else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  389
The numbers of the sequence:  39
No duplicate, all sequences in the file are unique


In [22]:
## Take only the wanted number of sequences:
max_gene_count = max_number_sequences
original_file = under_max_length_sequence_file
corrected_file = max_number_sequences_file
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above so the input is opened only once and
    # is closed by the with-block, even though reading stops early.
    # islice stops after max_gene_count records; max(..., 0) keeps a negative
    # limit meaning "write nothing" instead of raising.
    first_records = islice(SeqIO.parse(original, 'fasta'), max(max_gene_count, 0))
    SeqIO.write(first_records, corrected, 'fasta')

In [23]:
## Verify that the records written to the final output file all have distinct IDs
file = max_number_sequences_file
sequence_check = []  ## Record IDs found in the file
maxlen = 0  ## Longest sequence length seen so far
with open(file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequence_check.append(record.id)
        maxlen = max(maxlen, len(record.seq))
total_sequence = len(sequence_check)  ## Number of records read
print("The maximum length of the unique sequence by id: ",maxlen)
print("The numbers of the sequence: ",total_sequence)

print("There are duplicates" if duplicate_check(sequence_check) else "No duplicate, all sequences in the file are unique")

The maximum length of the unique sequence by id:  389
The numbers of the sequence:  25
No duplicate, all sequences in the file are unique
