In [2]:
# Install required libraries

#using Pkg
#Pkg.add("FASTX")
using FASTX
using Distributions

In [3]:
# Codify the chromosome

"""
    codify_chromosome(msa)

Encode every sequence of the alignment `msa` as a vector of integers.

Each sequence is scanned from left to right. The k-th non-gap character is encoded
as `k`; a gap (`'-'`) is encoded as the negated number of non-gap characters seen so
far, so gaps in front of the first residue are encoded as `0`.
For example `"--A-"` becomes `[0, 0, 1, -1]`.

# Arguments
- `msa`: collection of sequences (strings); it may be empty and the sequences do not
  have to share the same length.

# Returns
A `Vector{Vector{Int}}` holding one code vector per sequence, each exactly as long as
the sequence it encodes.
"""
function codify_chromosome(msa)
    chromosome = Vector{Int}[]

    for sequence in msa
        # Size each row from its own sequence instead of from the first one, so ragged
        # or empty input cannot run out of bounds.
        coded = Int[]
        sizehint!(coded, length(sequence))

        residues_seen = 0
        for symbol in sequence
            if symbol == '-'
                push!(coded, -residues_seen)
            else
                residues_seen += 1
                push!(coded, residues_seen)
            end
        end

        push!(chromosome, coded)
    end

    return chromosome
end

#codify_chromosome(["ABCDEFGH--","AB----CDEF","ABCD----EF"])

codify_chromosome (generic function with 1 method)

In [None]:
# Download pdb files for structure to calculate the STRIKE score

In [135]:
# Crossover function
# TODO: it is still unclear how to handle sequences in which no matching element is found

"""
    get_cut_point(cut_elements, chromosome_coded)

Return the position of the first entry of `chromosome_coded` whose absolute value
equals the absolute value of `cut_elements[1]`, or `0` when no entry matches.
Only the first element of `cut_elements` is consulted.
"""
function get_cut_point(cut_elements,chromosome_coded)
    hit = findfirst(code -> abs(code) == abs(cut_elements[1]), chromosome_coded)
    return something(hit, 0)
end

"""
    crossover(chromosome_string1, chromosome_coded1, chromosome_string2, chromosome_coded2)

Recombine two parent alignments at a randomly drawn column and return both children.

A cut column is drawn at random from `2:(shortest first-row length - 1)`. For every row,
`get_cut_point` looks up, in the second parent's codes, the first entry whose absolute
value matches the code found at the cut column of the first parent (`0` if none).
Child 1 joins the head of parent 1 with the tail of parent 2; child 2 joins the head of
parent 2 with the tail of parent 1. At the junction each row is padded with gaps by its
distance to the smallest (child 1) or largest (child 2) cut position found in parent 2.
Every child row is re-encoded with `codify_chromosome`, and finally
`remove_columns_with_all_gaps` is applied to each child.

# Arguments
- `chromosome_string1`, `chromosome_string2`: parent alignments, one gapped string per row.
- `chromosome_coded1`, `chromosome_coded2`: integer codes of the parents, one vector per row.

# Returns
`(crossover_string, crossover_coded)`: two-element vectors whose first entry belongs to
child 1 and whose second entry belongs to child 2.
"""
function crossover(chromosome_string1, chromosome_coded1,chromosome_string2, chromosome_coded2)
    shortest = min(length(chromosome_string1[1]), length(chromosome_string2[1]))
    cut_point = rand(2:shortest-1)
    row_total = length(chromosome_string1)

    # Cut position inside parent 2 for every row
    cut_points_string2 = Any[]
    for row in 1:row_total
        cut_elements = [chromosome_coded1[row][cut_point], chromosome_coded1[row][cut_point+1]]
        push!(cut_points_string2, get_cut_point(cut_elements, chromosome_coded2[row]))
    end
    leftmost = minimum(cut_points_string2)
    rightmost = maximum(cut_points_string2)

    child1_rows, child1_codes = Any[], Any[]
    child2_rows, child2_codes = Any[], Any[]

    for row in 1:row_total
        cut_point2 = cut_points_string2[row]
        parent1_row = chromosome_string1[row]
        parent2_row = chromosome_string2[row]

        # Child 1: head of parent 1, gap padding, tail of parent 2
        head1 = parent1_row[1:cut_point]
        tail2 = parent2_row[cut_point2+1:length(parent2_row)]
        child1_row = string(head1, "-"^(cut_point2 - leftmost), tail2)

        # Child 2: head of parent 2, gap padding, tail of parent 1
        head2 = parent2_row[1:cut_point2]
        tail1 = parent1_row[cut_point+1:length(parent1_row)]
        child2_row = string(head2, "-"^(rightmost - cut_point2), tail1)

        push!(child1_rows, child1_row)
        push!(child1_codes, codify_chromosome([child1_row])[1])

        push!(child2_rows, child2_row)
        push!(child2_codes, codify_chromosome([child2_row])[1])
    end

    crossover_string = [child1_rows, child2_rows]
    crossover_coded = [child1_codes, child2_codes]

    crossover_string[1], crossover_coded[1] = remove_columns_with_all_gaps(child1_rows, child1_codes)
    crossover_string[2], crossover_coded[2] = remove_columns_with_all_gaps(child2_rows, child2_codes)

    return crossover_string, crossover_coded
end

#crossover(["ACGTT--AC-G-GG-", "A-GGCTTTAGG-CG-", "AG-G-CT-ATGCAGG"],[[1, 2, 3, 4, 5, -5, -5, 6, 7, -7, 8, -8, 9, 10, -10], [1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -10, 11, 12, -12], [1, 2, -2, 3, -3, 4, 5, -5, 6, 7, 8, 9, 10, 11, 12]],["AC--G--TTA-CGGG", "AGGC-T-T-TAGGCG", "A-GGCTATGC-A-GG"],[[1, 2, -2, -2, 3, -3, -3, 4, 5, 6, -6, 7, 8, 9, 10], [1, 2, 3, 4, -4, 5, -5, 6, -6, 7, 8, 9, 10, 11, 12], [1, -1, 2, 3, 4, 5, 6, 7, 8, 9, -9, 10, -10, 11, 12]])

crossover (generic function with 1 method)

# Generating Initial Population
<hr>

## Function: add_initial_gaps
This function simply adds gaps to a sequence

### Input :
* Gap Count (No. of gaps to be added)
* Single Sequence

### Output :
* Sequence with gaps in it

<hr>

## Function: generate_single_chromosome
This function takes the multiple sequences and adds gaps to every sequence by following
the steps below:
1] Find the sequence with the maximum length among the multiple sequences
2] Add gaps to this sequence; the number of gaps is chosen randomly between 20% and 40% of its length
3] Since every sequence must end up with the same length, compute the number of gaps to add to each remaining sequence
so that it matches the length of the gapped longest sequence.

### Input :
* Chromosome (Multiple Sequence)

### Output :
* Chromosome (Multiple Sequences with gaps added to them)

<hr>

## Function: generate_initial_population
This function generates a set of chromosomes by adding gaps to the given initial sequences. The first 20% of the chromosomes are created by adding gaps; these are then crossed over with each other to generate the remaining 80%. This set of chromosomes is the initial population.

### Input :
* Chromosome (Multiple Sequence)
* Population Count (Number of chromosomes for initial population)

### Output :
* TBD

In [137]:
"""
    add_initial_gaps(gap_count, sequence)

Insert `floor(gap_count)` gap characters (`'-'`) into `sequence` at random places.

The gaps are inserted in randomly sized runs until the requested number is reached.
A run is always inserted after an existing position, so the first character of a
non-empty `sequence` stays first. An empty `sequence` is turned into a run of gaps.
Positions are used as string indices, which assumes single-byte (ASCII) characters.

# Arguments
- `gap_count`: number of gaps to add; fractional values are rounded down.
- `sequence`: the string to insert gaps into.

# Returns
The sequence with the gaps inserted; the residues keep their original order.

# Throws
- `ArgumentError` if `gap_count` is negative.
"""
function add_initial_gaps(gap_count,sequence)
    # Work with an integer counter so the random ranges below are integer ranges.
    remaining = floor(Int, gap_count)
    remaining >= 0 || throw(ArgumentError("gap_count must be non-negative, got $gap_count"))

    # There is no position to insert after in an empty sequence.
    if isempty(sequence)
        return "-"^remaining
    end

    while remaining > 0
        blank_count = rand(1:remaining)
        position = rand(1:length(sequence))
        sequence = string(sequence[1:position], "-"^blank_count, sequence[position+1:length(sequence)])
        remaining -= blank_count
    end
    return sequence
end

"""
    generate_single_chromosome(msa)

Build one random alignment (chromosome) from the ungapped sequences in `msa`.

The first longest sequence receives a random number of gaps, drawn uniformly between
20% and 40% of its length, via `add_initial_gaps`. Every other sequence then receives
as many gaps as it needs to reach the length of that gapped longest sequence.

# Arguments
- `msa`: non-empty collection of sequences (strings).

# Returns
A vector with one gapped sequence per input sequence, in the input order.

# Throws
- `ArgumentError` if `msa` is empty.
"""
function generate_single_chromosome(msa)
    isempty(msa) && throw(ArgumentError("msa must contain at least one sequence"))

    # Position of the (first) longest sequence. The index has to be the position of the
    # maximum, not the number of times a new maximum was encountered.
    sequence_lengths = map(length, msa)
    max_sequence_index = argmax(sequence_lengths)
    max_sequence_length = sequence_lengths[max_sequence_index]

    new_msa = Any["" for _ in msa]

    # Update Max Length Sequence
    gap_count = rand(Uniform(0.2,0.4)) * max_sequence_length
    new_msa[max_sequence_index] = add_initial_gaps(gap_count, msa[max_sequence_index])
    target_length = length(new_msa[max_sequence_index])

    # Update Rest of the Sequences
    for i in eachindex(new_msa)
        i == max_sequence_index && continue
        new_msa[i] = add_initial_gaps(target_length - length(msa[i]), msa[i])
    end
    return new_msa
end

"""
    generate_initial_population(msa, population_count)

Create the initial population of alignments for the sequences in `msa`.

One fifth of the population (at least one chromosome) is generated directly with
`generate_single_chromosome`. The rest is produced by calling `crossover` on randomly
chosen pairs of those seed chromosomes until exactly `population_count` chromosomes
exist; if the last crossover yields one child too many, the surplus child is dropped.

# Arguments
- `msa`: collection of ungapped sequences (strings).
- `population_count`: number of chromosomes to generate; fractional values are rounded
  down, and values below 1 give an empty population.

# Returns
`(population_string, population_coded)`: the alignments and their integer codes, in
matching order.
"""
function generate_initial_population(msa,population_count)
    population_string = []
    population_coded = []

    target = floor(Int, population_count)
    target < 1 && return population_string, population_coded

    # Integer seed count: 20% of the population, but never zero, because crossover
    # needs at least one seed chromosome to draw parents from.
    seed_count = max(1, target ÷ 5)
    for _ in 1:seed_count
        chromosome = generate_single_chromosome(msa)
        coded_chromosome = codify_chromosome(chromosome)
        chromosome,coded_chromosome = remove_columns_with_all_gaps(chromosome,coded_chromosome)
        push!(population_coded,coded_chromosome)
        push!(population_string,chromosome)
    end

    # Fill the remainder with crossover children of randomly paired seeds.
    while length(population_string) < target
        random1 = rand(1:seed_count)
        random2 = rand(1:seed_count)
        crossover_string, crossover_coded = crossover(population_string[random1],population_coded[random1],population_string[random2],population_coded[random2])
        for (child_string, child_coded) in zip(crossover_string, crossover_coded)
            length(population_string) < target || break
            push!(population_string, child_string)
            push!(population_coded, child_coded)
        end
    end
    return population_string, population_coded
end

#generate_initial_population(["ACGTTACCCGGAAATTTTTTACGGG","ACGTTACCTGGGAATATTGTACAGG","GTGTAAGGTGGGAATATTTFAGAGG"],100)

generate_initial_population (generic function with 1 method)

In [115]:
# Helper functions


"""
    remove_columns_with_all_gaps(sequences, codifications)

Remove every column that consists only of gaps (`'-'`) from an alignment.

A column is removed when it is a gap in every row of `sequences`; the entries at the
same positions are removed from every row of `codifications`. Both arguments are
updated in place (their rows are replaced) and also returned. Empty input is returned
unchanged.

# Arguments
- `sequences`: aligned rows as strings.
- `codifications`: integer codes of the rows, one vector per row.

# Returns
`(sequences, codifications)` without the all-gap columns.
"""
function remove_columns_with_all_gaps(sequences, codifications)
    isempty(sequences) && return sequences, codifications

    total_seq = length(sequences)

    # Only columns that are a gap in the first row can be all-gap columns.
    columns_with_all_gaps = Set{Int}()
    for (index, nucleotide) in enumerate(sequences[1])
        if nucleotide == '-' && all(sequences[row][index] == '-' for row in 2:total_seq)
            push!(columns_with_all_gaps, index)
        end
    end

    for row in 1:total_seq
        sequences[row] = join(c for (col, c) in enumerate(sequences[row]) if !(col in columns_with_all_gaps))
        codifications[row] = [c for (col, c) in enumerate(codifications[row]) if !(col in columns_with_all_gaps)]
    end

    return sequences, codifications
end
                                
#remove_columns_with_all_gaps(["ab---fg","ab---fg","ab-c-fg","ab---fg"], [[1,2,-2,-2,-2,3,4],[1,2,-2,-2,-2,3,4],[1,2,-2,3,-3,4,5],[1,2,-2,-2,-2,3,4]])


remove_columns_with_all_gaps (generic function with 1 method)

In [2]:
# Mutation function

# Return the (first, last) positions of every maximal run of gaps ('-') in `sequence`.
function _gap_runs(sequence)
    runs = Tuple{Int,Int}[]
    run_start = 0
    for (position, symbol) in enumerate(sequence)
        if symbol == '-'
            run_start == 0 && (run_start = position)
        elseif run_start != 0
            push!(runs, (run_start, position - 1))
            run_start = 0
        end
    end
    run_start != 0 && push!(runs, (run_start, length(sequence)))
    return runs
end

"""
    mutation(parent_sequence, parent_codification)

Mutate an alignment by moving one run of gaps in every row to a random position.

For each row, one maximal run of consecutive gaps is picked at random, cut out of the
row and re-inserted at a randomly chosen position (anywhere from before the first
remaining character to after the last one). The re-inserted gaps are coded as the
negated absolute code of the entry to their left, or `0` when they are placed at the
start of the row. Rows that contain no gaps are copied unchanged. Finally the columns
that consist only of gaps are removed with `remove_columns_with_all_gaps`.
String positions are used as indices, which assumes single-byte (ASCII) characters.

# Arguments
- `parent_sequence`: aligned rows as strings.
- `parent_codification`: integer codes of the rows, one vector per row.

# Returns
`(child_sequence, child_codification)`: the mutated alignment and its codes. The parent
collections are not modified.
"""
function mutation(parent_sequence, parent_codification)
    child_sequence = []
    child_codification = []

    for i in 1:length(parent_codification)
        curr_codification = parent_codification[i]
        curr_sequence = parent_sequence[i]

        gaps = _gap_runs(curr_sequence)
        if isempty(gaps)
            # Nothing can be moved in a row without gaps; keep it as it is.
            push!(child_sequence, curr_sequence)
            push!(child_codification, copy(curr_codification))
            continue
        end

        gap_start, gap_stop = rand(gaps)
        gap_length = gap_stop - gap_start + 1

        # Cut the chosen run out of the row and out of its codes.
        mutated_seq = join(c for (k, c) in enumerate(curr_sequence) if !(gap_start <= k <= gap_stop))
        mutated_codification = [c for (k, c) in enumerate(curr_codification) if !(gap_start <= k <= gap_stop)]

        target_position = rand(1:length(mutated_seq)+1)

        # Leading gaps are coded 0. Checking the start first also covers a row that
        # consisted only of gaps, where there is no left neighbour to read a code from.
        gap_code = target_position == 1 ? 0 : -abs(mutated_codification[target_position-1])

        mutated_seq = string(
            mutated_seq[1:target_position-1],
            "-"^gap_length,
            mutated_seq[target_position:length(mutated_seq)],
        )
        mutated_codification = vcat(
            mutated_codification[1:target_position-1],
            fill(gap_code, gap_length),
            mutated_codification[target_position:length(mutated_codification)],
        )

        push!(child_sequence, mutated_seq)
        push!(child_codification, mutated_codification)
    end

    return remove_columns_with_all_gaps(child_sequence, child_codification)
end
#mutation(["abcd---cd--ef-cde-", "abcd---cd--ef-cdef"], [[1,2,3,4,-4,-4,-4,5,6,-6,-6,7,8,-8,9,10,11,-11],[1,2,3,4,-4,-4,-4,5,6,-6,-6,7,8,-8,9,10,11,12]])


mutation (generic function with 1 method)

In [1]:
# Three Objective functions: STRIKE, Totally conserved columns and Percentage of Non-gaps

"""
    compute_tc_score(sequences)

Return the percentage of totally conserved columns of the alignment `sequences`.

A column counts as conserved when the first row holds a non-gap character in it and
every row holds that same character. The number of columns is taken from the first row.
"""
function compute_tc_score(sequences)
    total_columns = length(sequences[1])

    no_of_aligned_columns = count(1:total_columns) do column
        first_residue = sequences[1][column]
        first_residue != '-' && all(row -> row[column] == first_residue, sequences)
    end

    return (100.0 * no_of_aligned_columns) / total_columns
end

# +1 for match and 0 for no match

"""
    calc_sum_pair(sequences)

Return the sum-of-pairs score of the alignment `sequences`.

Every unordered pair of rows is compared column by column; a column adds 1 to the score
when both rows hold the same non-gap character and 0 otherwise. The number of columns
is taken from the first row.
"""
function calc_sum_pair(sequences)
    row_total = length(sequences)
    column_total = length(sequences[1])

    score = 0
    for upper in 1:row_total, lower in upper+1:row_total
        first_row = sequences[upper]
        second_row = sequences[lower]
        score += count(
            col -> first_row[col] == second_row[col] && first_row[col] != '-',
            1:column_total,
        )
    end
    return score
end

"""
    calc_nogap_percentage(sequences)

Return the percentage of cells of the alignment `sequences` that are not gaps (`'-'`).

The number of columns is taken from the first row, and that many positions are
inspected in every row.
"""
function calc_nogap_percentage(sequences)
    row_total = length(sequences)
    column_total = length(sequences[1])

    no_gaps = 0
    for row in sequences
        no_gaps += count(col -> row[col] != '-', 1:column_total)
    end

    cell_total = column_total * row_total
    return (no_gaps / cell_total) * 100
end
    
#Test 
#sequences = ["AGGCTTT-A-C", "CCCAGTG-AT-","GGCFATT-AT-"]
#print(calc_nogap_percentage(sequences))

calc_nogap_percentage (generic function with 1 method)

In [7]:
# MOSAStrE function - This is the main function that implements proposed algorithm

In [14]:
# Read sequences from the input file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

"""
    read_input(filename)

Read all records of the FASTA file `filename`.

# Arguments
- `filename`: path of the FASTA file to read.

# Returns
`(identifiers, sequences)`: two vectors of strings holding, in file order, the
identifier and the sequence of every record.
"""
function read_input(filename)
    identifiers = String[]
    sequences = String[]

    # Open the file that was actually requested; the do-block closes the reader even
    # when reading a record fails.
    open(FASTA.Reader, filename) do reader
        for record in reader
            push!(identifiers, string(FASTA.identifier(record)))
            push!(sequences, string(FASTA.sequence(record)))
        end
    end

    return identifiers, sequences
end
#read_input("input1.txt")


read_input (generic function with 1 method)

In [136]:
# Write output alignments to the "output1.txt" file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

"""
    write_output()

Placeholder for writing the output alignments; it is not implemented yet, takes no
arguments and returns `nothing`.
"""
function write_output()
    return nothing
end

write_output (generic function with 1 method)