In [None]:
# Install required libraries

using Pkg
Pkg.add("FASTX")

using FASTX

In [38]:
# Codify the chromosome

"""
    codify_chromosome(msa)

Encode an alignment as an integer matrix with one row per sequence and one column per
alignment position.

Within each row, a residue is encoded as its running 1-based index among the residues of
that sequence, and a gap (`'-'`) is encoded as the negated index of the last residue seen
so far (`0` when the gap precedes every residue).

# Arguments
- `msa`: collection of aligned sequences (strings).

# Returns
A `Matrix{Int64}` of size `(length(msa), longest sequence length)`. An empty `msa` gives a
`0×0` matrix. Positions past the end of a shorter sequence are left as `0`.
"""
function codify_chromosome(msa)
    row_count = length(msa)
    # Size the matrix by the longest row so a ragged input cannot overrun it,
    # and so an empty alignment does not need to look at `msa[1]`.
    column_count = isempty(msa) ? 0 : maximum(length, msa)
    chromosome = zeros(Int64, row_count, column_count)

    for (i, sequence) in enumerate(msa)
        count = 0
        # Iterate characters (not byte offsets) so non-ASCII symbols are handled too.
        for (j, residue) in enumerate(sequence)
            if residue == '-'
                chromosome[i, j] = -count
            else
                count += 1
                chromosome[i, j] = count
            end
        end
    end

    return chromosome
end
codify_chromosome(["ABCDEFGH--","AB----CDEF","ABCD----EF"])


3×11 Array{Int64,2}:
 0  1  2   3   4   5   6   7   8  -8  -8
 0  1  2  -2  -2  -2  -2   3   4   5   6
 0  1  2   3   4  -4  -4  -4  -4   5   6

In [None]:
# Download pdb files for structure to calculate the STRIKE score

In [2]:
"""
    add_initial_gaps(gap_count, sequence)

Return a copy of `sequence` with gap characters (`'-'`) inserted at random positions.

# Arguments
- `gap_count`: number of gaps to insert. A non-integer value is rounded down and a
  non-positive value inserts nothing.
- `sequence`: the sequence (string) to insert gaps into; it may be empty.

# Returns
A `String` whose length is `length(sequence)` plus the number of inserted gaps. Removing
the gaps that were inserted gives back the original sequence.
"""
function add_initial_gaps(gap_count, sequence)
    total_gaps = max(floor(Int, gap_count), 0)
    # Work on a character vector: insertion is by character position (safe for
    # non-ASCII input) and avoids rebuilding the whole string for every gap.
    residues = collect(sequence)
    for _ in 1:total_gaps
        # 0 means "before the first residue"; including it lets gaps lead the
        # sequence and keeps the range non-empty when the sequence is empty.
        position = rand(0:length(residues))
        insert!(residues, position + 1, '-')
    end
    return join(residues)
end

"""
    generate_single_chromosome(msa)

Build one candidate alignment (chromosome) by inserting random gaps into the sequences
of `msa`.

Steps:
1. Find the longest sequence in `msa` (the first one on ties).
2. Insert a random number of gaps into that sequence, between 20% and 40% of its length
   (rounded down).
3. Insert into every other sequence exactly as many gaps as it needs to reach the length
   of the gapped longest sequence, so that all returned sequences have the same length.

# Arguments
- `msa`: vector of sequences (strings).

# Returns
A `Vector{String}` with the gapped sequences, in the same order as `msa`. An empty `msa`
gives an empty vector.
"""
function generate_single_chromosome(msa)
    new_msa = Vector{String}(undef, length(msa))
    isempty(msa) && return new_msa

    # `findmax` yields both the maximum length and the index where it first occurs.
    max_sequence_length, max_sequence_index = findmax(map(length, msa))

    # Gap the longest sequence with a random fraction drawn uniformly from [0.2, 0.4).
    gap_fraction = 0.2 + 0.2 * rand()
    gap_count = floor(Int, gap_fraction * max_sequence_length)
    new_msa[max_sequence_index] = add_initial_gaps(gap_count, msa[max_sequence_index])

    # Pad every other sequence with gaps up to the length of the gapped longest one.
    target_length = length(new_msa[max_sequence_index])
    for i in eachindex(msa)
        i == max_sequence_index && continue
        new_msa[i] = add_initial_gaps(target_length - length(msa[i]), msa[i])
    end

    return new_msa
end

"""
    generate_initial_population(msa, population_count)

Generate the part of the initial population that is created without crossover: 20% of
`population_count` (rounded down) chromosomes, each produced independently by
`generate_single_chromosome(msa)`.

# Arguments
- `msa`: the input sequences passed on to `generate_single_chromosome`.
- `population_count`: the intended total population size.

# Returns
A vector holding the generated chromosomes (empty when 20% of `population_count` rounds
down to zero or less).
"""
function generate_initial_population(msa, population_count)
    without_crossover = floor(Int, 0.2 * population_count)
    population = [generate_single_chromosome(msa) for _ in 1:without_crossover]
    # Return the population explicitly; a trailing `for` loop would make the
    # function evaluate to `nothing`.
    return population
end

#generate_initial_population(["ACGTTACGGG","AGGCTTTAGGCG","AGGCTATGCAGG"],100)

In [58]:
# Helper functions


"""
    remove_columns_with_all_gaps(sequences)

Remove every alignment column in which all sequences have a gap (`'-'`).

The elements of `sequences` are replaced in place, and the same collection is returned.

# Arguments
- `sequences`: vector of aligned sequences (strings).

# Returns
`sequences`, with the all-gap columns removed from every element. An empty input is
returned unchanged. A column is removed only when every sequence actually has a gap at
that position, so a position lying past the end of a shorter sequence is never removed.
"""
function remove_columns_with_all_gaps(sequences)
    isempty(sequences) && return sequences

    rows = [collect(sequence) for sequence in sequences]

    # Only columns where the first row has a gap can be all-gap columns.
    gap_columns = Set{Int}()
    for (column, residue) in enumerate(rows[1])
        residue == '-' || continue
        if all(row -> column <= length(row) && row[column] == '-', rows)
            push!(gap_columns, column)
        end
    end

    for (i, row) in zip(eachindex(sequences), rows)
        sequences[i] = join(c for (column, c) in enumerate(row) if !(column in gap_columns))
    end

    return sequences
end
#remove_columns_with_all_gaps(["ab---fg","ab---fg","ab-c-fg","ab---fg"])


4-element Array{String,1}:
 "ab-fg"
 "ab-fg"
 "abcfg"
 "ab-fg"

In [3]:
# Crossover function

In [4]:
# Mutation function

In [6]:
# Three Objective functions: STRIKE, Totally conserved columns and Percentage of Non-gaps

In [7]:
# MOSAStrE function - This is the main function that implements the proposed algorithm

In [14]:
# Read sequences from the input file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

"""
    read_input(filename)

Read all records from the FASTA file at `filename`.

# Arguments
- `filename`: path of the FASTA file to read.

# Returns
A tuple `(identifiers, sequences)` of two `Vector{String}`s holding, for each record in
file order, its identifier and its sequence.
"""
function read_input(filename)
    identifiers = String[]
    sequences = String[]

    # Open the file the caller asked for (not a fixed name) and make sure the
    # reader is closed even if reading a record throws.
    reader = open(FASTA.Reader, filename)
    try
        for record in reader
            push!(identifiers, string(FASTA.identifier(record)))
            push!(sequences, string(FASTA.sequence(record)))
        end
    finally
        close(reader)
    end

    return identifiers, sequences
end
#read_input("input1.txt")


read_input (generic function with 1 method)

In [42]:
# Write output alignments to the "output1.txt" file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

# Stub: the body is empty, so calling this does nothing and returns `nothing`.
# Per the cell header it is meant to write the output alignments to "output1.txt";
# that is not implemented yet.
function write_output()
    
end 

write_output (generic function with 1 method)

In [43]:
# Scratch check: strip every gap character ("-") from an aligned sequence.
replace("abcd--efgh","-"=>"")

"abcdefgh"

In [51]:
# Scratch check: rebuild `str` without the characters at the 1-based positions listed in `a`.
str = "abcdfe"
a = [3,4]
join([c for (i,c) in enumerate(str) if ~(i in a)])

"abfe"