In [2]:
# Install required libraries

#using Pkg
#Pkg.add("FASTX")
using FASTX
using Distributions

In [3]:
# Codify the chromosome

function codify_chromosome(msa)

    # initialize matrix
    chromosome = [[0 for j in 1:length(msa[1])] for i in 1:length(msa)] 
                            
    # codify each sequence in the alignment 
    for i in 1:length(msa)
        count = 0
        for j in 1:length(msa[i])
            if msa[i][j] == '-'
                chromosome[i][j] = count*(-1) 
            else
                count = count + 1
                chromosome[i][j] = count
            end
        end
    end
    
    return chromosome
end

#codify_chromosome(["ABCDEFGH--","AB----CDEF","ABCD----EF"])

codify_chromosome (generic function with 1 method)

In [None]:
# Download pdb files for structure to calculate the STRIKE score

In [22]:
# Crossover function
# TO Be implemented
function crossover(chromosome_string1, chromosome_coded1,chromosome_string2, chromosome_coded2)
    
    return chromosome_string1,chromosome_coded1
    
end

crossover (generic function with 1 method)

# Generating Initial Population
<hr>

## Function: add_initial_gaps
This function simply adds gaps to a sequence

### Input :
* Gap Count (No. of gaps to be added)
* Single Sequence

### Output :
* Sequence with gaps in it

<hr>

## Function: generate_single_chromosome
This function would take in the multiple sequences and will try to add some number of gaps to every sequence by following
the below mentioned instructions:
1] Find the sequence with maximum length in the multiple sequennces
2] Add gaps ranging from randomly 20-40% in this specific sequence
3] Since we have to make every sequence of same length, we would compute the number of gaps needed to add in every sequence
to make it similar to the sequence with maximum length.

### Input :
* Chromosome (Multiple Sequence)

### Output :
* Chromosome (Multiple Sequences with gaps added to them)

<hr>

## Function: generate_initial_population
This function generates a set of chromosomes after adding gaps to given initial sequence. After adding 20% of chromosomes with gaps in it, we would crossover those sequences in order to generate other 80% of chromosomes. This set of chromosomes would be our initial population.

### Input :
* Chromosome (Multiple Sequence)
* Population Count (Number of chromosomes for initial population)

### Output :
* TBD

In [30]:
function add_initial_gaps(gap_count,sequence)
    for i in 1:gap_count
        position  = rand(1:length(sequence))
        sequence = string(sequence[1:position],"-",sequence[position+1:length(sequence)])
    end
    return sequence
end

function generate_single_chromosome(msa)
    
    new_msa = []
    for i in 1:length(msa)
        push!(new_msa,"")
    end
    
    max_sequence_length = 0
    max_sequence_index = 1
    for sequence in msa
        if length(sequence)>max_sequence_length
            max_sequence_length = length(sequence)
        max_sequence_index+=1
        end
    end
    
    # Update Max Length Sequence
    
    gap_count = rand(Uniform(0.2,0.4)) * max_sequence_length
    new_msa[max_sequence_index] = add_initial_gaps(gap_count,msa[max_sequence_index])
    
    # Update Rest of the Sequences
    
    for i in 1:length(msa)
        if i==max_sequence_index
            continue
        end
        gap_counts = length(new_msa[max_sequence_index]) - length(msa[i])
        new_msa[i] = add_initial_gaps(gap_counts,msa[i])
    end
    return new_msa
end

function generate_initial_population(msa,population_count)        
    without_crossover = 0.2*population_count
    population_string = []
    population_coded = []
    for i in 1:without_crossover
        chromosome = generate_single_chromosome(msa)
        push!(population_string,chromosome)
    end
    
    for chromosome in population_string
        coded_chromosome = codify_chromosome(chromosome)
        push!(population_coded,coded_chromosome)
    end
    
    with_crossover = 0.8*population_count
    for i in 1:with_crossover
        random1 = Int(rand(1:without_crossover))
        random2 = Int(rand(1:without_crossover))
        crossover_string, crossover_coded = crossover(population_string[random1],population_coded[random1],population_string[random2],population_coded[random2])
        push!(population_string, crossover_string)
        push!(population_coded, crossover_coded)
    end
    return population_string, population_coded
end    

generate_initial_population(["ACGTTACGGG","AGGCTTTAGGCG","AGGCTATGCAGG"],100)

Any["AC--GTTACG---G-G", "AG-GC-TT-T-AGGCG", "AGG--CTA-T-GCAGG"]
Any["ACGTTA-C-GG---G", "AGG--CTTTAGGC-G", "AGGCTA-TG-CAG-G"]
Any["A-CG-TTA--C-GGG", "AGGC-TTTA-GGC-G", "AGGCTATGCA-G-G-"]
Any["ACG-T---T-ACGGG-", "AGGCT----TTAGGCG", "AG-GCT--ATGCAGG-"]
Any["A---CGTTAC--G-GG", "AGGC-TT-T-A-GGCG", "AGG-CT-ATG--CAGG"]
Any["A-CG-TT-ACGGG-", "AGGCTTTAG-GC-G", "AGGCTAT-GCAGG-"]
Any["ACG-TT--ACGG-G", "A-G-GCTTTAGGCG", "AGGCTA-TG-CAGG"]
Any["A-CGTTACGGG---", "A-GGCTTTAG-GCG", "AGGC-TATG-CAGG"]
Any["ACG-T-TA-CG-G-G-", "AGGCTTT-A-GGCG--", "AGGCT--AT-GC-AGG"]
Any["AC-GTTACG-G--G-", "AGG-CTTTAGGC--G", "AGGCTA-T-G-CAGG"]
Any["ACGT-T--ACGG--G", "AGGCTTTAG--GC-G", "AG-GCTATG--CAGG"]
Any["AC--GT-TACG--GG-", "AGGC--TTTA-GG-CG", "AG-G-C-TATGCAG-G"]
Any["ACGTT--AC-GG-G", "AGGC-TTTAGGCG-", "AGGCTATGCA--GG"]
Any["ACGT--TAC-GG--G", "AG-GCT-TTAG-GCG", "AGGCT-A-T-GCAGG"]
Any["A-CG-TTACGG-G--", "AG-GCTT-TAG-GCG", "AG-GCTA-TGCAG-G"]
Any["A--CGTTAC-GG--G", "AGGCT-TTA--GGCG", "AGG-CTATG-CA-GG"]
Any["AC-GTTACG-GG---"

(Any[Any["AC--GTTACG---G-G", "AG-GC-TT-T-AGGCG", "AGG--CTA-T-GCAGG"], Any["ACGTTA-C-GG---G", "AGG--CTTTAGGC-G", "AGGCTA-TG-CAG-G"], Any["A-CG-TTA--C-GGG", "AGGC-TTTA-GGC-G", "AGGCTATGCA-G-G-"], Any["ACG-T---T-ACGGG-", "AGGCT----TTAGGCG", "AG-GCT--ATGCAGG-"], Any["A---CGTTAC--G-GG", "AGGC-TT-T-A-GGCG", "AGG-CT-ATG--CAGG"], Any["A-CG-TT-ACGGG-", "AGGCTTTAG-GC-G", "AGGCTAT-GCAGG-"], Any["ACG-TT--ACGG-G", "A-G-GCTTTAGGCG", "AGGCTA-TG-CAGG"], Any["A-CGTTACGGG---", "A-GGCTTTAG-GCG", "AGGC-TATG-CAGG"], Any["ACG-T-TA-CG-G-G-", "AGGCTTT-A-GGCG--", "AGGCT--AT-GC-AGG"], Any["AC-GTTACG-G--G-", "AGG-CTTTAGGC--G", "AGGCTA-T-G-CAGG"]  …  Any["ACGT--TAC-GG--G", "AG-GCT-TTAG-GCG", "AGGCT-A-T-GCAGG"], Any["ACG-T-TA-CG-G-G-", "AGGCTTT-A-GGCG--", "AGGCT--AT-GC-AGG"], Any["AC-G-TTAC-GG-G", "AG-G-CTTTAGGCG", "A-GGCTATGCAG-G"], Any["ACGTTA-C-GG---G", "AGG--CTTTAGGC-G", "AGGCTA-TG-CAG-G"], Any["AC-GTTACG-G--G-", "AGG-CTTTAGGC--G", "AGGCTA-T-G-CAGG"], Any["AC--GT-TACG--GG-", "AGGC--TTTA-GG-CG", "AG-G-C-TATGCAG

In [65]:
# Helper functions


function remove_columns_with_all_gaps(sequences, codifications)
    """
    This function removes columns with all gaps from the alignments passed as an input. 
    
    """
    
    columns_with_all_gaps = []
    total_seq = length(sequences)
    
    for (index,nucleotide) in enumerate(sequences[1])
        if nucleotide == '-'
            column_with_all_gaps = true
            
            for i in 2:total_seq
                if sequences[i][index] != '-'
                    column_with_all_gaps = false
                    break
                end
            end
            
            if column_with_all_gaps
                append!(columns_with_all_gaps, index)
            end
        end
    end
    
    for i in 1:total_seq
        sequences[i] = join([c for (i,c) in enumerate(sequences[i]) if ~(i in columns_with_all_gaps)])
        codifications[i] = [c for (i,c) in enumerate(codifications[i]) if ~(i in columns_with_all_gaps)]
    end
                    
    return sequences, codifications
end
                                
#remove_columns_with_all_gaps(["ab---fg","ab---fg","ab-c-fg","ab---fg"], [[1,2,-2,-2,-2,3,4],[1,2,-2,-2,-2,3,4],[1,2,-2,3,-3,4,5],[1,2,-2,-2,-2,3,4]])


(["ab-fg", "ab-fg", "abcfg", "ab-fg"], [[1, 2, -2, 3, 4], [1, 2, -2, 3, 4], [1, 2, 3, 4, 5], [1, 2, -2, 3, 4]])

In [4]:
# Mutation function

In [6]:
# Three Objective functions: STRIKE, Totally conserved columns and Percentage of Non-gaps

In [7]:
# MOSAStrE function - This is the main function that implements proposed algorithm

In [14]:
# Read sequences from the input file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

function read_input(filename)
    reader = open(FASTA.Reader, "input1.txt")
    
    identifiers = []
    sequences = []
    
    for record in reader
        push!(identifiers, string(FASTA.identifier(record)))
        push!(sequences, string(FASTA.sequence(record)))
    end
    close(reader)
    
    return identifiers, sequences
end
#read_input("input1.txt")


read_input (generic function with 1 method)

In [42]:
# Write output alignments to the "output1.txt" file
# Reference: https://biojulia.net/FASTX.jl/dev/manual/fasta/

function write_output()
    
end 

write_output (generic function with 1 method)