# Consensus and Profile

## Read Sample Input and Output

In [1]:
from rosalind.string_algorithms import get_consensus_string, get_profile_matrix
from rosalind.read_files import read_text, write_text, read_fasta
import pandas as pd

In [18]:
# Parse the FASTA sample; the displayed result has Identifier and Sequence columns.
sample_input = read_fasta('sample_input.txt')
print("Sample Input:\n", sample_input)

# Expected answer for the sample. A newline is appended so the string can be
# compared with the text print_output returns, which ends in '\n'.
# NOTE(review): presumably read_text drops the file's trailing newline - confirm.
sample_output = read_text('sample_output.txt')+ "\n"
print("\nSample Output:\n",sample_output)

Sample Input:
    Identifier  Sequence
0  Rosalind_1  ATCCAGCT
1  Rosalind_2  GGGCAACT
2  Rosalind_3  ATGGATCT
3  Rosalind_4  AAGCAACC
4  Rosalind_5  TTGGAACT
5  Rosalind_6  ATGCCATT
6  Rosalind_7  ATGGCACT

Sample Output:
 ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6



## Solve Sample Problem

In [19]:
def solve_problem(input):
    """Return the consensus string and profile matrix for a set of DNA strings.

    Parameters
    ----------
    input : pandas.DataFrame
        Table with a ``Sequence`` column holding one DNA string per row
        (the shape produced by ``read_fasta`` in this notebook).

    Returns
    -------
    tuple
        ``(consensus_string, profile_matrix)`` exactly as returned by
        ``get_consensus_string`` and ``get_profile_matrix``.
    """
    # Explode every sequence into single characters: one row per sequence,
    # one column per position.
    per_position_rows = [list(sequence) for sequence in input.Sequence]
    dna_list = pd.DataFrame(per_position_rows)

    return get_consensus_string(dna_list), get_profile_matrix(dna_list)


In [20]:
# Run the solver on the sample; the bare name on the last line displays the
# (consensus string, profile matrix) tuple.
my_sample_output = solve_problem(sample_input)
my_sample_output

('ATGCAACT',
    0  1  2  3  4  5  6  7
 A  5  1  0  0  5  5  0  0
 C  0  0  1  4  2  0  6  1
 G  1  1  6  3  0  1  0  0
 T  1  5  0  0  0  1  1  6)

In [21]:
def print_output(output, file_path = 'output.txt'):
    """Write a (consensus, profile) result to a text file, echo it and return it.

    Parameters
    ----------
    output : tuple
        ``output[0]`` is the consensus string; ``output[1]`` is a DataFrame
        whose index labels are written as row prefixes and whose rows hold
        the per-position counts.
    file_path : str
        Destination file; it is overwritten.

    Returns
    -------
    str
        The content of ``file_path`` as read back after writing: the
        consensus on line 1, then one ``"<label>: <count> <count> ..."`` line
        per profile row.
    """
    with open(file_path, 'w') as handle:
        handle.write(output[0] + '\n')
        handle.writelines(
            f"{label}: {' '.join(str(count) for count in counts)}\n"
            for label, counts in output[1].iterrows()
        )

    # Read the file back so the returned text is exactly what landed on disk.
    with open(file_path, 'r') as handle:
        output_string = handle.read()

    print("Output String:\n",output_string)
    return output_string



In [22]:
# Write the sample answer to disk and compare the returned text with the
# expected sample output; the cell displays the boolean result.
print_output(my_sample_output, "my_sample_output.txt") == sample_output

Output String:
 ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6



True

## Run Real Input

In [24]:
# Load the real dataset.
# NOTE(review): this cell reads 'rosalind_cons (1).txt' while the submission
# cell reads './rosalind_cons.txt' - confirm which file is the intended input.
real_input = read_fasta('rosalind_cons (1).txt')

# Solve and write the answer; the trailing ';' suppresses the echo of the returned string.
print_output(solve_problem(real_input), "my_rosalind_cons_output.txt");

Output String:
 GTAAACCTAGCAGATGGGTCGCGGCGGTCCTGGAGGACCCTTAACCCCGAGGGTAATAGCAGCAGACAATCCAAAGGCACGCAGTTAAAGAGATCTGAGACCATACCTCGAGGGATGACGCAGTCAATACCCTATCCCCCTGGCCGCCTATACTGCGGCAGTTTGGCGTAAATACTGACAAATGTTCAACCTTAAAACGGCTAACAAGACAAATGCTATCATACACGACAAAAGCAGAACATACCCACCCAAAAAAAGAGCGTGCCATATGCTATCGCTCTGCCATCCCCCCAATCCAGCACGCATTATCTGTTCTGGGTTAATACGACCCACGGAGTAAACAACGAGTTTGGCTTGCGAAGAGACGCTTGGTACCTTAATTATGACCCCAAAGGGCGCGAGGAGAAGCGAAATTACACACAGACACCAGAAAGGTATACGTAGAAACGCCGAAGAAAGAACATAATAACTCCCATCCAGCATGACAAAATCTCGTAAAAACGATGGAATCCGTAATCGAGACAGGTACGAGCTTAGTTTGCACGTCATGTGGACGACTGGGTCATGACCTACCTAACATGGGATGCGCAGAACATCGAGTGCTAAGGCGATCCACGACGGTAAAAGACGATAGCAAAGAACGCTTAGTTCGGCGCCCTGTGACGAGCCATCGAGCGTCTCGTGAAAACTAGAGGGCGGGAGAGTTCCAAAGAGACAATAGAGGGCACTATCCCCGAAAACGGGCTCGAGTGAAACGTCAGTCGTGCTCTCTCGTCCGCTCAGCTCAATGCTGACGCACAGAATATACAACGAGCGCGTCATTGGATCAGACGGAGCTCCATCCCAACAGTATAACCAGACGACGGCGCCGGTACAAAAATTTAATTCACCCTACCTACGCAATATTTATAGCCCCGCGGCCCGAAATCACCAAATAGAACAGAATCGTGAGGAACGACCAAATGCCTACACCGTAACTTA
A:

## Solution

In [56]:
def get_consensus_and_profile_matrix(input_path = './sample_input.txt', output_path = 'nucleotide_frequencies.txt' ):
    """Compute the profile matrix and consensus string of a FASTA file and save them.

    Parameters
    ----------
    input_path : str
        FASTA file, read with ``read_fasta``; the result must expose a
        ``Sequence`` column holding one DNA string per record.
    output_path : str
        Text file to (over)write. Line 1 is the consensus string, followed by
        one ``"<base>: <count> <count> ..."`` line for each of A, C, G, T.

    Returns
    -------
    None
        The result is only written to ``output_path``; a confirmation naming
        that file is printed.
    """
    nucleotides = ['A', 'C', 'G', 'T']

    dna_strings = read_fasta(input_path)

    # One row per sequence, one column per position.
    dna_strings_to_df = pd.DataFrame(dna_strings.Sequence.apply(list).tolist())

    # Row = nucleotide, column = position, cell = number of sequences that
    # carry that nucleotide at that position.
    freq_df = pd.DataFrame(
        [(dna_strings_to_df == base).sum().values for base in nucleotides],
        index=nucleotides,
    )

    # idxmax yields the most frequent base per position; on a tie it returns
    # the first matching row label, i.e. the earliest in A, C, G, T order.
    consensus = ''.join(freq_df.idxmax().values)

    with open(output_path, 'w') as file:
        file.write(consensus + '\n')
        for nucleotide, row in freq_df.iterrows():
            file.write(f"{nucleotide}: " + " ".join(map(str, row)) + '\n')

    # Report the file that was actually written; the message previously named
    # 'nucleotide_frequencies.txt' regardless of output_path.
    print(f"Output written to {output_path}")

In [59]:
# Run with the defaults: reads './sample_input.txt', writes 'nucleotide_frequencies.txt'.
get_consensus_and_profile_matrix()

Output written to nucleotide_frequencies.txt


## Submit Problem

In [62]:
# Produce the submission file from the real dataset.
get_consensus_and_profile_matrix(input_path = './rosalind_cons.txt', output_path = 'nucleotide_frequencies_submission.txt')

Output written to nucleotide_frequencies.txt
