# Trabalho de Bioinformática
- Ana Cristina Silva de Oliveira, 11965630
- Fernando Henrique Paes Generich, 11795342
- Vítor Amorim Fróis, 12543440

In [1]:
import pandas as pd
from Bio import Entrez, SeqIO, Seq
from tqdm import tqdm

## Leitura dos arquivos .gff3
Vamos selecionar todas as linhas em que a Strand é positiva:

In [35]:
filenames = [
  'TEAnnotationFinal_Helitron.gff3',
  'TEAnnotationFinal_LINE.gff3',
  'TEAnnotationFinal_LTR.gff3',
  'TEAnnotationFinal_MITE.gff3',
  'TEAnnotationFinal_SINE.gff3',
  'TEAnnotationFinal_TIR.gff3'
]

def process_file(filename: str) -> pd.DataFrame:
  df = pd.read_csv(f'data/{filename}', sep='\t', header=None)
  df.drop(df[ df[6] != '+' ].index , inplace = True)
  df = df.drop( [1, 5, 7, 8], axis=1)
  return df

te_df = pd.DataFrame()

# Juntando os dados em um único DataFrame
for f in filenames:
  te_df = pd.concat([te_df, process_file(f)], ignore_index=True)

# Renomeando colunas
te_df.rename(columns={0 : "Chr", 2: "Class", 3: "Start", 4: "End", 6: "Strand"}, inplace = True)

# Seleciona Chr numérico apenas
te_df = te_df[te_df['Chr'].astype(str).str.isdigit()]

te_df.head()

Unnamed: 0,Chr,Class,Start,End,Strand
0,7,Class II subclass 2/Helitron/Helitron,82856122,82857584,+
1,1,Class II subclass 2/Helitron/Helitron,239302471,239302834,+
2,8,Class II subclass 2/Helitron/Helitron,2261318,2261604,+
3,5,Class II subclass 2/Helitron/Helitron,124665404,124665608,+
4,9,Class II subclass 2/Helitron/Helitron,77998293,78002783,+


## Obtendo as sequências de cromossomos do NCBI
No total, são 10 cromossomos

In [5]:
Entrez.email = "fernando_gene@usp.br"

ind = 618874

allchromosomes = []

for i in tqdm(range(10)):
  gen_bank_term = "LR" + str(ind+i) + ".1"

  handle = Entrez.esearch(db="nucleotide", term=gen_bank_term, retmax="10")
  rec_list = Entrez.read(handle)
  handle.close()

  id_list = rec_list['IdList']
  handle = Entrez.efetch(db='nucleotide', id=id_list, rettype='fasta', retmode="text")
  recs = list(SeqIO.parse(handle, 'fasta'))
  handle.close()

  allchromosomes.append(recs[0])

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [09:06<00:00, 54.64s/it]


In [16]:
chromosome_dict = {
  'Number': [],
  'Sequence': []
}

for i in allchromosomes:
  chromosome_dict['Number'].append(i.id)
  chromosome_dict['Sequence'].append(i.seq)

chromosome_df = pd.DataFrame(chromosome_dict)

chromosome_df

Unnamed: 0,Number,Sequence
0,LR618874.1,"(T, C, A, T, G, G, C, T, A, T, T, T, T, C, A, ..."
1,LR618875.1,"(T, T, C, T, C, A, T, T, A, T, A, T, C, C, T, ..."
2,LR618876.1,"(C, C, T, A, A, A, C, C, C, T, A, A, A, C, C, ..."
3,LR618877.1,"(C, C, T, A, A, A, C, C, C, T, A, A, A, C, C, ..."
4,LR618878.1,"(C, T, A, A, A, C, C, T, A, A, A, C, A, T, C, ..."
5,LR618879.1,"(A, A, A, A, C, C, C, T, A, A, A, C, C, C, T, ..."
6,LR618880.1,"(C, T, A, A, A, A, C, C, C, T, A, A, A, C, C, ..."
7,LR618881.1,"(A, A, A, C, C, T, A, A, A, C, C, C, T, A, A, ..."
8,LR618882.1,"(G, T, C, G, C, T, C, A, T, G, G, C, T, A, T, ..."
9,LR618883.1,"(T, A, A, A, C, C, C, T, A, A, A, C, C, C, T, ..."


## Efetuando um Join entre os DataFrames 

In [37]:
def get_sequence(chromosome: int, start: int, end: int) -> Seq.Seq:
  return chromosome_df.Sequence[chromosome-1][start:end+1]

te_df['Sequence'] = te_df.apply(lambda x: get_sequence(int(x.Chr), int(x.Start), int(x.End)), axis=1)

Deleta linhas que possuem NaN

In [10]:
te_df = te_df.dropna()
te_df.head()

Unnamed: 0,Chr,Class,Start,End,Strand,Sequence
0,7,Class II subclass 2/Helitron/Helitron,82856122,82857584,+,AGCTTCGTCACCAGCTTTGCTCCGACCACCCTTTGTCCATACTAAC...
1,1,Class II subclass 2/Helitron/Helitron,239302471,239302834,+,TCAGGGTTGCTTCTTGGCGAAGACAGGGCCTCGGGCGAGCCAGAAA...
2,8,Class II subclass 2/Helitron/Helitron,2261318,2261604,+,CGCCCAAGCAGACGGTCACCATCAGCGAAGACCTCACTTCGCATGA...
3,5,Class II subclass 2/Helitron/Helitron,124665404,124665608,+,ATGCCAAGTCGTGTCAAACGACTTAGGGTAGGGGTCAACTTTCTCC...
4,9,Class II subclass 2/Helitron/Helitron,77998293,78002783,+,TTAGGTTATTTATATACTAGTTTATGTTGATGATATAATCATCACT...


Salva o Dataframe como `.csv`

In [None]:
te_df.to_csv('data/transposable_elements.csv')

Leitura do arquivo csv

In [9]:
te_df = pd.read_csv('data/tranposable_elements.csv', index_col=0)

Vamos criar arquivos fasta para inserir no MathFeature

In [44]:
sequences = []
for index, sequence in list(zip(te_df.index, te_df['Sequence'])):
    sequences.append(SeqRecord.SeqRecord(Seq.Seq(sequence), id=str(index)))