# 1. Set up working directory

In [1]:
!cd ~
!ls
!mkdir capstone_project
!cd capstone_project
!pwd

TP53_CDS.fa	  tp53_protein_seq.fa
capstone_project  tp53_reference_data_retrieval.ipynb
mkdir: cannot create directory ‘capstone_project’: File exists
/home/thalia/capstone_project


# 2. Retrieve TP53 Gene Information from Ensembl

In [2]:
!curl -s 'https://rest.ensembl.org/lookup/id/ENSG00000141510?expand=1' \
-H 'Content-type:application/json' | jq

[1;39m{[0m
  [1;34m"seq_region_name"[0m[1;39m:[0m [0;32m"17"[0m[1;39m,[0m
  [1;34m"display_name"[0m[1;39m:[0m [0;32m"TP53"[0m[1;39m,[0m
  [1;34m"logic_name"[0m[1;39m:[0m [0;32m"ensembl_havana_gene_homo_sapiens"[0m[1;39m,[0m
  [1;34m"strand"[0m[1;39m:[0m [0;39m-1[0m[1;39m,[0m
  [1;34m"end"[0m[1;39m:[0m [0;39m7687546[0m[1;39m,[0m
  [1;34m"db_type"[0m[1;39m:[0m [0;32m"core"[0m[1;39m,[0m
  [1;34m"species"[0m[1;39m:[0m [0;32m"homo_sapiens"[0m[1;39m,[0m
  [1;34m"biotype"[0m[1;39m:[0m [0;32m"protein_coding"[0m[1;39m,[0m
  [1;34m"Transcript"[0m[1;39m:[0m [1;39m[[0m
    [1;39m{[0m
      [1;34m"gencode_primary"[0m[1;39m:[0m [0;39m0[0m[1;39m,[0m
      [1;34m"logic_name"[0m[1;39m:[0m [0;32m"havana_homo_sapiens"[0m[1;39m,[0m
      [1;34m"display_name"[0m[1;39m:[0m [0;32m"TP53-203"[0m[1;39m,[0m
      [1;34m"seq_region_name"[0m[1;39m:[0m [0;32m"17"[0m[1;39m,[0m
      [1;34m"strand"[0m[1;39m:[

In [3]:
!curl -s 'https://rest.ensembl.org/lookup/id/ENSG00000141510?expand=1' \
-H 'Content-type:application/json' | jq -r '[.seq_region_name, .start, .end] | @tsv'

17	7661779	7687546


# 4. Download TP53 CDS from Ensembl (FASTA)

In [4]:
!curl -s 'https://rest.ensembl.org/sequence/id/ENST00000269305?type=cds' \
  -H 'Content-type:text/x-fasta' > TP53_CDS.fa

# 5. Translate to Protein Sequence with Biopython and Save to a FASTA file

In [5]:
from Bio import SeqIO

# Load the single record stored in the downloaded CDS FASTA file.
rec = SeqIO.read("TP53_CDS.fa", "fasta")
cds = rec.seq

# Translation works codon by codon, so any bases left over after the last
# complete codon are dropped before translating.
leftover = len(cds) % 3
if leftover:
    print("Warning: Sequence length not a multiple of 3. Trimming extra bases.")
    cds = cds[:len(cds) - leftover]
else:
    print("Sequence length is valid for translation.")

# to_stop=True ends the translation at the first stop codon.
protein = cds.translate(to_stop=True)
print(protein)

# Manual verification step: compare the printed sequence with the UniProt
# entry P04637.

# Write the protein as a single-record FASTA file (header line, then sequence).
with open("tp53_protein_seq.fa", "w") as fasta_out:
    fasta_out.writelines([">tp53_protein\n", str(protein) + "\n"])

print("Protein sequence saved to tp53_protein_seq.fa")

Sequence length is valid for translation.
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD
Protein sequence saved to tp53_protein_seq.fa


# 6. Check contents of new file

In [6]:
!cat tp53_protein_seq.fa | head

>tp53_protein
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD


# Result: 

I now have a protein FASTA file (`tp53_protein_seq.fa`) derived from the TP53 CDS FASTA, which can be used for
downstream analyses such as variant annotation and functional prediction.