In [None]:
# Isabel Jiah-Yih Liao
# September 2024 
# Synteny main 

##############
### Notes: 
##############
# Please ensure the correct intake files are in the correct directories prior to running. 

# If you would like to include an existing orthofinder run, create a directory under your root directory 
# called 'orthofinder_output' and place the results in that folder. If you are running orthofinder 
# within the script, do not create this directory, as orthofinder will not run if it already exists. 

# Expected files: 
#
# root_directory (specified in settings) 
# ├── Synteny_main.ipynb
# ├── dependencies
# │   └── Synteny_functions.ipynb
# ├── input_data
# │   ├── gene_rows
# │   │   ├── sp1.gtf
# │   │   ├── sp2.gtf
# │   │   ├── sp3.gtf
# │   │   └── ...
# │   ├── genome
# │   │   ├── sp1.fna
# │   │   ├── sp2.fna
# │   │   ├── sp3.fna
# │   │   └── ...
# │   └── proteomes
# │       ├── sp1.fasta
# │       ├── sp2.fasta
# │       ├── sp3.fasta
# │       └── ...
# └── orthofinder_output (OPTIONAL INPUT: can also be run from this script) 
#     └── Results_MmmDD (can be any name) 
#         ├── Orthogroups
#         │   ├── Orthogroups.tsv
#         │   └── ...
#         └── ... 
#
# Make sure all input files have a backup copy to avoid losing information. 

# Resulting directory after running the program: 
# root_directory
# ├── Synteny_main.ipynb
# ├── dependencies
# │   └── ...
# ├── input_data
# │   └── ...
# └── run_name 
#     ├── output
#     │   ├── sp1_coordinates.tsv
#     │   ├── sp1_karyotype.txt
#     │   ├── sp2_coordinates.tsv
#     │   ├── sp2_karyotype.txt
#     │   ├── sp3_coordinates.tsv
#     │   └── sp3_karyotype.txt
#     └── run_files
#         ├── orthofinder_output
#         │   ├── Orthogroups
#         │   │   └── ...
#         │   └── ... 
#         └── run_proteomes
#             ├── sp1.fasta
#             ├── sp2.fasta
#             ├── sp3.fasta
#             └── ...

In [None]:
##########
## Preparation 
##########

# Import necessary dependencies
import subprocess 
import os
import re
import pickle
import pandas as pd
from Bio import SeqIO 
%run Synteny_functions.ipynb

In [None]:
##########
## Settings
##########

# Set up directories
# Directory containing Synteny_main.ipynb, dependencies, and input data
root_directory = './synteny'

# List of codes for the respective species to include
species_codes = ['sp1', 'sp2', 'sp3']
# Name of this run; per the directory layout described at the top of the notebook,
# results are collected in a folder of this name.
run_name = 'synteny_finder'

# Location of your executable orthofinder
orthofinder_path = '/path/to/orthofinder'
# Thread count passed to synteny.run_orthofinder(). 150 is the requested maximum; it is
# capped at the number of CPUs the OS reports so that smaller machines are not
# oversubscribed. os.cpu_count() may return None, hence the fallback to 1.
threads = min(150, os.cpu_count() or 1)

In [None]:
##########
## START
##########

# Build the Synteny object that the following cells operate on.
# Proteome files are expected to use the .fasta extension rather than .fa.
synteny = Synteny(
    root_directory,
    run_name,
    species_codes,
    proteome_ext='.fasta',
)

In [None]:
##########
## Creating the karyotype files
##########

# The genome file of each species is read to find the chromosomes and their lengths.
# Give the number of chromosomes n for each species: the n longest scaffolds in each
# genome are taken to be the chromosomes.

# Tip: if you are unsure how many chromosomes a species has, set its value higher
# and inspect how the resulting scaffolds are named.

chromosomes = {
    'sp1': 19,
    'sp2': 18,
    'sp3': 18,
}
synteny.build_karyotype(chromosomes)

In [None]:
# To format the karyotype file, pass a list containing the values of each column.
# 'Chromosome', 'Length', and 'SPECIES' are keywords: each is replaced by its associated data.
# Any other entry in the 'columns' list yields a column filled with that literal value.
# Optionally, pass a 'labels' list to rename the columns.
columns = [
    'Chromosome',
    '1',
    'Length',
    'SPECIES',
    '12',
    '25252',
]
labels = [
    'Chr',
    'Start',
    'End',
    'species',
    'size',
    'color',
]
synteny.clean_karyotype(columns, labels)

In [None]:
# Write the karyotype files to the output folder. 
# Per the directory layout described at the top of the notebook, this is expected to
# produce one <species>_karyotype.txt per species under <run_name>/output.
synteny.write_karyotype()

In [None]:
# # Read the proteomes into Python 
# synteny.read_proteomes()

In [None]:
# # Given longer ids with some kind of a pattern, take only a portion of the id as the gene name. 
# # For instance, if the id is XXX_XXXX_GENEID we want to keep only the third column, using '_' as 
# # a delimiter. 
# # If no delimiter is specified, '_' is assumed. 
# synteny.proteome_id_trim('sp1', position = 3)

In [None]:
# # Modify select headers to add species code to gene names and remove description information. 
# # This would yield more precise proteomes to feed into orthofinder. 
# # Run this with either a list containing a subset of species to modify, or run without 
# # any parameters to modify all proteomes in species_codes. 
# synteny.proteome_add_species()
# synteny.write_proteomes()

In [None]:
# ##########
# ## Running OrthoFinder
# ##########
# # This only needs to be done once per set of species so keep it commented out to avoid running 
# # it accidentally
# synteny.run_orthofinder(orthofinder_path, threads)

In [None]:
# synteny.incorporate_orthofinder()

In [None]:
# Get the single copy orthologues from the orthogroups file from orthofinder results. 
# NOTE(review): presumably this needs the orthofinder results to be in place already
# (either supplied in 'orthofinder_output' or produced by the orthofinder cells above,
# which are commented out by default) — confirm before running on a fresh kernel.
synteny.single_copy_orthologues() 

In [None]:
# Read the gtf file to a python dataframe. This may take a moment. 
# Use the 'feature' parameter to filter for only a type of row, ex. 'CDS'
# Use the 'protein_id' parameter to search for protein_id in the annotation column
# Use the 'equivalence' parameter to define an alternative separator (ex. '=' rather than ' ')
# NOTE(review): the calls in the next cell select the annotation through the
# 'annotation_type' keyword (e.g. annotation_type = 'gene_id'); 'protein_id' above is
# presumably one possible value of that keyword rather than a parameter name — confirm
# against gtf_to_dataframe in Synteny_functions.ipynb.

# Tip: If you are uncertain about which annotation types or features to use, 
# run the line with no additional parameters first and examine the resulting dataframe. 
# This is that exploratory call for 'sp1'; the next cell calls gtf_to_dataframe again
# for 'sp1' with explicit parameters.
synteny.gtf_to_dataframe('sp1')

In [None]:
# Build the annotation dataframe of every species. Adjust the parameters to fit your input data:
synteny.gtf_to_dataframe('sp1', annotation_type='gene_id')
synteny.gtf_to_dataframe('sp2', annotation_type='transcript_id', feature='CDS')
synteny.gtf_to_dataframe('sp3', annotation_type='transcript_id', feature='CDS')

In [None]:
# Modify the annotation names to include the species codes followed by a pipe
# This matches the proteome_add_species() function. 
# NOTE(review): proteome_add_species() is commented out by default in this notebook;
# presumably the proteomes used for orthofinder must carry the same species prefix for
# the names to line up later — confirm for your input data.
synteny.gtf_add_species()

In [None]:
# View the gtf file information for a species
# The bare expression on the last line is what the notebook displays as the cell output;
# change 'sp1' to inspect another species code.
synteny.species_data['sp1']['gene_rows']

In [None]:
# Looking at both the single copy orthogroups and the gene annotations, there will
# sometimes be a slight 'mismatch' between the gene ids of these groups. For instance, 
# choosing transcript_id vs gene_id in the previous step would produce annotations with/without 
# a .t# suffix. While this discrepancy will not significantly alter the results, it will
# cause difficulty in matching orthologous genes to their annotations. 
# 
# If this is an issue and a species has an extra suffix in the proteome used initially, 
# we can use the truncate_sco('Species_code', 'Suffix') function. Alternatively, the 
# truncate_gene_rows() function does the same thing for the gene rows. 
#
# This is often not necessary to run. 
# synteny.truncate_sco('sp1', '.')

In [None]:
# Stitch together the orthogroup results with the associated gtf details to get the coordinates 
# for each gene. 
# The merged result is what a later cell views under
# synteny.species_data[<species>]['orthogene_coords'].
synteny.merge_gtf()

In [None]:
# Create an 'alg' group based on which chromosome each gene appears on for a given species. 
# This allows us to later colour genes using this grouping. 
# 'sp1' is the species whose chromosomes define the grouping here.
synteny.trace_chromosomes('sp1') 

In [None]:
# Define the columns of the output file, starting from the second column (the first
# column will be an index). An entry that names a column of the orthogene_coords
# dataframe pulls the data from that column; any other string is repeated down the
# whole column.

# The keyword 'ALG' inserts the alg (chromosome group) of each gene.
columns = [
    'annotation',
    'sequence',
    'start',
    'end',
    'ALG',
]
synteny.clean_coords(columns)

In [None]:
# View the gtf information of a species but only for the genes which have single-copy orthologs across the board 
# The bare expression on the last line is what the notebook displays as the cell output;
# change 'sp1' to inspect another species code.
synteny.species_data['sp1']['orthogene_coords']

In [None]:
# Write the coordinate files to the output folder. 
# Per the directory layout described at the top of the notebook, this is expected to
# produce one <species>_coordinates.tsv per species under <run_name>/output.
synteny.write_coords()