In [1]:
# Isabel Jiah-Yih Liao
# January 2024 
# Synteny main 

##############
### Notes: 
##############
# Please ensure the correct intake files are in the correct directories prior to running. 

# If you would like to include an orthofinder run, create a directory under your root directory 
# called 'orthofinder_output' and place the results in the folder. If you are running orthofinder 
# within the script, do not include this directory, as orthofinder will not run if it already exists. 

# Expected files: 
#
# root_directory (specified in settings) 
# ├── Synteny_main.ipynb
# ├── dependencies
# │   └── Synteny_functions.ipynb
# ├── input_data
# │   ├── gene_rows (directory for the .gtf files, doesn't need to be gene rows specifically)
# │   │   ├── Bfl_gene_rows.gtf
# │   │   ├── Cmu_gene_rows.gtf
# │   │   ├── Mme_gene_rows.gtf
# │   │   └── ...
# │   ├── genomes
# │   │   ├── Bfl.fna
# │   │   ├── Cmu.fna
# │   │   ├── Mme.fna
# │   │   └── ...
# │   └── proteomes
# │       ├── Bfl.fasta
# │       ├── Cmu.fasta
# │       ├── Mme.fasta
# │       └── ...
# └── orthofinder_output (OPTIONAL INPUT: can also be run from this script) 
#     └── Results_MmmDD (can be any name) 
#         ├── Orthogroups
#         │   ├── Orthogroups.tsv
#         │   └── ...
#         └── ... 
#
# Make sure all input files have a backup copy to avoid losing information. 

# Resulting directory after running the program: 
# root_directory
# ├── Synteny_main.ipynb
# ├── dependencies
# │   └── ...
# ├── input_data
# │   └── ...
# └── run_name 
#     ├── output
#     │   ├── Bfl_coordinates.gtf
#     │   ├── Bfl_karyotype.gtf
#     │   ├── Cmu_coordinates.gtf
#     │   ├── Cmu_karyotype.gtf
#     │   ├── Mme_coordinates.gtf
#     │   └── Mme_karyotype.gtf
#     └── run_files
#         ├── orthofinder_output
#         │   ├── Orthogroups
#         │   │   └── ...
#         │   └── ... 
#         └── run_proteomes
#             ├── Bfl.fasta
#             ├── Cmu.fasta
#             ├── Mme.fasta
#             └── ...

In [2]:
##########
## Preparation 
##########

# Import necessary dependencies
import subprocess 
import os
import re
import pickle
import pandas as pd
from Bio import SeqIO 
%run dependencies/Synteny_functions.ipynb

In [3]:
##########
## Settings
##########

# Set up directories 
# Path to directory for running (directory containing input files)
root_directory = './bryozoans'

# List of codes for the respective species to include
species_codes = ['Bfl', 'Bst', 'Cmu', 'Cpa', 'Mme', 'Wsu']

# Name of this run; run files and output are written under <root_directory>/<run_name>.
run_name = 'bryozoa_full_test'

# Location of your executable orthofinder.
# Set the ORTHOFINDER_PATH environment variable to override the default below,
# so the notebook does not need to be edited when it is run on another machine.
orthofinder_path = os.environ.get(
    'ORTHOFINDER_PATH', "/home/isabel/src/OrthoFinder/orthofinder"
)

# Number of threads handed to orthofinder (passed on as its -t option).
threads = 150

In [4]:
##########
## START
##########

# Create a Synteny object. Use .fasta for proteome file extensions rather than .fa. 
# As the messages printed by this cell show, construction checks the input files, builds 
# the directories for this run, and reports whether existing orthofinder results were found. 
synteny = Synteny(root_directory, run_name, species_codes, proteome_ext = '.fasta')

File check complete
Directories built for run...
No orthofinder results found .


In [5]:
##########
## Creating the karyotype files
##########

# The genome file of each species is read to find its scaffolds and their lengths.
# For every species code, give the number of chromosomes n: the n longest
# scaffolds of that genome are taken as its chromosomes.

# Tip: when unsure about n, start with a generous value and look at how the
# scaffolds are named in the resulting table.

chromosomes = dict(
    Bfl=19,
    Bst=11,
    Cmu=8,
    Cpa=12,
    Mme=11,
    Wsu=11,
)
synteny.build_karyotype(chromosomes)

Reading karyotype files: this may take a moment...


Unnamed: 0,Chromosome,Length
0,OY755158.1,79350317
1,OY755159.1,77133738
2,OY755160.1,75932528
3,OY755161.1,67125504
4,OY755162.1,66617907
5,OY755163.1,66545008
6,OY755164.1,66307180
7,OY755165.1,65553477
8,OY755166.1,64042602
9,OY755167.1,61084640


In [6]:
# Layout of the karyotype file, written as output label -> column value.
# 'Chromosome', 'Length', and 'SPECIES' are keywords that pull the associated data;
# any other value (here '1', '12', '25252') is repeated as-is down its column.
# The labels are optional and only rename the columns.
karyotype_layout = {
    'Chr': 'Chromosome',
    'Start': '1',
    'End': 'Length',
    'species': 'SPECIES',
    'size': '12',
    'color': '25252',
}
labels = list(karyotype_layout.keys())
columns = list(karyotype_layout.values())
synteny.clean_karyotype(columns, labels)

Unnamed: 0,Chr,Start,End,species,size,color
0,OY755158.1,1,79350317,Wsu,12,25252
1,OY755159.1,1,77133738,Wsu,12,25252
2,OY755160.1,1,75932528,Wsu,12,25252
3,OY755161.1,1,67125504,Wsu,12,25252
4,OY755162.1,1,66617907,Wsu,12,25252
5,OY755163.1,1,66545008,Wsu,12,25252
6,OY755164.1,1,66307180,Wsu,12,25252
7,OY755165.1,1,65553477,Wsu,12,25252
8,OY755166.1,1,64042602,Wsu,12,25252
9,OY755167.1,1,61084640,Wsu,12,25252


In [7]:
# Write the karyotype files to the output folder 
# (<run_name>/output/<code>_karyotype.gtf in the layout described in the notes at the top). 
synteny.write_karyotype()

In [8]:
# Read the proteomes into Python. 
# The value displayed by this cell is a list of Biopython SeqRecord objects, one per protein. 
synteny.read_proteomes()

Reading proteomes...


[SeqRecord(seq=Seq('MVTDPLRALLGKGADFKWEQRQEKAFQEVKQLVLSEHPTQVQIPVNRQNCVRNE...NYX'), id='g312.t1', name='g312.t1', description='g312.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MSRKDNAVDGDKVSKYEQTLRRASNRPEVILCSFYPSTRVPAAQYGIIGERDTR...LV*'), id='g313.t1', name='g313.t1', description='g313.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MRLGWSTYKTDANTALHPYWDEKGHFPERGTESVVDKEKEKRERTMMNYNKRHR...LY*'), id='g314.t1', name='g314.t1', description='g314.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MGKEGLEHYIERLWARYDGDEASDEASDEASDEASDEASDEASDEASDEASDEA...GL*'), id='g315.t1', name='g315.t1', description='g315.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MERGVLQGFVDYVYGLVTAQRVSEVHVVAACSEAWTANPKEYICIQENHLGVPV...DG*'), id='g316.t1', name='g316.t1', description='g316.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MNYPFQYYALKVLRRPEMKLYQVNKESQRLERWLRASTGSLLKASGVKLYQVNK...KD*'), id='g317.t1', name='g317.t1', description='g317.t1', dbxrefs=[]),
 SeqRecord(seq=Seq('MYEYQDIVSLINHYPGHESLWYYRKWLVYQLSLQQESSCRLTVEGELELEGEDD...DD*'), id='g318.t1', na

In [9]:
# We are using Bfl for ALGs, but we want the gene names to still fit the format. This is a tool 
# which renames genes to a substring of the current name. This allows the gene ids to match the 
# annotations later on. However, it only needs to be run on Bfl or other similarly formatted 
# proteomes. 

# Given longer ids with some kind of a pattern, take only a portion of the id as the gene name. 
# For instance, if the id is XXX_XXXX_GENEID we want to keep only the third field, using '_' as 
# a delimiter (ex. 7948_Ea_BL09372 becomes BL09372, so 'position' counts fields from 1). 
# If no delimiter is specified, '_' is assumed. 
synteny.proteome_id_trim('Bfl', position = 3)

[SeqRecord(seq=Seq('MMLARNPMPSRTRSLITAGDTTSTADYFASHNTASNRTLADPPSHVGLPPLKSQ...IVG'), id='BL09372', name='7948_Ea_BL09372', description='7948_Ea_BL09372', dbxrefs=[]),
 SeqRecord(seq=Seq('MATRLDRLFTLLDTGSTPVIRKSAALQIGQVQKLHPHELHNLLAKVLTFLRSDN...SLT'), id='BL24099', name='8096_H_BL24099', description='8096_H_BL24099', dbxrefs=[]),
 SeqRecord(seq=Seq('MRTSDHRRLIHRDVWPFSGPFLFGRCRPAGPGKGGLEERAPPPRGDAQGVGHPP...DKH'), id='BL02556', name='9923_F_BL02556', description='9923_F_BL02556', dbxrefs=[]),
 SeqRecord(seq=Seq('MTLTTGLSQPFRRLDKYPTLLKELDRHLEEGHPDRYDVQQAIPVYKNIAERKER...TNL'), id='BL13883', name='9315_N_BL13883', description='9315_N_BL13883', dbxrefs=[]),
 SeqRecord(seq=Seq('MNRDQVINFAPGPAKLPQEVLEQAQKDMLSYNNLGISVMEMSHRSSDFSKIINN...TNQ'), id='BL16740', name='3413_C2_BL16740', description='3413_C2_BL16740', dbxrefs=[]),
 SeqRecord(seq=Seq('MSGLHSKHGFKVEALKPKAGGKAVHGSAIPLILPNDLEYNVSTGAVEKVQGARR...LKH'), id='BL01007', name='5160_A1a_BL01007', description='5160_A1a_BL01007', dbxrefs=[]),
 SeqReco

In [10]:
# Modify select headers to add species code to gene names and remove description information. 
# This would yield more precise proteomes to feed into orthofinder. 
# Run this with either a list containing a subset of species to modify, or run without 
# any parameters to modify all proteomes in species_codes. 
# Ids take the form '<code>|<gene>' (ex. 'Wsu|g312.t1'), the same form gtf_add_species() 
# gives to the annotations later on. 
synteny.proteome_add_species()
# Write the modified proteomes to run_files/run_proteomes, the directory that is later 
# handed to orthofinder as its input. 
synteny.write_proteomes()

Proteome gene names modified for Bfl
Proteome gene names modified for Bst
Proteome gene names modified for Cmu
Proteome gene names modified for Cpa
Proteome gene names modified for Mme
Proteome gene names modified for Wsu
Bfl proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Bfl.fa.
Bst proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Bst.fa.
Cmu proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Cmu.fa.
Cpa proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Cpa.fa.
Mme proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Mme.fa.
Wsu proteomes successfully written to ./bryozoans/bryozoa_full_test/run_files/run_proteomes/Wsu.fa.


[SeqRecord(seq=Seq('MVTDPLRALLGKGADFKWEQRQEKAFQEVKQLVLSEHPTQVQIPVNRQNCVRNE...NYX'), id='Wsu|g312.t1', name='g312.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MSRKDNAVDGDKVSKYEQTLRRASNRPEVILCSFYPSTRVPAAQYGIIGERDTR...LV*'), id='Wsu|g313.t1', name='g313.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MRLGWSTYKTDANTALHPYWDEKGHFPERGTESVVDKEKEKRERTMMNYNKRHR...LY*'), id='Wsu|g314.t1', name='g314.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MGKEGLEHYIERLWARYDGDEASDEASDEASDEASDEASDEASDEASDEASDEA...GL*'), id='Wsu|g315.t1', name='g315.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MERGVLQGFVDYVYGLVTAQRVSEVHVVAACSEAWTANPKEYICIQENHLGVPV...DG*'), id='Wsu|g316.t1', name='g316.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MNYPFQYYALKVLRRPEMKLYQVNKESQRLERWLRASTGSLLKASGVKLYQVNK...KD*'), id='Wsu|g317.t1', name='g317.t1', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MYEYQDIVSLINHYPGHESLWYYRKWLVYQLSLQQESSCRLTVEGELELEGEDD...DD*'), id='Wsu|g318.t1', name='g318.t1', 

In [11]:
##########
## Running OrthoFinder
##########
# OrthoFinder only needs to be run once per set of species and takes a long time, so this
# cell skips the run when results are already present for this run_name instead of relying
# on the call being commented out by hand. OrthoFinder also refuses to run when its results
# already exist (see the notes at the top of the notebook).
# Set force_orthofinder = True to launch it regardless.
force_orthofinder = False

# Same location the run is pointed at through orthofinder's -o option.
orthofinder_output = os.path.join(root_directory, run_name, 'run_files', 'orthofinder_output')

# An existing but empty directory does not count as results.
has_orthofinder_results = (
    os.path.isdir(orthofinder_output) and bool(os.listdir(orthofinder_output))
)

if force_orthofinder or not has_orthofinder_results:
    synteny.run_orthofinder(orthofinder_path, threads)
else:
    print(f"OrthoFinder results already present in {orthofinder_output}; skipping the run.")

['nohup', '/home/isabel/src/OrthoFinder/orthofinder', '-f', './bryozoans/bryozoa_full_test/run_files/run_proteomes', '-o', './bryozoans/bryozoa_full_test/run_files/orthofinder_output', '-t', '150']

OrthoFinder version 2.5.4 Copyright (C) 2014 David Emms

2024-01-19 09:28:36 : Starting OrthoFinder 2.5.4
150 thread(s) for highly parallel tasks (BLAST searches etc.)
16 thread(s) for OrthoFinder algorithm

Checking required programs are installed
----------------------------------------
Test can run "mcl -h" - ok
Test can run "fastme -i ./bryozoans/bryozoa_full_test/run_files/orthofinder_output/Results_Jan19/WorkingDirectory/SimpleTest.phy -o ./bryozoans/bryozoa_full_test/run_files/orthofinder_output/Results_Jan19/WorkingDirectory/SimpleTest.tre" - ok

Dividing up work for BLAST for parallel processing
--------------------------------------------------
2024-01-19 09:28:37 : Creating diamond database 1 of 6
2024-01-19 09:28:37 : Creating diamond database 2 of 6
2024-01-19 09:28:37 : Creati

In [12]:
# Bring the orthofinder results into the synteny object so that the following steps can use them. 
# NOTE(review): presumably this picks up the Results_* directory under 
# run_files/orthofinder_output; confirm in Synteny_functions.ipynb. 
synteny.incorporate_orthofinder()

In [13]:
# Get the single copy orthologues from the orthogroups file from orthofinder results. 
# The displayed table has one row per orthogroup and one column per species code, 
# each cell holding a single '<code>|<gene>' id. 
synteny.single_copy_orthologues() 

Unnamed: 0,Orthogroup,Bfl,Bst,Cmu,Cpa,Mme,Wsu
3794,OG0003794,Bfl|BL24099,Bst|g3299.t1,Cmu|g20770.t1,Cpa|g29337.t1,Mme|g1166.t1,Wsu|g824.t1
3795,OG0003795,Bfl|BL02556,Bst|g18397.t1,Cmu|g17124.t1,Cpa|g9366.t1,Mme|g13681.t1,Wsu|g16010.t1
3796,OG0003796,Bfl|BL13883,Bst|g12973.t2,Cmu|g25307.t1,Cpa|g23633.t1,Mme|g25764.t2,Wsu|g20045.t1
3797,OG0003797,Bfl|BL16740,Bst|g16271.t1,Cmu|g19511.t1,Cpa|g6068.t1,Mme|g18655.t1,Wsu|g11821.t1
3798,OG0003798,Bfl|BL14725,Bst|g11405.t1,Cmu|g18040.t1,Cpa|g2841.t1,Mme|g16154.t1,Wsu|g13954.t1
...,...,...,...,...,...,...,...
5070,OG0005070,Bfl|BL07670,Bst|g3564.t1,Cmu|g21355.t1,Cpa|g28532.t1,Mme|g1681.t1,Wsu|g918.t1
5071,OG0005071,Bfl|BL24248,Bst|g3733.t1,Cmu|g12936.t1,Cpa|g9834.t1,Mme|g11780.t1,Wsu|g5758.t1
5072,OG0005072,Bfl|BL17600,Bst|g2214.t1,Cmu|g10198.t1,Cpa|g23724.t1,Mme|g2806.t1,Wsu|g2085.t2
5073,OG0005073,Bfl|BL05536,Bst|g6089.t1,Cmu|g7630.t1,Cpa|g12058.t1,Mme|g12008.t1,Wsu|g7389.t1


In [14]:
# Read the gtf file to a python dataframe. This may take a moment. 
# Use the 'feature' parameter to filter for only a type of row, ex. 'CDS'
# Use the 'annotation_type' parameter to pick which attribute is extracted into the 
# annotation column, ex. 'gene_id' or 'transcript_id' (presumably 'protein_id' works the 
# same way; confirm in Synteny_functions.ipynb)
# Use the 'equivalence' parameter to define an alternative separator (ex. '=' rather than ' ')

# Tip: If you are uncertain about which annotation types or features to use, 
# run the line with no additional parameters first and examine the resulting dataframe. 
# Called this way, the annotation column simply repeats the full attribute string. 
synteny.gtf_to_dataframe('Bfl')

Unnamed: 0,sequence,source,feature,start,end,score,strand,frame,attribute,annotation
0,Sc7u5tJ_1007,EVM,gene,847,3719,.,+,.,"gene_id ""BL25399""; copy_id ""BL25399_1""; covera...","gene_id ""BL25399""; copy_id ""BL25399_1""; covera..."
1,Sc7u5tJ_1014,EVM,gene,156,1489,.,+,.,"gene_id ""BL02338""; copy_id ""BL02338_1""; covera...","gene_id ""BL02338""; copy_id ""BL02338_1""; covera..."
2,Sc7u5tJ_1022,EVM,gene,3300,4921,.,+,.,"gene_id ""BL13249""; copy_id ""BL13249_1""; covera...","gene_id ""BL13249""; copy_id ""BL13249_1""; covera..."
3,Sc7u5tJ_1022,EVM,gene,7663,14871,.,+,.,"gene_id ""BL13248""; copy_id ""BL13248_1""; covera...","gene_id ""BL13248""; copy_id ""BL13248_1""; covera..."
4,Sc7u5tJ_1022,EVM,gene,15865,24273,.,+,.,"gene_id ""BL13247""; copy_id ""BL13247_1""; covera...","gene_id ""BL13247""; copy_id ""BL13247_1""; covera..."
...,...,...,...,...,...,...,...,...,...,...
28177,Sc7u5tJ_97,EVM,gene,9101,10795,.,-,.,"gene_id ""BL22972""; copy_id ""BL22972_1""; covera...","gene_id ""BL22972""; copy_id ""BL22972_1""; covera..."
28178,Sc7u5tJ_97,EVM,gene,11209,12522,.,-,.,"gene_id ""BL22971""; copy_id ""BL22971_1""; covera...","gene_id ""BL22971""; copy_id ""BL22971_1""; covera..."
28179,Sc7u5tJ_97,EVM,gene,18441,41685,.,+,.,"gene_id ""BL22970""; copy_id ""BL22970_1""; covera...","gene_id ""BL22970""; copy_id ""BL22970_1""; covera..."
28180,Sc7u5tJ_985,EVM,gene,155,2722,.,-,.,"gene_id ""BL04168""; copy_id ""BL04168_1""; covera...","gene_id ""BL04168""; copy_id ""BL04168_1""; covera..."


In [15]:
# Build the gene-row dataframe of every species.
# Bfl is annotated per gene, so its annotation is taken from gene_id.
synteny.gtf_to_dataframe('Bfl', annotation_type='gene_id')

# The remaining species all use the same settings: keep the CDS rows and
# take the annotation from transcript_id.
cds_species = ['Bst', 'Cmu', 'Cpa', 'Mme', 'Wsu']
for code in cds_species:
    gene_rows = synteny.gtf_to_dataframe(code, annotation_type='transcript_id', feature='CDS')

# Show the dataframe of the last species processed.
gene_rows

Unnamed: 0,sequence,source,feature,start,end,score,strand,frame,attribute,annotation
3,CAUYWC010000002.1,AUGUSTUS,CDS,570969,570987,0.34,-,1,"transcript_id ""g1.t1""; gene_id ""g1"";",g1.t1
30,CAUYWC010000002.1,AUGUSTUS,CDS,586843,586856,0.49,+,0,"transcript_id ""g2.t1""; gene_id ""g2"";",g2.t1
42,CAUYWC010000002.1,AUGUSTUS,CDS,799606,799660,0.99,-,1,"transcript_id ""g3.t1""; gene_id ""g3"";",g3.t1
54,CAUYWC010000002.1,AUGUSTUS,CDS,805524,805529,0.87,-,0,"transcript_id ""g4.t1""; gene_id ""g4"";",g4.t1
78,CAUYWC010000002.1,AUGUSTUS,CDS,834389,834774,0.81,-,2,"transcript_id ""g5.t1""; gene_id ""g5"";",g5.t1
...,...,...,...,...,...,...,...,...,...,...
744975,OY755168.1,AUGUSTUS,CDS,45752836,45752844,0.89,+,0,"transcript_id ""g21340.t1""; gene_id ""g21340"";",g21340.t1
744984,OY755168.1,AUGUSTUS,CDS,45880500,45880567,0.86,-,2,"transcript_id ""g21341.t1""; gene_id ""g21341"";",g21341.t1
744996,OY755168.1,AUGUSTUS,CDS,45904430,45904930,0.68,+,0,"transcript_id ""g21342.t1""; gene_id ""g21342"";",g21342.t1
745002,OY755168.1,AUGUSTUS,CDS,45904990,45905624,0.14,+,0,"transcript_id ""g21343.t1""; gene_id ""g21343"";",g21343.t1


In [16]:
# Modify the annotation names to include the species codes followed by a pipe
# (ex. 'g1.t1' becomes 'Wsu|g1.t1'). 
# This matches the proteome_add_species() function. 
# The value displayed by this cell is the updated annotation column of one species (Wsu here). 
synteny.gtf_add_species()

3             Wsu|g1.t1
30            Wsu|g2.t1
42            Wsu|g3.t1
54            Wsu|g4.t1
78            Wsu|g5.t1
              ...      
744975    Wsu|g21340.t1
744984    Wsu|g21341.t1
744996    Wsu|g21342.t1
745002    Wsu|g21343.t1
745014    Wsu|g21344.t1
Name: annotation, Length: 25115, dtype: object

In [17]:
# Inspect the gene rows stored for Mme to check that the annotation column now carries 
# the 'Mme|' species prefix. 
synteny.species_data['Mme']['gene_rows']

Unnamed: 0,sequence,source,feature,start,end,score,strand,frame,attribute,annotation
3,CAJZBW010000001.1,AUGUSTUS,CDS,532,660,0.68,+,0,"transcript_id ""g1.t1""; gene_id ""g1"";",Mme|g1.t1
12,CAJZBW010000001.1,AUGUSTUS,CDS,1677,1883,0.68,+,0,"transcript_id ""g2.t1""; gene_id ""g2"";",Mme|g2.t1
24,CAJZBW010000001.1,AUGUSTUS,CDS,7458,7633,0.51,+,0,"transcript_id ""g3.t1""; gene_id ""g3"";",Mme|g3.t1
33,CAJZBW010000001.1,AUGUSTUS,CDS,9495,9749,0.94,+,0,"transcript_id ""g4.t1""; gene_id ""g4"";",Mme|g4.t1
39,CAJZBW010000001.1,AUGUSTUS,CDS,17803,17867,0.57,+,0,"transcript_id ""g5.t1""; gene_id ""g5"";",Mme|g5.t1
...,...,...,...,...,...,...,...,...,...,...
1013191,OU612075.1,AUGUSTUS,CDS,23357678,23358242,0.99,-,1,"transcript_id ""g25905.t1""; gene_id ""g25905"";",Mme|g25905.t1
1013209,OU612075.1,AUGUSTUS,CDS,23362280,23362404,0.6,+,0,"transcript_id ""g25906.t1""; gene_id ""g25906"";",Mme|g25906.t1
1013218,OU612075.1,AUGUSTUS,CDS,23364373,23364678,1,-,0,"transcript_id ""g25907.t1""; gene_id ""g25907"";",Mme|g25907.t1
1013224,OU612075.1,AUGUSTUS,CDS,23364881,23365243,0.99,-,0,"transcript_id ""g25908.t1""; gene_id ""g25908"";",Mme|g25908.t1


In [18]:
# Looking at both the single copy orthogroups and the gene annotations, there will
# sometimes be a slight 'mismatch' between the gene ids of these groups. For instance, 
# choosing transcript_id vs gene_id in the previous step would produce annotations with/without 
# a .t# suffix. While this discrepancy will not significantly alter the results, it will
# cause difficulty in matching orthologous genes to their annotations. 
# 
# If this is an issue and a species has an extra suffix in the proteome used initially, 
# we can use the truncate_sco('Species_code', 'Suffix') function. Alternatively, the 
# truncate_gene_rows() function does the same thing for the gene rows. 
#
# This is unnecessary in the current example, but here is the syntax for how it would be run: 
# synteny.truncate_sco('Mme', '.')

In [19]:
# Stitch together the orthogroup results with the associated gtf details to get the coordinates 
# for each gene. 
# The merged table of a species can be inspected afterwards through 
# synteny.species_data[<code>]['orthogene_coords'] (as done for Bfl further down). 
synteny.merge_gtf()

Unnamed: 0,Wsu,sequence,source,feature,start,end,score,strand,frame,attribute,annotation
0,Wsu|g824.t1,OY755158.1,AUGUSTUS,CDS,10918680,10918690,1,+,0,"transcript_id ""g824.t1""; gene_id ""g824"";",Wsu|g824.t1
1,Wsu|g16010.t1,OY755165.1,AUGUSTUS,CDS,34398365,34398379,0.82,-,0,"transcript_id ""g16010.t1""; gene_id ""g16010"";",Wsu|g16010.t1
2,Wsu|g20045.t1,OY755167.1,AUGUSTUS,CDS,56807019,56807205,1,+,0,"transcript_id ""g20045.t1""; gene_id ""g20045"";",Wsu|g20045.t1
3,Wsu|g11821.t1,OY755162.1,AUGUSTUS,CDS,36510878,36510979,0.99,-,0,"transcript_id ""g11821.t1""; gene_id ""g11821"";",Wsu|g11821.t1
4,Wsu|g13954.t1,OY755164.1,AUGUSTUS,CDS,5598219,5598694,0.96,-,2,"transcript_id ""g13954.t1""; gene_id ""g13954"";",Wsu|g13954.t1
...,...,...,...,...,...,...,...,...,...,...,...
1247,Wsu|g918.t1,OY755158.1,AUGUSTUS,CDS,13057033,13057251,0.82,-,0,"transcript_id ""g918.t1""; gene_id ""g918"";",Wsu|g918.t1
1248,Wsu|g5758.t1,OY755160.1,AUGUSTUS,CDS,8082399,8082441,0.66,+,0,"transcript_id ""g5758.t1""; gene_id ""g5758"";",Wsu|g5758.t1
1249,Wsu|g2085.t2,OY755158.1,AUGUSTUS,CDS,41112718,41112780,1,-,0,"transcript_id ""g2085.t2""; gene_id ""g2085"";",Wsu|g2085.t2
1250,Wsu|g7389.t1,OY755160.1,AUGUSTUS,CDS,64387045,64387471,0.77,+,0,"transcript_id ""g7389.t1""; gene_id ""g7389"";",Wsu|g7389.t1


In [20]:
# Pass in the species code corresponding to the proteome with the ALG information. 
# The default parameters assume ids of the format index_alg_gene (ex. 9197_A1b_BL14725), 
# such that columns = ['index', 'alg', 'gene_id'] and delimiter = '_'; however, these can 
# all be changed as long as 'gene_id' and 'alg' are included. 
# 'mapping_add_species' is set to True by default and adds the species code followed by a | 
# prior to each id (ex. Bfl|BL09372), but this option can also be changed through the 
# intake parameters. 
synteny.parse_algs('Bfl')

Unnamed: 0,index,alg,gene_id
0,7948,Ea,Bfl|BL09372
1,8096,H,Bfl|BL24099
2,9923,F,Bfl|BL02556
3,9315,N,Bfl|BL13883
4,3413,C2,Bfl|BL16740
...,...,...,...
2356,3992,N,Bfl|BL26045
2357,1335,B1,Bfl|BL04267
2358,8287,D,Bfl|BL05536
2359,12843,I,Bfl|BL21913


In [21]:
# Create a 'trace' group based on the ALG of each Bfl gene. This is what the 'TRACE' keyword 
# of clean_coords() reports further down (the ALG labels such as H, F, N in the coordinate table). 
# NOTE(review): presumably relies on the mapping built by parse_algs('Bfl'); confirm. 
synteny.trace_algs('Bfl') 

In [22]:
# # Create a 'trace' group based on which chromosome each gene appears on for a given species. 
# # This allows us to later colour genes using this grouping. 
# synteny.trace_chromosomes('Mme') 

In [23]:
# Starting from the second column, provide a list to define the columns of the output file. 
# If a given item in the list is a column of the orthogene_coords dataframe, the information 
# from that column will be used. Otherwise, the string will be repeated for the entire column 
# (ex. 'Complete' below is not a dataframe column, so every row receives the text 'Complete'). 
# The first column will be an index. 

# If the keyword 'TRACE' is used, the alg or chromosome trace will be included in the column. 
# In this run the trace comes from trace_algs('Bfl'), so the column holds ALG labels. 
columns = ['Complete', 'sequence', 'start', 'end', 'TRACE'] 
synteny.clean_coords(columns)

Unnamed: 0,Complete,sequence,start,end,TRACE
0,Complete,OY755158.1,10918680,10918690,H
1,Complete,OY755165.1,34398365,34398379,F
2,Complete,OY755167.1,56807019,56807205,N
3,Complete,OY755162.1,36510878,36510979,C2
4,Complete,OY755164.1,5598219,5598694,A1b
...,...,...,...,...,...
1247,Complete,OY755158.1,13057033,13057251,J2
1248,Complete,OY755160.1,8082399,8082441,G
1249,Complete,OY755158.1,41112718,41112780,I
1250,Complete,OY755160.1,64387045,64387471,D


In [24]:
# Inspect the merged orthologue coordinates stored for Bfl (one row per single copy orthologue, 
# with the gtf fields of the matching gene). 
synteny.species_data['Bfl']['orthogene_coords']

Unnamed: 0,Bfl,sequence,source,feature,start,end,score,strand,frame,attribute,annotation
0,Bfl|BL24099,Sc7u5tJ_1552,EVM,gene,20259441,20295001,.,-,.,"gene_id ""BL24099""; copy_id ""BL24099_1""; covera...",Bfl|BL24099
1,Bfl|BL02556,Sc7u5tJ_1559,EVM,gene,2023441,2049604,.,+,.,"gene_id ""BL02556""; copy_id ""BL02556_1""; covera...",Bfl|BL02556
2,Bfl|BL13883,Sc7u5tJ_566,EVM,gene,21977322,22002718,.,+,.,"gene_id ""BL13883""; copy_id ""BL13883_1""; covera...",Bfl|BL13883
3,Bfl|BL16740,Sc7u5tJ_366,EVM,gene,2633405,2639698,.,+,.,"gene_id ""BL16740""; copy_id ""BL16740_1""; covera...",Bfl|BL16740
4,Bfl|BL14725,Sc7u5tJ_339,EVM,gene,24190631,24209724,.,-,.,"gene_id ""BL14725""; copy_id ""BL14725_1""; covera...",Bfl|BL14725
...,...,...,...,...,...,...,...,...,...,...,...
1247,Bfl|BL07670,Sc7u5tJ_1590,EVM,gene,24518767,24523958,.,+,.,"gene_id ""BL07670""; copy_id ""BL07670_1""; covera...",Bfl|BL07670
1248,Bfl|BL24248,Sc7u5tJ_1565,EVM,gene,10986510,11024484,.,-,.,"gene_id ""BL24248""; copy_id ""BL24248_1""; covera...",Bfl|BL24248
1249,Bfl|BL17600,Sc7u5tJ_1568,EVM,gene,33518158,33536473,.,-,.,"gene_id ""BL17600""; copy_id ""BL17600_1""; covera...",Bfl|BL17600
1250,Bfl|BL05536,Sc7u5tJ_1571,EVM,gene,4845665,4846864,.,+,.,"gene_id ""BL05536""; copy_id ""BL05536_1""; covera...",Bfl|BL05536


In [25]:
# Write the coordinate files to the output folder 
# (<run_name>/output/<code>_coordinates.gtf in the layout described in the notes at the top). 
synteny.write_coords()