## Imports:

In [3]:
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

import pandas as pd
pd.set_option('display.max_columns', 60)



## File paths:

In [7]:
# Input files read by the loader functions in this notebook.
# NOTE(review): both are absolute paths under one user's home directory, so the
# notebook cannot be re-run on another machine without editing them.
cuffcmp_tracking_path = "/home/gus/MEGAsync/zim/main/Yale/Collaborations/Hongyu-tsetse/gmm_to_gff_pipeline/pipeline_prep/2016-02-05_comprehensive_Gmm_transcript_set/Gisella2016_Morsitan_transcriptome/cuffcmp.tracking"
ortho_table_path = "/home/gus/MEGAsync/zim/main/Yale/Collaborations/Hongyu-tsetse/gmm_to_gff_pipeline/pipeline_prep/vectorbase_orthologs_GMOY_GFUI.csv"

# Functions

In [34]:
def load_ortho_table(path):
    """Load the ortholog table, keeping only its first two columns.

    Parameters
    ----------
    path : str or file-like
        Comma-separated file with a header row, passed straight to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The first two columns of the file, relabelled ``gene_id_external``
        and ``gene_id_internal`` (in that order); any further columns are
        dropped.
    """
    # Positional slice: the original header names are ignored and replaced.
    ortho = pd.read_csv(path).iloc[:, :2]
    ortho.columns = ["gene_id_external", "gene_id_internal"]
    return ortho

def load_tracking_table(path):
    """Load a tab-separated tracking file and extract the reference gene id.

    Only the first three columns are read, by position, so the result does
    not depend on how many further columns (class code, per-sample entries)
    the file carries. Naming exactly five columns, as before, made pandas
    treat the leading columns as an index whenever the file had more than
    five fields, silently shifting every value.

    Parameters
    ----------
    path : str or file-like
        Header-less, tab-separated file, passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tcons_id``, ``xloc_id`` and ``gene_id_external``.
        ``gene_id_external`` is the text before the first ``|`` of the third
        column; a value without ``|`` (such as the ``-`` placeholder) is kept
        unchanged, and a missing value stays missing instead of raising.
    """
    raw = pd.read_csv(
        path,
        sep='\t',
        header=None,
        usecols=[0, 1, 2],
        names=["tcons_id", "xloc_id", "gene_id_tx_id"],
        dtype=str,  # keep ids as text so the .str accessor is always valid
    )

    # '-'.split('|')[0] is already '-', so no special case is needed.
    raw['gene_id_external'] = raw['gene_id_tx_id'].str.split('|', n=1).str[0]

    return raw[["tcons_id", "xloc_id", "gene_id_external"]].copy()

def combine_tracking_and_orthologs(tracking,orthologs):
    """Outer-join the tracking and ortholog tables on ``gene_id_external``.

    Parameters
    ----------
    tracking : pandas.DataFrame
        Left table; must contain a ``gene_id_external`` column.
    orthologs : pandas.DataFrame
        Right table; must contain a ``gene_id_external`` column.

    Returns
    -------
    pandas.DataFrame
        Every row of both inputs. A key present on both sides yields one row
        per matching pair, so a key repeated in ``orthologs`` repeats the
        corresponding tracking row. Cells left empty by the join are filled
        with the string ``'-'``.
    """
    merged = tracking.merge(orthologs, how='outer', on="gene_id_external", sort=False)
    return merged.fillna('-')

# Load files

In [35]:
# Parse the tracking file at cuffcmp_tracking_path and preview the first rows.
tracking = load_tracking_table(cuffcmp_tracking_path)
tracking.head()

Unnamed: 0,tcons_id,xloc_id,gene_id_external
0,TCONS_00000001,XLOC_000001,-
1,TCONS_00000002,XLOC_000002,-
2,TCONS_00000003,XLOC_000003,-
3,TCONS_00000004,XLOC_000004,-
4,TCONS_00000005,XLOC_000005,GMOY000001


In [36]:
# Read the ortholog CSV at ortho_table_path (first two columns only) and preview it.
ortho_table = load_ortho_table(ortho_table_path)
ortho_table.head()

Unnamed: 0,gene_id_external,gene_id_internal
0,GMOY005425,GFUI009881
1,GMOY005426,GFUI009872
2,GMOY005427,GFUI009879
3,GMOY005428,GFUI009867
4,GMOY005430,GFUI009870


# Correctly combine tables

In [37]:
# Outer join on gene_id_external: rows from either table without a partner are
# kept, with the missing side filled with '-'.
combined = combine_tracking_and_orthologs(tracking=tracking, orthologs=ortho_table)

In [38]:
# Rows per transcript id after the join. A count above 1 means the transcript's
# gene_id_external occurs several times in the ortholog table. The '-' entry is
# presumably ortholog rows with no matching tracking row (filled by fillna('-'));
# this assumes the tracking file never uses '-' as a tcons_id -- TODO confirm.
combined.tcons_id.value_counts().head()

-                 211
TCONS_00045763     14
TCONS_00045764     14
TCONS_00051409     14
TCONS_00045775     14
Name: tcons_id, dtype: int64

In [39]:
# Inspect one of the most frequent transcripts: the single tracking row is
# repeated once for every gene_id_internal paired with its gene_id_external.
combined.query(''' tcons_id == "TCONS_00045763" ''')

Unnamed: 0,tcons_id,xloc_id,gene_id_external,gene_id_internal
60558,TCONS_00045763,XLOC_022433,GMOY006454,GFUI023722
60559,TCONS_00045763,XLOC_022433,GMOY006454,GFUI027649
60560,TCONS_00045763,XLOC_022433,GMOY006454,GFUI028167
60561,TCONS_00045763,XLOC_022433,GMOY006454,GFUI046094
60562,TCONS_00045763,XLOC_022433,GMOY006454,GFUI033906
60563,TCONS_00045763,XLOC_022433,GMOY006454,GFUI022812
60564,TCONS_00045763,XLOC_022433,GMOY006454,GFUI005975
60565,TCONS_00045763,XLOC_022433,GMOY006454,GFUI044439
60566,TCONS_00045763,XLOC_022433,GMOY006454,GFUI031543
60567,TCONS_00045763,XLOC_022433,GMOY006454,GFUI032259


In [32]:
# NOTE(review): this cell's execution count (32) is lower than that of the cells
# before it, and the recorded 53277 rows is smaller than the row labels (60558+)
# displayed by the query cell above, so the saved output appears stale.
# Re-run after the merge cell before relying on this number.
combined.shape

(53277, 4)

In [33]:
# Shape after removing fully identical rows; equal to combined.shape when the
# join produced no exact duplicate rows.
# NOTE(review): executed out of order (count 33), so the saved output may be stale.
combined.drop_duplicates().shape

(53277, 4)

In [None]:
# FIXME: `to_csv` is only referenced here, not called, so no file is written.
# Call it with an explicit output path to actually save the combined table.
combined.to_csv