In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

import os

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 60)

from dask import dataframe as dd
from dask.dot import dot_graph
from dask.diagnostics import ProgressBar

# Files

In [3]:
# Nearest-feature BED table for SNP list MS_1 (judging by the directory name,
# produced by the pipeline's `get_nearest_k_features` step).
# NOTE(review): absolute path inside one user's home directory -- the notebook
# will not run on another machine without editing this.
k_nearest_1_ = "/home/gus/MEGAsync/zim/main/Yale/Collaborations/Hongyu-tsetse/gmm_to_gff_pipeline/pipeline_runs/gmm_to_gff_testing_v2/get_nearest_k_features/snp_list_MS_1_with_p_value.nearest.bed"

# CSV of feature IDs with differential-expression results (judging by the
# directory name, produced by the `make_id_table_with_diff_expr` step).
ids_with_diff_expr_ = "/home/gus/MEGAsync/zim/main/Yale/Collaborations/Hongyu-tsetse/gmm_to_gff_pipeline/pipeline_runs/gmm_to_gff_testing_v2/make_id_table_with_diff_expr/ids_with_diff_expr.csv"

# Functions

In [4]:
def load_k_nearest_bed_by_distance(path, distance, keep_cols=None, rename_cols=None):
    """Read a nearest-feature BED table and keep rows within ``distance``.

    The file is parsed as header-less, tab-separated text with 17 columns:
    three SNP columns, the feature-set name, twelve BED feature columns and a
    trailing ``distance`` column.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.
    distance : number
        Rows are kept when ``abs(<distance column>) <= distance``.
    keep_cols : list, optional
        Columns to return, named as they are *after* any renaming.
    rename_cols : dict, optional
        ``{old_name: new_name}`` mapping applied before ``keep_cols``.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original row index preserved), optionally renamed
        and restricted to ``keep_cols``.
    """
    snp_cols = ["SNP_chrom", "SNP_start", "SNP_end"]
    bed_cols = [
        "chrom", "chromStart", "chromEnd",
        "name", "score", "strand",
        "thickStart", "thickEnd", "itemRgb",
        "blockCount", "blockSizes", "blockStarts",
    ]
    column_names = snp_cols + ["feature_set_name"] + bed_cols + ["distance"]

    table = pd.read_csv(path, sep="\t", names=column_names)

    # The threshold is formatted into the expression as a literal, so the bare
    # name `distance` inside the query refers to the column.
    within_range = table.query(""" abs(distance) <= {distance} """.format(distance=distance))

    if rename_cols is not None:
        assert isinstance(rename_cols, dict)
        within_range = within_range.rename(columns=rename_cols).copy()

    # Guard clause: no column selection requested.
    if keep_cols is None:
        return within_range

    assert isinstance(keep_cols, list)
    return within_range[keep_cols]

def load_de_genes_tx(path, chunksize=1000):
    """Open the differential-expression CSV for chunked reading.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    chunksize : int, optional
        Number of rows per chunk (default 1000).

    Returns
    -------
    iterator of pandas.DataFrame
        Whatever ``pd.read_csv`` returns when given ``chunksize``; iterating
        it yields successive DataFrames of at most ``chunksize`` rows.
    """
    return pd.read_csv(path, chunksize=chunksize)


def join_k_nearest_with_de(knearest_paths, de_path, result_path):
    """Join zero-distance nearest features to differential-expression rows.

    For every nearest-feature BED file, rows with ``distance == 0`` are loaded
    via ``load_k_nearest_bed_by_distance`` (with ``name`` renamed to
    ``proximal_id``), split by ``feature_set_name`` and inner-joined, chunk by
    chunk, against the CSV at ``de_path`` on
    ``proximal_id == gene_id_internal``. All matches are concatenated and
    written to ``result_path`` as CSV without the index.

    Parameters
    ----------
    knearest_paths : str or list of str
        One path, or a list of paths, to nearest-feature BED files.
    de_path : str
        Path to the differential-expression CSV; it must contain a
        ``gene_id_internal`` column.
    result_path : str
        Destination of the joined CSV.

    Raises
    ------
    ValueError
        From ``pd.concat`` when nothing was collected at all (for example an
        empty path list, or no zero-distance rows in any input).
    """
    if not isinstance(knearest_paths, list):
        knearest_paths = [knearest_paths]

    results = []

    for knearest in knearest_paths:

        kn_df = load_k_nearest_bed_by_distance(path=knearest,
                                               distance=0,
                                               keep_cols=["SNP_chrom", "SNP_start", "SNP_end",
                                                          "feature_set_name", "proximal_id"],
                                               rename_cols={"name": "proximal_id"}
                                              )

        # Grouping keeps the output ordered by feature set, then by chunk.
        for _, group in kn_df.groupby('feature_set_name'):

            # Read from the caller-supplied `de_path`, never from a
            # notebook-level global, so the function honours its arguments.
            for chunk in load_de_genes_tx(path=de_path, chunksize=10000):

                chunk_x = group.merge(right=chunk, how='inner',
                                      left_on="proximal_id",
                                      right_on="gene_id_internal")
                # merge() already returns a new frame; no extra copy needed.
                results.append(chunk_x)

    joined = pd.concat(results)

    joined.to_csv(result_path, index=False)
    

# Load inputs

In [19]:
# SNP -> feature pairs where the SNP lies inside the feature (distance == 0),
# trimmed to the SNP coordinates, feature set and feature id.
k_nearest_1 = load_k_nearest_bed_by_distance(path=k_nearest_1_,
                                             distance=0,
                                             keep_cols=["SNP_chrom","SNP_start","SNP_end","feature_set_name","proximal_id"],
                                             rename_cols={"name": "proximal_id"}
                                            )
# Only the first 10 rows of the DE table, for inspection. The builtin next()
# works on both Python 2 and Python 3 iterators, unlike the `.next()` method.
ids_with_diff_expr_chunk1 = next(load_de_genes_tx(path=ids_with_diff_expr_, chunksize=10))

In [20]:
# Preview the first rows of the zero-distance SNP -> feature table.
k_nearest_1.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,proximal_id
17,Scaffold106,349186,349187,official_annotations,GFUI002898
136,Scaffold2,1652016,1652017,novel_mapped_tx,TCONS_00070877
156,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034486
157,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034487
158,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034488


In [21]:
# reshape_k_nearest(k_nearest_1)

In [22]:
# Same preview as before: the reshape call in the preceding cell is commented
# out, so the table is unchanged.
k_nearest_1.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,proximal_id
17,Scaffold106,349186,349187,official_annotations,GFUI002898
136,Scaffold2,1652016,1652017,novel_mapped_tx,TCONS_00070877
156,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034486
157,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034487
158,Scaffold22,1291804,1291805,novel_mapped_tx,TCONS_00034488


In [23]:
# Preview the first rows of the first differential-expression chunk.
ids_with_diff_expr_chunk1.head()

Unnamed: 0,de_id,xloc_id,tcons_id,gene_id_external,gene_id_internal,lg2_fc,p,fdr,comparison,program
0,EDGR|SG_0000001,XLOC_020808,TCONS_00042534,GMOY005976,GFUI020234,4.531178,2.718538e-81,3.3277619999999997e-77,SG,edger
1,EDGR|SG_0000001,XLOC_020808,TCONS_00042535,GMOY005976,GFUI020234,4.531178,2.718538e-81,3.3277619999999997e-77,SG,edger
2,EDGR|SG_0000001,XLOC_020808,TCONS_00042536,GMOY005976,GFUI020234,4.531178,2.718538e-81,3.3277619999999997e-77,SG,edger
3,EDGR|SG_0000002,XLOC_038272,TCONS_00078943,GMOY010998,GFUI030635,4.571477,3.4784630000000003e-75,2.128994e-71,SG,edger
4,EDGR|SG_0000003,XLOC_025906,TCONS_00052530,GMOY007362,GFUI035588,4.637093,5.930163e-72,2.419704e-68,SG,edger


# Join tables

### via DASK

In [10]:
# k_nearest_1_d = dd.from_pandas(k_nearest_1,npartitions=1)

In [11]:
# x = k_nearest_1_d.merge( right=ids_with_diff_expr, how='left',
#          on=None, left_on="official_annotations", right_on="gene_id_internal",
#          left_index=False, right_index=False)

In [12]:
# # Start a progress bar for all computations
# pbar = ProgressBar()
# pbar.register()

In [13]:
# df = x.compute()

In [14]:
# df.shape

### via pandas-chunks

In [33]:
out_file = "/tmp/test-chunks/first_feature_set.csv"

# Open the output once, in write mode. This truncates any file left by an
# earlier run (replacing the shell `!rm`, which prints an error on a fresh run
# when the file does not exist) and avoids re-opening the file for every chunk.
with open(out_file, 'w') as out:

    for name, group in k_nearest_1.groupby('feature_set_name'):
        print("Processing group: {g}".format(g=name))

        # Stream the DE table in chunks and write each chunk's matches at once.
        for chunk in load_de_genes_tx(path=ids_with_diff_expr_, chunksize=10000):

            chunk_x = group.merge(right=chunk, how='inner',
                                  left_on="proximal_id",
                                  right_on="gene_id_internal")
            # header=False: the per-chunk results are written back to back, so
            # a header on each one would be repeated throughout the file.
            chunk_x.to_csv(out, header=False, index=False)
        

Processing group: novel_mapped_tx
Processing group: official_annotations


In [44]:
def join_k_nearest_with_de(knearest_paths, de_path, result_path):
    """Join zero-distance nearest features to differential-expression rows.

    For every nearest-feature BED file, rows with ``distance == 0`` are loaded
    via ``load_k_nearest_bed_by_distance`` (with ``name`` renamed to
    ``proximal_id``), split by ``feature_set_name`` and inner-joined, chunk by
    chunk, against the CSV at ``de_path`` on
    ``proximal_id == gene_id_internal``. All matches are concatenated and
    written to ``result_path`` as CSV without the index.

    Parameters
    ----------
    knearest_paths : str or list of str
        One path, or a list of paths, to nearest-feature BED files.
    de_path : str
        Path to the differential-expression CSV; it must contain a
        ``gene_id_internal`` column.
    result_path : str
        Destination of the joined CSV.

    Raises
    ------
    ValueError
        From ``pd.concat`` when nothing was collected at all (for example an
        empty path list, or no zero-distance rows in any input).
    """
    if not isinstance(knearest_paths, list):
        knearest_paths = [knearest_paths]

    results = []

    for knearest in knearest_paths:

        kn_df = load_k_nearest_bed_by_distance(path=knearest,
                                               distance=0,
                                               keep_cols=["SNP_chrom", "SNP_start", "SNP_end",
                                                          "feature_set_name", "proximal_id"],
                                               rename_cols={"name": "proximal_id"}
                                              )

        # Grouping keeps the output ordered by feature set, then by chunk.
        for _, group in kn_df.groupby('feature_set_name'):

            # Read from the caller-supplied `de_path`, never from a
            # notebook-level global, so the function honours its arguments.
            for chunk in load_de_genes_tx(path=de_path, chunksize=10000):

                chunk_x = group.merge(right=chunk, how='inner',
                                      left_on="proximal_id",
                                      right_on="gene_id_internal")
                # merge() already returns a new frame; no extra copy needed.
                results.append(chunk_x)

    joined = pd.concat(results)

    joined.to_csv(result_path, index=False)
        

In [45]:
out_file = "/tmp/test-chunks/first_feature_set.csv"

# Remove any stale result first, so that a failed join cannot leave an old file
# behind. Plain Python replaces the shell `!rm`, which prints an error on a
# fresh run when the file does not exist.
if os.path.exists(out_file):
    os.remove(out_file)

join_k_nearest_with_de(knearest_paths=k_nearest_1_, de_path=ids_with_diff_expr_, result_path=out_file)