In [1]:
import pandas as pd 
import pickle
import seaborn as sns
import altair as alt

In [10]:
# Data
#
# Pre-computed frames are read from pickle files under ../data/go_terms.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.

with open("../data/go_terms/df_melted_kmeans_data.p", "rb") as fh:
    df_melted_kmeans_data = pickle.load(fh)
with open("../data/go_terms/df_melted_birch_data.p", "rb") as fh:
    df_melted_birch_data = pickle.load(fh)

with open("../data/go_terms/df_all_merged_kmeans_data.p", "rb") as fh:
    df_all_merged_kmeans_data = pickle.load(fh)
with open("../data/go_terms/df_all_merged_birch_data.p", "rb") as fh:
    df_all_merged_birch_data = pickle.load(fh)


In [11]:
condition_to_loci_file = "../data/new_sample2condition.txt"

# Load the mapping from sample to condition.
# Only the first line of the file is parsed.
# NOTE(review): presumably that line is a single delimiter-wrapped list of
# "sample":"condition" pairs separated by commas — confirm against the file.
with open(condition_to_loci_file,'r') as f:
    # Read the line exactly once. Calling readline() a second time to measure the
    # length would consume the *next* line and slice with the wrong end index.
    mapping_line = f.readline().strip()

# Drop the first and last characters (the enclosing delimiters), then split into pairs.
loci_to_ferm_run = mapping_line[1:-1].split(',')

# Each pair is `key:value`; double quotes and surrounding whitespace are removed first.
loci_tracker = dict(loci.replace('"', "").strip().split(':') for loci in loci_to_ferm_run)

# Unique condition names in order of first appearance. A plain set() would give an
# ordering that can differ between kernel sessions, and this list is later used as
# the category order for plotting, so it has to be deterministic.
conditions = list(dict.fromkeys(map(str.strip, loci_tracker.values())))
conditions

['lowCu',
 'lowO2_slow_growth',
 'aa3_KO',
 'uMax',
 'slow_growth',
 'highCu',
 'highO2_slow_growth',
 'NoLanthanum',
 'crotonic_acid',
 'lowO2_fast_growth',
 'WT_control',
 'unknown',
 'medCu',
 'lowO2_low_iron_fast_growth',
 'MeOH',
 'WithLanthanum',
 'NoCu',
 'NO3_lowO2_slow_growth',
 'lowCH4',
 'LanzaTech']

# Parallel Coordinates Plot

In [13]:
def altair_pcoords(df,title,xorder='exp_condition_order'):
    """Build a layered Altair chart of ``mean`` against ``xorder``, coloured by ``desc_string``.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns named by ``xorder``, ``mean``, ``desc_string``,
        ``locus_tag`` and every tooltip field listed below.
    title : str
        Chart title.
    xorder : str
        Column used for the nominal x axis; the frame is sorted by it before plotting.

    Returns
    -------
    The line layer and the point layer combined with ``+``.
    """
    # Legend layout: one column for up to 50 distinct locus tags, two columns beyond that.
    n_loci = len(df['locus_tag'].unique())
    if n_loci <= 50:
        col_num = 1
    else:
        col_num = 2

    # Mouse-over selection that picks the nearest series by desc_string.
    highlight = alt.selection(type='single', on='mouseover',
                              fields=['desc_string'], nearest=True)

    # Multi-selection on desc_string driven by clicks in the legend.
    selection = alt.selection_multi(fields=['desc_string'], bind='legend')

    # A series is emphasised when either selection matches it.
    emphasized = selection | highlight

    # NOTE(review): the sort field string carries the ':N' type suffix — confirm
    # that this is what EncodingSortField expects here.
    x_axis = alt.X(
        f'{xorder}:N',
        sort=alt.EncodingSortField(field=f"{xorder}:N", op="count"),
        axis=alt.Axis(labelAngle=-45),
    )
    gene_legend = alt.Legend(
        title='Gene',
        orient='right',
        labelLimit=0,
        columns=col_num,
        symbolLimit=200,
    )
    series_color = alt.Color('desc_string:N', legend=gene_legend)

    # Shared encodings for both layers.
    ordered_rows = df.sort_values(xorder)
    base = alt.Chart(ordered_rows, title=title).encode(
        x=x_axis,
        y='mean:Q',
        color=series_color,
        size=alt.value(100),
    )

    # Line layer: thicker and fully opaque when emphasised; carries the legend selection.
    line_layer = base.mark_line().encode(
        size=alt.condition(emphasized, alt.value(3), alt.value(1)),
        opacity=alt.condition(emphasized, alt.value(1), alt.value(0.5)),
    )
    line_layer = line_layer.add_selection(selection)
    line_layer = line_layer.properties(width=600, height=400).interactive()

    # Point layer: carries the tooltip and the mouse-over selection.
    tooltip_fields = ['locus_tag','product','group','gene', 'go_terms',
                      "start_coord", "end_coord", "length", "translation"]
    point_layer = base.mark_circle().encode(
        tooltip=tooltip_fields,
        opacity=alt.condition(emphasized, alt.value(1), alt.value(0.2)),
        size=alt.condition(emphasized, alt.value(100), alt.value(3)),
    ).add_selection(highlight)

    return line_layer + point_layer

def check_cluster_expression(df, cols,cluster_id = 0,title = "cluster title", just_data = False):
    """Melt per-condition columns into long form and optionally plot one cluster.

    Parameters
    ----------
    df : DataFrame
        Wide frame containing the id columns listed in ``id_vars`` below (except
        ``desc_string``, which is created here) plus one column per entry of ``cols``.
        Side effect: a ``desc_string`` column is added to (or overwritten on) ``df`` itself.
    cols : list of str or None
        Columns to melt into ``exp_condition`` / ``mean``. Their order also defines the
        category order of ``exp_condition_order``. When None, the module-level
        ``conditions`` list is used.
    cluster_id :
        Value of ``cluster_id`` to keep when plotting.
    title : str
        Title passed to ``altair_pcoords``.
    just_data : bool
        If True, return the melted frame (all clusters) instead of a chart.

    Returns
    -------
    The melted DataFrame when ``just_data`` is True, otherwise the result of
    ``altair_pcoords`` for the rows of the requested cluster.
    """
    # Honour the caller-supplied columns; previously the argument was silently
    # replaced by the global `conditions` list.
    cols = list(conditions) if cols is None else list(cols)

    # Label of the form "locus_tag|gene|product". Built with a comprehension so it
    # also works for an empty frame (row-wise apply returns a DataFrame in that case).
    df['desc_string'] = [
        f"{locus_tag}|{gene}|{product}"
        for locus_tag, gene, product in zip(df['locus_tag'], df['gene'], df['product'])
    ]

    id_vars = ['locus_tag','desc_string','cluster_id', "gene", "product", "start_coord", "end_coord", "length", "translation", "group", "go_terms"]
    dfm = df.melt(id_vars=id_vars, value_vars=cols,var_name='exp_condition',value_name="mean")

    # altair sort helper: categorical copy of exp_condition ordered like `cols`
    dfm["exp_condition_order"] = pd.Categorical(dfm["exp_condition"], categories=cols)

    if just_data:
        return dfm
    return altair_pcoords(dfm[dfm['cluster_id']==cluster_id],title)

In [14]:
# Plot cluster 8 of the k-means merged frame across all conditions.
check_cluster_expression(
    df_all_merged_kmeans_data,
    conditions,
    cluster_id=8,
    title="kmeans, cluster 8",
    just_data=False,
)

# MDS - tSNE Cluster Plot

In [15]:
# Read ../data/avg_log_ratio_scaled.csv into a DataFrame and display it.
# NOTE(review): the error output recorded for this cell names
# 'data/avg_log_ratio_scaled.csv' (no leading '../'), which does not match the path
# used here — the output looks stale; re-run the cell to confirm it now succeeds.
df_avg_log_ratio_scaled_tpm = pd.read_csv("../data/avg_log_ratio_scaled.csv")
df_avg_log_ratio_scaled_tpm

FileNotFoundError: [Errno 2] File data/avg_log_ratio_scaled.csv does not exist: 'data/avg_log_ratio_scaled.csv'