<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Data-Growth" data-toc-modified-id="Data-Growth-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Growth</a></span></li><li><span><a href="#Hierarchical-Clustering" data-toc-modified-id="Hierarchical-Clustering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Hierarchical Clustering</a></span></li><li><span><a href="#PCA" data-toc-modified-id="PCA-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>PCA</a></span></li><li><span><a href="#Normalize-to-reference-conditions" data-toc-modified-id="Normalize-to-reference-conditions-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Normalize to reference conditions</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#ONLY-FOR-PRECISE-DATA" data-toc-modified-id="ONLY-FOR-PRECISE-DATA-5.0.1"><span class="toc-item-num">5.0.1&nbsp;&nbsp;</span>ONLY FOR PRECISE DATA</a></span></li></ul></li></ul></li></ul></div>

<font size="4">This is a template notebook for exploratory analysis on your organism's QC'ed dataset.</font>

In [1]:
import pandas as pd
import os
from os import path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Apply seaborn's 'ticks' style to the plots drawn after this cell.
sns.set_style('ticks')

# Load data

In [3]:
# Organism label for this analysis.
organism = "Synechococcus_elongatus"

In [6]:
# Directory that holds the input tables read below and receives the output CSV.
# NOTE: machine-specific absolute path -- point it at your own copy of the data.
data_dir = '/home/tahani/Documents/elongatus/data/1_iModulon'

# Show what is available in the data directory.
os.listdir(data_dir)

['GO_annotations.csv',
 'GO_enrichments.csv',
 'precision_recall_plot.csv',
 'X_prime.csv',
 'M.csv',
 'iModulon_table.csv',
 'NtcA.pdf',
 'Precision_recall.csv',
 'kegg_mapping.csv',
 '3_log_tpm_normalized_ICA.csv',
 '.~lock.0_metadata_317.csv#',
 'imodulon_table_0.8.csv',
 '2_gene_info.csv',
 'sample_table_prime.csv',
 'A.csv',
 'component_stats.csv',
 '3_log_tpm_qc_ICA.csv',
 'gene_info_operon.csv',
 'gene_table.csv',
 '0_log_tpm.csv',
 '.~lock.M.csv#',
 '.~lock.0_log_tpm.csv#',
 'pnas_genes.csv',
 'selon_new.json',
 'M_0.8.csv',
 'A_0.8.csv',
 'gene_info.csv',
 '3_metadata_qc_ICA.csv',
 'TRN.csv',
 'sample_table.csv',
 'iModulon_tables',
 '.~lock.gene_info_operon.csv#',
 '.~lock.gene_info_new_use.csv#',
 'plot3.csv',
 'X.csv',
 'pie_cateogries.csv',
 'ppGpp_wt_rel_samples.csv']

In [7]:
# Load the sample metadata and the log-TPM expression matrix; in both files
# the first CSV column becomes the index.
DF_metadata = pd.read_csv(
    path.join(data_dir, 'sample_table_prime.csv'),
    index_col=0,
)
DF_log_tpm = pd.read_csv(
    path.join(data_dir, '0_log_tpm.csv'),
    index_col=0,
)

# (rows, columns) of the metadata table
DF_metadata.shape

(317, 62)

In [8]:
# (rows, columns) of the log-TPM expression matrix
DF_log_tpm.shape

(2700, 317)

In [9]:
# Reference condition recorded for each sample.
DF_metadata.ref_condition

current
SRX063252               wt
SRX063253               wt
SRX063254               wt
SRX2356912    wt_1dusk_000
SRX2356913    wt_1dusk_000
                  ...     
SRX7119038       wt_normal
SRX7119039       wt_normal
SRX7119040       wt_normal
SRX7119041       wt_normal
SRX8650808              wt
Name: ref_condition, Length: 317, dtype: object

# Normalize to reference conditions

In [10]:
# Project label of each sample; the normalization below groups samples by this column.
DF_metadata.project

current
SRX063252      RNAseq
SRX063253      RNAseq
SRX063254      RNAseq
SRX2356912       rpaA
SRX2356913       rpaA
               ...   
SRX7119038       DHAR
SRX7119039       DHAR
SRX7119040       DHAR
SRX7119041       DHAR
SRX8650808    RNAseq2
Name: project, Length: 317, dtype: object

In [11]:
# Normalize each project to its own reference condition: the mean log-TPM of
# the project's reference samples is subtracted, row by row, from every
# sample in that project.
project_exprs = []
for name, group in DF_metadata.groupby('project'):
    ref_cond = group.ref_condition.unique()

    # Ensure that there is only one reference condition per project
    if len(ref_cond) != 1:
        raise ValueError(
            "Project {!r} must have exactly one reference condition, "
            "found {}: {}".format(name, len(ref_cond), list(ref_cond))
        )
    ref_cond = ref_cond[0]

    # Get reference condition sample ids
    ref_samples = group[group.condition == ref_cond].index

    # Ensure the reference condition is in fact in the project. Checking the
    # selected samples (instead of list membership) also catches a missing
    # (NaN) ref_condition, which never compares equal to anything and would
    # otherwise yield an all-NaN reference expression.
    if ref_samples.empty:
        raise ValueError(
            "Reference condition {!r} of project {!r} does not match the "
            "condition of any sample in that project".format(ref_cond, name)
        )

    # Get reference condition expression (row-wise mean over the reference samples)
    ref_expr = DF_log_tpm[ref_samples].mean(axis=1)

    # Subtract reference expression from project
    project_exprs.append(DF_log_tpm[group.index].sub(ref_expr, axis=0))

DF_log_tpm_norm = pd.concat(project_exprs, axis=1)

# groupby skips samples whose project is missing, so verify that every sample
# listed in the metadata made it into the normalized matrix.
dropped = DF_metadata.index.difference(DF_log_tpm_norm.columns)
if len(dropped) > 0:
    raise ValueError(
        "{} sample(s) were not normalized (missing project?): {}".format(
            len(dropped), list(dropped)
        )
    )

In [12]:
# Preview the first rows of the reference-normalized matrix.
DF_log_tpm_norm.head()

Unnamed: 0_level_0,SRX7119038,SRX7119039,SRX7119040,SRX7119041,SRX259777,SRX259778,SRX259779,SRX259780,SRX259781,SRX259782,...,SRX4105559,SRX4105560,SRX4105561,SRX4105562,SRX4105563,SRX4105564,SRX4105565,SRX4105566,SRX4105567,SRX4105568
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HKK26_RS00005,0.0,0.21182,-0.340956,0.897594,-0.289881,0.069071,0.22081,-0.02548,-0.353734,0.161761,...,2.974372,3.248056,2.937651,1.049121,1.933418,1.692057,2.469696,2.8349,2.940724,3.210861
HKK26_RS00010,0.0,0.837445,0.918481,0.275618,0.182292,-0.005015,-0.177277,0.361253,-1.051174,-0.964382,...,-3.524118,-2.874861,-1.223941,1.09715,-0.166723,-1.604495,-4.02166,-1.470318,-2.233886,-0.929678
HKK26_RS00015,0.0,0.485131,0.251544,-0.144028,-0.090748,0.562848,-0.4721,0.014403,-0.390883,0.495542,...,-4.009437,-3.415142,-3.296091,-4.556207,-4.124352,-4.210934,-3.895414,-3.808367,-3.35094,-3.65331
HKK26_RS00020,0.0,0.078771,-0.647875,0.98096,-0.068027,0.160992,-0.092965,-0.151076,-0.766445,-0.934767,...,0.580617,0.514203,0.452077,-3.6051,0.151764,0.359884,0.84312,0.954885,-0.087203,0.886827
HKK26_RS00025,0.0,0.026178,-0.925658,0.573557,-0.618613,0.323662,0.294951,-0.080843,-0.946594,0.137341,...,-2.44633,-0.22277,-2.44633,-2.44633,1.164969,1.941654,1.657787,0.487913,-2.44633,0.388025


Tahani Tuesday Oct 6, 4pm 

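<font size=4>The cell below saves the log_tpm_norm file (X_prime_normalized.csv) to the data directory. Comment it out if you do not want to overwrite an existing file.</font>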

In [13]:
# Write the reference-normalized matrix next to the input tables.
DF_log_tpm_norm.to_csv(
    path_or_buf=path.join(data_dir, 'X_prime_normalized.csv'),
)

In [14]:
# Sanity check: (rows, columns) of the normalized matrix; should match DF_log_tpm.shape.
DF_log_tpm_norm.shape

(2700, 317)