# Set up plotting in the notebook

In [None]:
%matplotlib inline

In [None]:
# %matplotlib

# Imports

In [None]:
from __future__ import division

In [None]:
from bokeh.charts import Scatter, output_file, show
from bokeh.sampledata.autompg import autompg as df
from bokeh.models import HoverTool
from bokeh.models import GlyphRenderer
from bokeh.io import output_notebook

In [None]:
# output_notebook()

In [None]:
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.metrics import consensus_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, normalize, scale

from sklearn.feature_selection import SelectKBest

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Switch seaborn to its "poster" plotting context.
sns.set_context("poster")

# Load data

In [None]:
# Read the genes_log3WH.xlsx workbook into a DataFrame.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
data = pd.read_excel(
    io="/home/gus/MEGAsync/zim/main/BCH/Projects/Naresh/RNAseq/genes_log3WH.xlsx"
)

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

In [None]:
# Keep only the gene identifier and the six FPKM columns used in the rest of the notebook.
data_ = data[[
    "gene_id",
    "1WH1_FPKM", "1WK1_FPKM",
    "3WH1_FPKM", "3WK1_FPKM",
    "12WH2_FPKM", "12WK2_FPKM",
]]

# Transpose the data so we are PCA'ing the conditions

In [None]:
# Preview the first five rows of the column subset.
data_.head(n=5)

In [None]:
# Put the FPKM columns on the rows so that PCA treats each of them as one observation.
# gene_id is moved into the index *before* transposing: transposing while the string
# column is still present produces an object-dtype frame, whereas this keeps the
# matrix numeric and labels each column with its gene_id.
dataT_ = data_.set_index('gene_id').T

In [None]:
# Display the transposed frame (one row per FPKM column of data_).
dataT_

# PCA logic 

In [None]:
# Fit a 4-component PCA, using each row of dataT_ as one observation.
pca = PCA(n_components=4)
pca.fit(dataT_)

# Fraction of variance explained by each component, followed by their total.
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

# Prepare results and plot them

In [None]:
# Project the rows of dataT_ onto the fitted components and keep their row labels.
# The Index is passed as-is: wrapping it in a list makes pandas treat it as a list
# of level arrays and build a one-level MultiIndex, after which reset_index() emits
# a 'level_0' column instead of the 'index' column that is renamed to 'Treatment'.
pca_plot_data = pd.DataFrame(
    pca.transform(dataT_),
    index=dataT_.index,
    columns=["PC1", "PC2", "PC3", "PC4"],
)


In [None]:
# (Re)build the projected-data frame, labelling rows with the row labels of dataT_.
# The Index is passed as-is: wrapping it in a list makes pandas treat it as a list
# of level arrays and build a one-level MultiIndex, after which reset_index() emits
# a 'level_0' column instead of the 'index' column that is renamed to 'Treatment'.
pca_plot_data = pd.DataFrame(
    pca.transform(dataT_),
    index=dataT_.index,
    columns=["PC1", "PC2", "PC3", "PC4"],
)

In [None]:
# Inspect the row labels of the projected data.
pca_plot_data.index

In [None]:
# Turn the row labels into an ordinary column and call it 'Treatment'
# (it is used as the hue variable when plotting).
pca_plot_data = pca_plot_data.reset_index().rename(columns={'index': 'Treatment'})
pca_plot_data

In [None]:
# Display the projected data again.
pca_plot_data

In [None]:
# pca_plot_data[''] = 0
# pca_plot_data

In [None]:
# Pairwise scatter plots of the numeric columns of pca_plot_data,
# coloured by the 'Treatment' column; arguments left at None are omitted.
g = sns.PairGrid(
    pca_plot_data,
    hue="Treatment",
    palette='Paired',
    diag_sharey=False,
    size=5,
    aspect=1,
    despine=True,
    dropna=True
)
# Draw every panel as a scatter plot with large markers, then attach the legend.
g = g.map(plt.scatter, s=200).add_legend()
# g.savefig('/tmp/test.jpg')

# Volcano-like plots

- We do not have p-value information so we are plotting __absolute change in FPKM__ as the _y-axis_
- also, to avoid 'divide by zero' errors we will add a very small _pseudo-value_ to all FPKM measurements
    - the value of the pseudo-value will be 1e_X_, where _X_ is the order of magnitude of the smallest NON-ZERO value in the data.

## Add pseudo-values

In [None]:
# smallest numerical value that is not zero
# gene_id is moved into the index first so that only the numeric FPKM columns are
# searched; leaving the string column in forces min() to compare strings with
# floats, which only works through Python 2's mixed-type ordering.
# Zeros are replaced by NaN, which min() skips.
data_.set_index('gene_id').replace(0, np.nan).min().min()

In [None]:
# Pseudo-value added to every FPKM measurement (see the notes above on avoiding
# 'divide by zero' errors).
shift_by = 1e-07
# Index by gene_id so that only the numeric columns are shifted.
data_shifted_ = shift_by + data_.set_index('gene_id')

In [None]:
# Preview the first five rows of the shifted values.
data_shifted_.head(n=5)

## Calculate the fold_change and abs_diff for each time point

In [None]:
# Week 1: keep the two unshifted FPKM columns, indexed by gene_id ...
week1 = data_.set_index('gene_id')[["1WK1_FPKM", "1WH1_FPKM"]].copy()
# ... then add, from the pseudo-value-shifted data, log2(1WK1 / 1WH1) and |1WK1 - 1WH1|.
week1['fold_change'] = np.log2(data_shifted_['1WK1_FPKM'].div(data_shifted_['1WH1_FPKM']))
week1['abs_diff'] = (data_shifted_['1WK1_FPKM'] - data_shifted_['1WH1_FPKM']).abs()
# Restore gene_id as an ordinary column.
week1 = week1.reset_index()

In [None]:
# Preview the first five rows of the week-1 table.
week1.head(n=5)

In [None]:
# Week 3: keep the two unshifted FPKM columns, indexed by gene_id ...
week3 = data_.set_index('gene_id')[["3WK1_FPKM", "3WH1_FPKM"]].copy()
# ... then add, from the pseudo-value-shifted data, log2(3WK1 / 3WH1) and |3WK1 - 3WH1|.
week3['fold_change'] = np.log2(data_shifted_['3WK1_FPKM'].div(data_shifted_['3WH1_FPKM']))
week3['abs_diff'] = (data_shifted_['3WK1_FPKM'] - data_shifted_['3WH1_FPKM']).abs()
# Restore gene_id as an ordinary column.
week3 = week3.reset_index()

In [None]:
# Week 12: keep the two unshifted FPKM columns, indexed by gene_id ...
week12 = data_.set_index('gene_id')[["12WK2_FPKM", "12WH2_FPKM"]].copy()
# ... then add, from the pseudo-value-shifted data, log2(12WK2 / 12WH2) and |12WK2 - 12WH2|.
week12['fold_change'] = np.log2(data_shifted_['12WK2_FPKM'].div(data_shifted_['12WH2_FPKM']))
week12['abs_diff'] = (data_shifted_['12WK2_FPKM'] - data_shifted_['12WH2_FPKM']).abs()
# Restore gene_id as an ordinary column.
week12 = week12.reset_index()

## Plot the figures

In [None]:
# set up colors (one hex code per time point)
wk1_clr, wk3_clr, wk12_clr = "#1F78B3", "#36A12E", "#E31B1C"

# set up filtered data: keep rows whose |fold_change| is at least 1.
# fold_change is a log2 ratio, so this selects a >= 2-fold change in either direction.
week1_gte2 = week1[week1['fold_change'].abs() >= 1]
week3_gte2 = week3[week3['fold_change'].abs() >= 1]
week12_gte2 = week12[week12['fold_change'].abs() >= 1]

In [None]:
# set up the hover/tool-tips etc

# Hover tool whose tooltips are (label, field reference) pairs.
hover = HoverTool(tooltips=[
    ("index", "$index"),
    ("fold_change", "@x_values"),
    ("abs_diff", "@y_values"),
    ("gene_id", "@gene_id"),
])

# Comma-separated tool names, assembled from a list for readability.
tools = ",".join(["pan", "wheel_zoom", "box_zoom", "reset", "resize", "previewsave"])

In [None]:
# Inspect the tooltip definitions held by the hover tool.
hover.tooltips

In [None]:
# p1 = Scatter(week1_gte2, x="fold_change", y="abs_diff", title="Week 1",
#             xlabel="log2(Fold Change)", ylabel="abs(FPKM difference)",
#             tools=tools,
#             color=wk1_clr)

# # manually add gene_id and original values to data_source
# s1 = p1.select(GlyphRenderer)[0].data_source
# s1.data['gene_id'] = week1_gte2.gene_id
# s1.data['1WH1_FPKM'] = week1_gte2['1WH1_FPKM']
# s1.data['1WK1_FPKM'] = week1_gte2['1WK1_FPKM']

# # add original values to hover table
# h1 = hover.clone()
# h1.tooltips = h1.tooltips + [('1WH1_FPKM','@1WH1_FPKM'),
#                              ('1WK1_FPKM','@1WK1_FPKM')]
# p1.add_tools(h1)

# show(p1)

In [None]:
# p3 = Scatter(week3_gte2, x="fold_change", y="abs_diff", title="Week 3",
#             xlabel="log2(Fold Change)", ylabel="abs(FPKM difference)",
#             tools=tools,
#             color=wk3_clr)

# # manually add gene_id and original values to data_source
# s3 = p3.select(GlyphRenderer)[0].data_source
# s3.data['gene_id'] = week3_gte2.gene_id
# s3.data['3WH1_FPKM'] = week3_gte2['3WH1_FPKM']
# s3.data['3WK1_FPKM'] = week3_gte2['3WK1_FPKM']

# # add original values to hover table
# h3 = hover.clone()
# h3.tooltips = h3.tooltips + [('3WH1_FPKM','@3WH1_FPKM'),
#                              ('3WK1_FPKM','@3WK1_FPKM')]
# p3.add_tools(h3)


# show(p3)

In [None]:
# Preview the first five rows of the week-12 table.
week12.head(n=5)

In [None]:
# p12 = Scatter(week12_gte2, x="fold_change", y="abs_diff", title="Week 12",
#             xlabel="log2(Fold Change)", ylabel="abs(FPKM difference)",
#             tools=tools,
#             color=wk12_clr)

# # manually add gene_id and original values to data_source
# s12 = p12.select(GlyphRenderer)[0].data_source
# s12.data['gene_id'] = week12_gte2.gene_id
# s12.data['12WH2_FPKM'] = week12_gte2['12WH2_FPKM']
# s12.data['12WK2_FPKM'] = week12_gte2['12WK2_FPKM']

# # add original values to hover table
# h12 = hover.clone()
# h12.tooltips = h12.tooltips + [('12WH2_FPKM','@12WH2_FPKM'),
#                                ('12WK2_FPKM','@12WK2_FPKM')]
# p12.add_tools(h12)

# show(p12)

In [None]:
# Look up a single gene of interest among the filtered week-12 rows.
week12_gte2.loc[week12_gte2['gene_id'] == "ENSMUSG00000051439"]

# Gene lists

## Bicluster ALL genes

In [None]:
# Spectral biclustering configured for 100 row clusters and 3 column clusters.
# random_state is pinned (it was None) so that the k-means initialisation and the
# randomized SVD give the same labels on every run of the notebook.
bicluster_clf = SpectralBiclustering(n_clusters=(100,3),
                          method='bistochastic', n_components=6,
                          n_best=3, svd_method='randomized', n_svd_vecs=None,
                          mini_batch=False, init='k-means++',
                          n_init=10, n_jobs=1, random_state=0)

# Figure-size multiplier used when drawing the bicluster heatmap.
m = 3

In [None]:
# Numeric matrix for clustering: rows labelled by gene_id, gene_id column removed.
# `axis` is passed by keyword because the positional form drop(label, 1) is
# deprecated and rejected by recent pandas releases.
data_cls = data_.set_index(data_.gene_id.values).drop('gene_id', axis=1)
data_cls.head()

In [None]:
# Standardise each row of data_cls: transpose so rows become columns, run sklearn's
# scale() on every column (its default axis is 0), and transpose back.
data_cls_scl = data_cls.transpose().apply(scale).transpose()

In [None]:
# Fit the biclustering model on the row-standardised matrix.
bicluster_clf.fit(data_cls_scl)

In [None]:
# Reorder rows and columns so that members of the same bicluster label sit next to each other.
fit_data = data_cls_scl.iloc[
    bicluster_clf.row_labels_.argsort(),
    bicluster_clf.column_labels_.argsort()
]

In [None]:
# Inspect the cluster label assigned to each column.
bicluster_clf.column_labels_

In [None]:
# Tall, narrow figure (m wide by 6*m high) for the reordered matrix.
# NOTE(review): sns.set() also resets seaborn's other settings, not just the figure size.
sns.set(rc={'figure.figsize': [m, 6 * m]})
# Heatmap of the whole reordered matrix, without colour bar or row labels.
sns.heatmap(fit_data, square=False, cbar=False, yticklabels=False);

## Genes with abs(log2(fold_change)) >= 1 (weeks 1, 3 and 12)

In [None]:
# Message template for reporting how many genes pass the fold-change filter in a given week.
gte2_template = "number of genes in week {week} with >= 2 fold change: {num}"

In [None]:
# Report the number of rows that passed the |fold_change| >= 1 filter for each week.
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(gte2_template.format(week=' 1', num=len(week1_gte2)))
print(gte2_template.format(week=' 3', num=len(week3_gte2)))
print(gte2_template.format(week='12', num=len(week12_gte2)))

In [None]:
# Start the per-gene summary table from the FPKM columns, indexed by gene_id.
data_deltas_ = data_.set_index(keys='gene_id')

In [None]:
# Preview the first five rows of the summary table.
data_deltas_.head(n=5)

In [None]:
# Attach each week's fold_change column to the summary table, matched on gene_id.
data_deltas_['wk1_fold'] = week1.set_index("gene_id")['fold_change']
data_deltas_['wk3_fold'] = week3.set_index("gene_id")['fold_change']
data_deltas_['wk12_fold'] = week12.set_index("gene_id")['fold_change']

In [None]:
# Preview the last three columns for the first five rows.
data_deltas_.head().iloc[:, -3:]

In [None]:
# FPKM columns indexed by gene_id.
wk3_gte2_full_data_ = data_.set_index(keys='gene_id')

In [None]:
# First 100 filtered week-3 rows when ordered by fold_change, largest first
# (sort options that were spelled out at their default values are omitted).
week3_gte2_top_100_fc = week3_gte2.sort_values(by='fold_change', ascending=False).iloc[:100]
week3_gte2_top_100_fc.tail()

In [None]:
# First 100 filtered week-3 rows when ordered by abs_diff, largest first
# (sort options that were spelled out at their default values are omitted).
week3_gte2_top_100_abs = week3_gte2.sort_values(by='abs_diff', ascending=False).iloc[:100]
week3_gte2_top_100_abs.tail()

In [None]:
# Stack the two top-100 selections and keep the first copy of any row that appears in both.
week3_gte2_top_100_union = pd.concat(
    [week3_gte2_top_100_fc, week3_gte2_top_100_abs],
    axis=0
).drop_duplicates(keep='first')

In [None]:
# Pull the FPKM columns of data_ for the genes in the union, via an inner join on gene_id
# (merge options that were spelled out at their default values are omitted).
data_week3_gte2_top_100_union = data_.merge(
    week3_gte2_top_100_union[['gene_id']],
    how='inner',
    on="gene_id"
)

In [None]:
# Preview the first five rows of the merged table.
data_week3_gte2_top_100_union.head(n=5)

In [None]:
# Label rows by gene_id, drop the gene_id column and add the pseudo-value.
# `axis` is passed by keyword because the positional form drop(label, 1) is
# deprecated and rejected by recent pandas releases.
data_week3_gte2_top_100_union_SHIFTED = data_week3_gte2_top_100_union.set_index(data_week3_gte2_top_100_union.gene_id.values)
data_week3_gte2_top_100_union_SHIFTED = data_week3_gte2_top_100_union_SHIFTED.drop('gene_id', axis=1) + shift_by

# Divide every column, row by row, by the shifted 3WH1_FPKM value of the same gene.
# A single div(..., axis=0) replaces six hand-written per-column divisions; the
# 3WH1_FPKM column itself therefore becomes 1.0 throughout.
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM = data_week3_gte2_top_100_union_SHIFTED.div(
    data_week3_gte2_top_100_union_SHIFTED['3WH1_FPKM'],
    axis=0,
)

# data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM = np.log2(data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM)

data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.head()

In [None]:
# Write the ratio table to disk as CSV.
# NOTE(review): the path relies on '~' being expanded and has no file extension -- confirm intended.
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM_path = "~/tmp/data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM"
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.to_csv(
    path_or_buf=data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM_path
)
# data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM = pd.read_csv(data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM_path, index_col=0)

In [None]:
# Preview the first five rows of the ratio table.
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.head(n=5)

In [None]:
# Summary statistics for each column of the ratio table.
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.describe()

In [None]:
# Heatmap of 10 randomly chosen rows of the ratio table.
# random_state is fixed so that the same 10 rows (and hence the same figure) are
# produced on every run; arguments that were passed as None are omitted.
sns.heatmap(
    data=data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.sample(10, random_state=0),
    robust=False, annot=False,
    fmt='.2g', linewidths=0, linecolor='white',
    cbar=True,
    square=False, xticklabels=True, yticklabels=True
);

In [None]:
# Display the ratio table transposed (one row per FPKM column).
data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.transpose()

In [None]:
# Fit a 3-component PCA on the transposed ratio table (each FPKM column is one observation).
pca = PCA(n_components=3)
pca.fit(data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.T)

# Fraction of variance explained by each component, followed by their total.
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
# Project the transposed ratio table onto the fitted components and keep its row labels.
# The Index is passed as-is: wrapping it in a list makes pandas treat it as a list
# of level arrays and build a one-level MultiIndex, after which reset_index() emits
# a 'level_0' column instead of the 'index' column that is renamed to 'Treatment'.
pca_plot_data_ = pd.DataFrame(
    pca.transform(data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.T),
    index=data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.T.index,
    columns=["PC1", "PC2", "PC3"],
)
pca_plot_data_

In [None]:
# Turn the row labels into an ordinary column and call it 'Treatment'
# (it is used as the hue variable when plotting).
pca_plot_data_ = pca_plot_data_.reset_index().rename(columns={'index': 'Treatment'})
pca_plot_data_

In [None]:
# Pairwise scatter plots of the numeric columns of pca_plot_data_,
# coloured by the 'Treatment' column; arguments left at None are omitted.
pca_plt2 = sns.PairGrid(
    pca_plot_data_,
    hue="Treatment",
    palette='Paired',
    diag_sharey=False,
    size=5,
    aspect=1,
    despine=True,
    dropna=True
)
# Draw every panel as a scatter plot with large markers, then attach the legend.
pca_plt2 = pca_plt2.map(plt.scatter, s=200).add_legend();

In [None]:
# Hierarchically clustered heatmap of 5 randomly chosen rows of the ratio table,
# z-scored along axis 0 (z_score=0).
# random_state is fixed so that the same 5 rows (and hence the same figure) are
# produced on every run; the remaining arguments are unchanged.
sns.clustermap(data_week3_gte2_top_100_union_SHIFTED_vs_3WH1_FPKM.sample(5, random_state=0),
               pivot_kws=None,
               method='average', metric='euclidean',
               z_score=0, standard_scale=None,
               figsize=None,
               cbar_kws=None,
               row_cluster=True, col_cluster=True,
               row_linkage=None, col_linkage=None,
               row_colors=None, col_colors=None, mask=None
              );

In [None]:
# no_fold = ["3WK1_FPKM","3WH1_FPKM","abs_diff"]
# no_abs = ["3WK1_FPKM","3WH1_FPKM","fold_change"]

# sns.clustermap(data=week3_gte2.set_index("gene_id").sort_values(by="abs_diff",ascending=False)[no_abs].sample(200,random_state=1), 
#                pivot_kws=None, 
#                method='average', metric='euclidean', 
#                z_score=True, standard_scale=None, 
#                figsize=(13,13), 
#                cbar_kws=None, 
#                row_cluster=True, col_cluster=False, 
#                row_linkage=None, col_linkage=None, 
#                row_colors=None, col_colors=None, 
#                mask=None);

# Biclustering of expression profiles