In [1]:
import matplotlib.pyplot as plt

import scipy.stats as st
import numpy as np
import pdb
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram, cut_tree

In [2]:
import sys
sys.setrecursionlimit(12000)

In [3]:
raw=pd.read_csv('mouse.csv',sep=',').values
raw=raw[np.array([len(np.unique(x))>2 for x in raw])]   

In [4]:
N=len(raw)
data=raw.T[0:N]

In [23]:
loc='den/'
for method in ['single','average','complete','weighted','ward'][-1:]:
    # dendrogram
    print(method+'-dendrogram')
    fig,axs=plt.subplots(1,1)
    fig.set_figwidth(N/5,forward=True)
    fig.set_figheight(5,forward=True)
    Z=linkage(data, method, 'correlation')
    den=dendrogram(Z, color_threshold=0,ax=axs)
    axs.tick_params(axis='X',labelsize=20)
    fig.savefig(loc+method+'_dendogram.png',bbox_inches='tight')
    plt.close()
    
    # corr plot
    print(method+'-corr')
    fig,axs=plt.subplots(1,1)
    fig.set_figwidth(30,forward=True)
    fig.set_figheight(30,forward=True)
    axs.imshow(np.corrcoef(data[den['leaves']]),interpolation='nearest', cmap='seismic',vmin=-1,vmax=1)
    fig.savefig(loc+method+'_corr.png',bbox_inches='tight')
    plt.close()
    
    # counts histograms
    print(method+'-hist')
    heights=np.unique(np.linspace(.05,.8,15).round(2))
    numClust=[]
    fig, axs = plt.subplots(len(heights),2,dpi=50,tight_layout=True)   
    fig.set_figwidth(32,forward=True)
    fig.set_figheight(len(heights)*9,forward=True)
    bins=range(0,len(data)+1)
    for x in range(len(heights)):
        height=heights[x]
        cut=fcluster(Z,t=height,criterion='distance')
        
        clustOrd=np.argsort(cut)
        dData=data[clustOrd[np.searchsorted(cut[clustOrd],range(1,max(cut)+1))]]
        axs[x,0].imshow(np.corrcoef(dData),interpolation='nearest', cmap='seismic',vmin=-1,vmax=1)
        axs[x,0].set_title('height '+str(height))

        counts=np.unique(cut,return_counts=True)[1]
        axs[x,1].hist(counts,bins=bins)
        axs[x,1].set_title('height '+str(height))
        
    plt.close()
    fig.savefig(loc+method+'_counts_hist.png',bbox_inches='tight')


single-dendrogram
single-corr
single-hist




average-dendrogram
average-corr
average-hist
complete-dendrogram
complete-corr
complete-hist
weighted-dendrogram
weighted-corr
weighted-hist
centroid-dendrogram


ValueError: Method 'centroid' requires the distance metric to be Euclidean

In [24]:
# all off diag hist
fig,axs=plt.subplots(1,1)
fig.set_figwidth(7,forward=True)
fig.set_figheight(7,forward=True)
off_diag=np.corrcoef(data,rowvar=True)[np.triu_indices(N,1)].flatten()  
axs.hist(off_diag,bins=np.linspace(-1,1,100))
fig.savefig(loc+'full_off_diag.png',bbox_inches='tight')
plt.close()

In [16]:
clusters=fcluster(linkage(data, 'single', 'correlation'),t=.05,criterion='distance')
