In [1]:
# IMPORTS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from scipy.cluster.hierarchy import linkage, cut_tree, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform

In [2]:
# bcolors
HEADER = "\033[95m"
OKBLUE = "\033[94m"
OKCYAN = "\033[96m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"

In [8]:
## Move out of the notebook folder to access datasets
working_dir = os.getcwd()
working_dir = working_dir.strip('notebooks')


## Load the dataset
mrna_count_dir = working_dir + 'data/5xFAD_paper/expressionList.csv'
mrna_count = pd.read_csv(mrna_count_dir, index_col=0)

sample_info_selection_dir = working_dir + 'data/5xFAD_paper/sampleInfo.csv'
sample_info_selection = pd.read_csv(sample_info_selection_dir)

figures_dir = working_dir + 'results/WGCNA_figures_own'


## Make a subset to save RAM

subset_dataset_size = 1000
mrna_count = mrna_count.iloc[:, :subset_dataset_size] 

## RAM usage estimation in GB
RAM_estimate = (subset_dataset_size * subset_dataset_size * 8) / (1024**3)
print(f"The aproximated RAM to analyse this size of dataset is: {RAM_estimate} GB")


The aproximated RAM to analyse this size of dataset is: 0.007450580596923828 GB


In [9]:
## PREPROCESS
## Preprocessing: removing obvious outlier on genes and samples

print(f"{BOLD}{OKBLUE}Pre-processing...{ENDC}")

# Prepare and clean data
# Remove genes expressed under this cutoff number along samples

mrna_count_filtered = mrna_count


[1m[94mPre-processing...[0m


In [10]:
mrna_count_filtered

Unnamed: 0_level_0,ENSMUSG00000000003,ENSMUSG00000000028,ENSMUSG00000000031,ENSMUSG00000000037,ENSMUSG00000000049,ENSMUSG00000000056,ENSMUSG00000000058,ENSMUSG00000000078,ENSMUSG00000000085,ENSMUSG00000000088,...,ENSMUSG00000006389,ENSMUSG00000006390,ENSMUSG00000006392,ENSMUSG00000006395,ENSMUSG00000006398,ENSMUSG00000006403,ENSMUSG00000006411,ENSMUSG00000006412,ENSMUSG00000006418,ENSMUSG00000006423
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X4mo_cortex_F_5xFADHEMI_430,0.0,1.90,0.00,0.13,0.43,22.37,24.24,19.32,33.41,620.45,...,0.57,37.59,31.39,16.65,1.61,15.96,2.37,71.77,71.28,27.90
X4mo_cortex_F_5xFADHEMI_431,0.0,1.10,0.06,0.07,0.18,16.99,24.69,23.88,31.40,705.73,...,0.42,26.35,33.02,13.74,1.88,11.11,2.67,76.98,63.00,24.93
X4mo_cortex_F_5xFADHEMI_433,0.0,1.18,0.07,0.13,1.90,20.37,28.06,21.33,32.14,699.50,...,0.57,32.20,36.03,15.42,1.59,10.55,2.64,73.48,72.88,27.76
X4mo_cortex_F_5xFADHEMI_434,0.0,2.18,0.00,0.07,0.31,17.98,21.46,15.06,27.60,639.95,...,0.62,29.03,26.74,16.63,1.44,11.05,2.61,79.72,71.35,20.72
X4mo_cortex_F_5xFADHEMI_511,0.0,1.50,0.10,0.14,0.53,18.35,20.18,18.66,26.43,640.55,...,0.99,27.19,32.06,14.38,1.23,11.32,2.69,71.30,64.44,22.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X18mo_hipp_M_5xFADWT_301,0.0,1.84,0.00,0.12,0.77,14.35,23.54,11.47,20.20,952.08,...,0.07,14.54,33.99,11.32,0.68,5.87,4.48,92.40,53.64,24.85
X18mo_hipp_M_5xFADWT_566,0.0,1.76,0.00,0.04,1.50,14.68,30.73,13.05,31.17,911.92,...,0.09,17.60,38.13,10.67,1.13,6.46,4.52,86.40,51.36,24.98
X18mo_hipp_M_5xFADWT_641,0.0,1.57,0.17,0.09,1.69,19.41,30.77,18.64,21.64,763.88,...,0.26,20.08,38.66,10.37,0.82,7.14,4.92,93.06,68.66,26.34
X18mo_hipp_M_5xFADWT_643,0.0,1.00,0.15,0.13,0.48,14.79,34.09,19.82,19.33,809.72,...,0.21,15.32,37.00,10.35,1.27,4.48,4.01,141.01,59.67,23.81


In [11]:
## CLUSTERING
## Hierarchical cluster analysis on a set of dissimilarities and methods for analyzing it.

METHODS = ["single", "complete", "average", "weighted", "centroid"]
method = "average"  # from the pyWGCNA

distances = pdist(mrna_count_filtered.T, metric='correlation')
dendrogram = linkage(distances, method=method)

ValueError: The condensed distance matrix must contain only finite values.