# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob 
import os
import warnings
import networkx as nx
import matplotlib.colors as mcolors
from scipy.stats import pearsonr
from sklearn.covariance import GraphicalLassoCV,GraphicalLasso
from networkx.generators.community import gaussian_random_partition_graph
from sklearn.model_selection import ValidationCurveDisplay
import scipy.stats as stats
from scipy.stats import ttest_ind
import nibabel as nib 
# import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import glob as glob
import math

In [None]:
# Notebook-wide configuration: hide FutureWarnings, lift pandas' row/column
# display limits, and draw every figure with the ggplot style.
warnings.simplefilter('ignore', FutureWarning)

for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

plt.style.use('ggplot')

# Setup 
Seed the random number generator once, so the entire notebook is reproducible.

In [None]:
# Seed NumPy's legacy global RNG once for the whole notebook. The bootstrap
# cells below call DataFrame.sample without a random_state of their own, so
# they presumably draw from this global state (re-running a single cell out of
# order will therefore give different samples).
np.random.seed(3)

## Load data

Load all ADNI and NACC patients

In [None]:
# Raw parcellation volumes for all subjects.
# NOTE(review): `comb_gmv` is not referenced again in the visible part of this
# notebook -- confirm it is still needed.
comb_gmv = pd.read_csv("/data2/MRI_PET_DATA/graph/csvs/graph_gmv_volumes/parcellation_volumes_raw.csv")

Read in adni prog and adni stab for bootstrapping (these ones have already been normalized)

In [None]:
# ADNI grey-matter volume tables (described above as already normalised):
# one for progressors, one for stable subjects. The `_lr` suffix marks tables
# that still have separate left/right columns.
adni_gmv_prog_lr = pd.read_csv("/data2/MRI_PET_DATA/graph/csvs/graph_gmv_volumes/mike_gmv_adni_prog_norm.csv")
adni_gmv_stab_lr = pd.read_csv('/data2/MRI_PET_DATA/graph/csvs/graph_gmv_volumes/mike_gmv_adni_stab_norm.csv')

# Peek at the first three rows of the stable table.
adni_gmv_stab_lr.head(3)

Some regions have zero volume, let's drop them

In [None]:
# Count exact-zero entries per column of the progressor table and show the
# five columns with the most zeros.
(adni_gmv_prog_lr == 0).sum().sort_values(ascending=False).head(5)

In [None]:
# Drop the same four region columns from both cohorts.
zero_volume_cols = ['vol_lInfLatVen', 'vol_rInfLatVen', 'vol_lOC', 'vol_rOC']
adni_gmv_prog_lr = adni_gmv_prog_lr.drop(columns=zero_volume_cols)
adni_gmv_stab_lr = adni_gmv_stab_lr.drop(columns=zero_volume_cols)

After this there are no more regions with zero volume:

In [None]:
# Both cohorts had columns removed, so check both of them (not only the stable
# table) for remaining exact-zero entries. Prints False when none are left.
zero_present = any(
    (table == 0).to_numpy().any()
    for table in (adni_gmv_prog_lr, adni_gmv_stab_lr)
)
print(zero_present)

For now let's drop `RID`, `TIV`, and `Dataset`, we don't need that information to compute the graph.

In [None]:
# Non-volume columns that are removed before estimating the graph.
id_cols = ['RID', 'TIV', 'Dataset']
adni_gmv_prog_lr = adni_gmv_prog_lr.drop(columns=id_cols)
adni_gmv_stab_lr = adni_gmv_stab_lr.drop(columns=id_cols)

## Combine left and right regions
The number of samples is only slightly larger than the number of features. Let's combine the measured values in corresponding left/right regions of the brain, which cuts the number of features in half and is biologically reasonable.

In [None]:
# (rows, columns) of the progressor table before left/right columns are merged.
adni_gmv_prog_lr.shape

In [None]:
# Function that combines the left and right volumes for a region

def combine_regions(input_df):
    """Sum each region's left- and right-hemisphere volume columns.

    Columns are expected to be named ``vol_l<Region>`` / ``vol_r<Region>``.
    For every column starting with ``vol_`` the region name is the text
    between the first and second underscore with its first character (the
    hemisphere letter) removed.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table holding one ``vol_l*`` and one ``vol_r*`` column per region.

    Returns
    -------
    pandas.DataFrame
        One ``vol_<Region>`` column per region containing left + right.
        Columns follow the order in which each region first appears in
        ``input_df``, so the layout is identical on every run.

    Raises
    ------
    KeyError
        If a region is missing its left or its right column.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order. A set
    # iterates strings in hash order, which differs between interpreter
    # sessions and would shuffle the output columns from run to run.
    regions = dict.fromkeys(
        col.split('_')[1][1:] for col in input_df.columns if col.startswith('vol_')
    )

    # Sum of the left and right volumes for each region.
    comb_regions = {}
    for region in regions:
        left_col = f'vol_l{region}'
        right_col = f'vol_r{region}'
        missing = [c for c in (left_col, right_col) if c not in input_df.columns]
        if missing:
            # Same exception type as the plain column lookup, but it names the
            # offending region and column(s).
            raise KeyError(f"cannot combine region {region!r}: missing column(s) {missing}")
        comb_regions[f'vol_{region}'] = input_df[left_col] + input_df[right_col]

    return pd.DataFrame(comb_regions)

# Usage:


# Merge left/right columns for both cohorts; the `_lr` tables are left untouched.
adni_gmv_prog = combine_regions(adni_gmv_prog_lr)
adni_gmv_stab = combine_regions(adni_gmv_stab_lr)

# Peek at the first three rows of the merged progressor table.
adni_gmv_prog.head(3)

## Normalize the data
Center the data and scale them to have unit standard deviation

In [None]:
# One scaler per cohort: each group is centred and scaled with its own
# per-column statistics, and set_output makes the result a DataFrame.
prog_scaler = StandardScaler().set_output(transform='pandas')
stab_scaler = StandardScaler().set_output(transform='pandas')

progs_norm = prog_scaler.fit_transform(adni_gmv_prog)
stable_norm = stab_scaler.fit_transform(adni_gmv_stab)

progs_norm.head(3)

In [None]:
# No longer needed: the scalers above use set_output(transform='pandas'), so they already return DataFrames.
# progs_norm = pd.DataFrame(progs_norm, columns=adni_gmv_prog.columns)
# stable_norm = pd.DataFrame(stable_norm,columns=adni_gmv_stab.columns)
# print(progs_norm.columns)

Double checking there are no null values

In [None]:
# True if any cell of the normalised progressor table is NaN.
progs_norm.isna().any().any()

## Create Bootstrap samples
We use bootstrapping (with replacement) before the graphical lasso cv. 

In [None]:
num_bootstrap_samples = 100

bootstrap_precision_matrices = []
bootstrap_samples = []

# Every bootstrap sample has as many rows as the original table.
n_prog_rows = len(progs_norm)

for _ in tqdm(range(num_bootstrap_samples)):
    # Draw rows with replacement ...
    resampled = progs_norm.sample(n=n_prog_rows, replace=True)
    # ... then order them by original row label and renumber from 0.
    bootstrap_sample = resampled.sort_index().reset_index(drop=True)
    bootstrap_samples.append(bootstrap_sample)

Check for infinite and null values; they should both be `False`:

In [None]:
# Inspect every bootstrap sample. `bootstrap_sample` alone is only the last
# sample left over from the loop above, so checking it would miss the others.
print(any(np.isinf(sample.to_numpy()).any() for sample in bootstrap_samples))
print(any(sample.isna().to_numpy().any() for sample in bootstrap_samples))

check datatypes 

if I give it alphas [1,10] it looks like it drops almost all of the connections 
   
How I can be evaluating how well my model is fitting the data?

2 matrices estimates (from graphical lasso and then the actual empirical) to do difference you could 

if I just want estimate of how well its performing 

- should we reset index?
- use the sklearn sampler?

## CV to select $L_1$ regularization

In [None]:
# Fixed-alpha graphical lasso (a predefined alpha is faster than searching for one).
# NOTE(review): this instance is never fitted in the visible code -- the alpha
# sweep below rebinds `model` -- and the bootstrap fits further down use
# different settings (no assume_centered, alpha=0.7). Confirm which
# configuration is intended.
model = GraphicalLasso(alpha=0.1, max_iter=100, tol=1e-2, mode='cd',assume_centered=True,covariance=None)

model

It looks like the model is very sensitive to the choice of $\alpha$, let's see what the sparsity looks like as a function of it

In [None]:
def _nonzero_fraction(matrix):
    """Fraction of the entries of `matrix` that are not exactly zero."""
    return np.count_nonzero(matrix) / matrix.size


# 64 geometrically spaced regularisation strengths between 0.1 and 0.9.
alphas = np.geomspace(0.1, 0.9, 64)

# Despite the names, these hold the fraction of NON-zero entries per alpha.
precision_sparsities = []
covariance_sparsities = []

for alpha in alphas:
    # `model` is rebound on every pass; after the loop it is the fit for the
    # largest alpha, which the following cells reuse.
    model = GraphicalLasso(alpha=alpha, max_iter=200, tol=1e-2, mode='cd', assume_centered=True)
    model.fit(progs_norm)

    precision_sparsities.append(_nonzero_fraction(model.precision_))
    covariance_sparsities.append(_nonzero_fraction(model.covariance_))

Choosing $\alpha$ < 0.2 has no effect on the sparsity of the covariance matrix (there might still be some shrinkage though). We should choose the optimal value of $\alpha$ by looking at the cross validated performance of the estimator, and thinking about how much sparsity we want in the final graph.

In [None]:
# Fraction of non-zero entries in the estimated matrices as a function of alpha.
fig, ax = plt.subplots()
ax.plot(alphas, precision_sparsities, label="Precision Matrix")
ax.plot(alphas, covariance_sparsities, label="Covariance Matrix")
ax.set_xlabel("alpha")
ax.set_ylabel("Fraction of nonzero entries")
ax.legend();

In [None]:
# `model` is the last estimator fitted by the alpha sweep above (largest alpha).
model.get_params()

In [None]:
# 10-fold cross-validated score over 16 evenly spaced alphas. `model` is the
# estimator left over from the alpha sweep; it only supplies the non-alpha
# settings because alpha itself is the parameter being varied.
cv_alphas = np.linspace(0.1, 0.9, 16)

ValidationCurveDisplay.from_estimator(
    model,
    progs_norm,
    None,
    param_name='alpha',
    param_range=cv_alphas,
    cv=10,
    n_jobs=-1,
)

As expected, with increasing $\alpha$ our sparse approximation of the covariance matrix gets worse. 

- There's an inflection point around $\alpha = 0.5$, does it mean anything?
- At $\alpha=0.5$ the covariance matrix is still 90% filled, it might be a good idea to make it much more and then decrease alpha gradually if we cannot see any difference between the two groups.

# Train graphical model on bootstrapped sample 
find score on each num bootstrap sample 

In [None]:
# One fixed-alpha graphical lasso per bootstrap sample (a predefined alpha is
# faster than cross-validating it for every sample).
models = []

for bootstrap_sample in tqdm(bootstrap_samples):
    model = GraphicalLasso(alpha=0.7, max_iter=150, tol=1e-3)
    # fit() returns the estimator itself, so the fitted model is what is stored.
    models.append(model.fit(bootstrap_sample))

In [None]:
# `bootstrap_sample` is the loop variable left over from the fitting loop
# above, i.e. the last bootstrap sample.
bootstrap_sample.head()

In [None]:
#different metrics to calc 
#gpe, clus_coeff 
#density nx.density(G)
#eigenvector centrality nx.eigenvector_centrality(G) seems like its not a single value 
#assostivity coeff nx.degree_assortativity_coefficient(G) 
#diameter nx.diameter(G) -  Gives the diameter of the largest connected component in the graph, representing the longest shortest path between any pair of nodes. gave error 
#radius nx.radius - computes the radius of the largest connected compmenet gave error 
#number connected components nx.number_connected_components(G)

In [None]:
#graphical lasso cv not converging 
#can adjust the tolerance, regularization parameter alphas, change solver method, increase max iterations, or random seed 
#example 
#model = GraphicalLassoCV(tol=1e-3)  # Increase tolerance level
#alphas = [0.1, 1.0, 10.0]  # Example list of alpha values to search
#model = GraphicalLassoCV(alphas=alphas)

#model = GraphicalLassoCV(solver='graph_lasso')  # Try a different solver
#model = GraphicalLassoCV(max_iter=500)  # Increase maximum iterations
#model = GraphicalLassoCV(random_state=42)  # Set a random seed
#graphical lasso -  WITHOUT BOOTSTRAP  
#converges with 500 iterations 
#estimates the precision matrix of the data (inverse cov mat which models the partial correlations between vars)
#uses lasso penalty, during training the model iteratively updates the estimated precision matrix using current hyperparamter values until it converges to a set of values that minimize the loss function 
# model = GraphicalLassoCV(cv=5,max_iter=500) #cv=5 specifies 5-fold cross validation strategy to optimize model hyperparameters 
# model.fit(progs_norm)
#have 1 rv per node (volumes), find out the optimization details, theorteically estimating corr mat of vars 


#have data, then unconnected random var,s est cov mat, and this estimation has penalty to force very small numbers to be 0 (force some sparsity) then interpret the inverse cov matrix as an adjacency matrix, and then visualize as a graph 
#makes sense to trat as random var, they are connecte din some sense if they are correlated 

#drop the very small nunvbwers, l1 sets very small things to 0, choses hyper parater by cross validation 

In [None]:
# Seen during fitting:
#   RuntimeWarning: invalid value encountered in subtract x = asanyarray(arr - arrmean)
# so count the NaN cells in the normalised progressor table.
nan_mask = progs_norm.isna()
nan_count = nan_mask.to_numpy().sum()
print(nan_count)

#under the assumption the vars are gaussian and correlated oit is estimated cov matrix 
#plot histrogram (for one brain region)
#if you have random var that is sum of other random var, with finite variance, then CLT holds, if its the sum of many processed, then it's reasonably gaussian 


#for embeddings - sklearkn might have covariance betwen 2 random vars, use dot product of vectors instead of product 

In [None]:
#just for graph visualization, doesn't work for bootstrapped because there's more than one pop graph 
#node size based on connectivity visualization 
#assigns node labels based on the column id 
#this doesn't work because it's the bootstrap sample list 
# precision_matrix_p = model.precision_
# np.fill_diagonal(precision_matrix_p, 0) #removes self connections by replacing the diagnonal of matrix with 0 

# G_prog = nx.Graph(precision_matrix_p)
# #create a dictionary that maps old node labels to new node labels
# node_labels = {i: label for i, label in enumerate(progs_norm.columns)}

# #relabel the nodes in the graph using the dictionary
# G_prog = nx.relabel_nodes(G_prog, node_labels)

# # Assuming you already have precision_matrix_np and G_n_prog as in your code
# # Calculate the node degrees (connectivity strength)
# node_degrees = dict(G_prog.degree())

# # Scale the node sizes based on node degrees
# node_sizes = [20 * node_degrees[node] for node in G_prog.nodes()]

# # Create a colormap for node colors based on node degrees
# node_colors = list(node_degrees.values())

# # Draw the graph with node sizes and colors
# pos = nx.random_layout(G_prog)
# nx.draw(
#     G_prog,
#     pos,
#     with_labels=True,
#     font_size=7,
#     style="dotted",
#     node_color=node_colors,
#     cmap=plt.cm.Reds,
#     node_size=node_sizes,
#     alpha=0.8,
#     width=0.3,
# )

# plt.title("Progressive MCI Population Graphical Model NACC", fontsize=22)
# plt.show()

#the random layout changes, to have the same one run the layout finctiopn first, choose what I want and then try it later 
#circular one might be good or spectral (bc it uses info on the degree of each node aware of hubs)
#since these are random seed rnadom generator to generate same layout 
#generate positions once and then pass same poisition dictioary 
#network x is generating citionary of 2d positins to draw function 
#draw in circle (then spot patterns that way)

In [None]:
# Compute graph metrics for every bootstrap model of the progressor cohort and
# collect one row per model in a DataFrame.
metrics_data = []

# The integer -> region-name mapping is identical for every model; build it once.
node_labels = dict(enumerate(progs_norm.columns))

# enumerate(..., start=1) numbers the models from 1.
for model_num, model in enumerate(models, start=1):
    # Work on a copy: np.fill_diagonal writes in place and would otherwise
    # zero the diagonal of the fitted estimator's own precision_ attribute.
    precision_matrix_p = model.precision_.copy()
    np.fill_diagonal(precision_matrix_p, 0)  # remove self connections

    # Treat the precision matrix as an adjacency matrix, then name the nodes
    # after the region columns.
    G_prog = nx.relabel_nodes(nx.Graph(precision_matrix_p), node_labels)

    gpe = nx.global_efficiency(G_prog)
    clustering_coefficient = nx.average_clustering(G_prog)
    density = nx.density(G_prog)
    ass_coeff = nx.degree_assortativity_coefficient(G_prog)
    # Not computed: eigenvector centrality (one value per node, not per graph)
    # and diameter / radius (raised errors when tried).

    metrics_data.append({
        'model_num': model_num,
        'gpe': gpe,
        'clus_coeff': clustering_coefficient,
        'density': density,
        'ass_coeff': ass_coeff,
    })

# Turn the list of per-model dictionaries into a DataFrame and persist it.
metrics_data = pd.DataFrame(metrics_data)
metrics_data.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/full_bootstrap_models_prog_metrics.csv')
print(metrics_data)

In [None]:
#print(metrics_data)
#calc variance for each metric (which is col for each) like for mean 
#plot the variance for each metric/col across diff num samples (training curve)
# sklearn has training curve display 
# do it on estimator (graph lasso cv and training set) but do it on pop_ggm_code without bootstrap (good to have on poster)
# on bootstrapped data SSE ()

In [None]:
# Heatmap of one model's precision matrix (the inverse of the covariance matrix).

model_num = 0  # index into `models`; change to inspect another bootstrap model

# Copy before zeroing the diagonal so the fitted estimator's precision_ is
# left intact (np.fill_diagonal writes in place).
precision_matrix = models[model_num].precision_.copy()
np.fill_diagonal(precision_matrix, 0)

# Drawn with matplotlib: seaborn is not imported in this notebook, so the
# previous sns.heatmap call raised NameError on a fresh kernel.
fig, ax = plt.subplots(figsize=(10, 10))
# The colour scale is clipped to [-1, 0.25].
image = ax.imshow(precision_matrix, cmap='coolwarm', vmin=-1, vmax=.25)
fig.colorbar(image, ax=ax)
ax.grid(False)  # the ggplot style would otherwise draw grid lines over the image
ax.set_title(f'Precision Matrix of Model {model_num + 1}')
plt.show()

# Actual off-diagonal range, to compare against the colour limits above.
print(np.min(precision_matrix), np.max(precision_matrix))

# TODO: larger positive entries are drawn in red; the interpretation of the
# colours is still an open question.


In [None]:
# Bootstrap the stable cohort. No random_state is passed to sample(); the
# draws are left to the RNG state seeded at the top of the notebook.
stab_num_bootstrap_samples = 50
stab_bootstrap_precision_matrices = []
stab_bootstrap_samples = []

n_stab_rows = len(stable_norm)
for _ in range(stab_num_bootstrap_samples):
    # Resample rows with replacement; original index labels are kept.
    stab_bootstrap_sample = stable_norm.sample(n=n_stab_rows, replace=True)
    stab_bootstrap_samples.append(stab_bootstrap_sample)

In [None]:
# #bootstrap and plot stable 
# #bootstrap for stable adni 
# stab_num_bootstrap_samples = 5 
# stab_bootstrap_precision_matrices = []
# stab_bootstrap_samples = []
# for x in range(stab_num_bootstrap_samples):
#     #sample with replacement, randomstate =1 for reporoducibility 
#     stab_bootstrap_sample = stable_norm.sample(n=len(stable_norm),replace=True, random_state=1)
#     #append resamples df to a list 
#     stab_bootstrap_samples.append(stab_bootstrap_sample)

In [None]:
# Fit one cross-validated graphical lasso (5 folds) per stable bootstrap sample.
# Earlier observation: supplying alphas in [1, 10] dropped almost all connections.
stab_models = []
for stab_bootstrap_sample in tqdm(stab_bootstrap_samples):
    stab_model = GraphicalLassoCV(cv=5, max_iter=50, tol=1e-3)
    # fit() returns the estimator itself.
    stab_models.append(stab_model.fit(stab_bootstrap_sample))

In [None]:
# stab_models.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/full_bootstrap_models_stab.csv')
#this doesn't work because the bootstrap is a list 

In [None]:
# Number of distinct original row labels in the second stable bootstrap sample.
# The stable samples keep their original index (no reset_index), so this counts
# how many different source rows were drawn, not whether the rows are unique.
stab_bootstrap_samples[1].index.nunique()

In [None]:
# #plot graph just for viualization doesn't work if there are more than one 
# #this doesn't work because the fraph is 
# precision_matrix_s = stab_model.precision_
# np.fill_diagonal(precision_matrix_s, 0) #removes self connections by replacing the diagnonal of matrix with 0 

# G_stab = nx.Graph(precision_matrix_s)
# #create a dictionary that maps old node labels to new node labels
# node_labels = {i: label for i, label in enumerate(stable_norm.columns)}

# #relabel the nodes in the graph using the dictionary
# G_stab = nx.relabel_nodes(G_stab, node_labels)

# #visualize the relabeled graph
# pos = nx.random_layout(G_stab)
# nx.draw(G_stab, pos, with_labels=True, font_size=7, style="dotted",
#         node_color=range(len(G_stab)), cmap=plt.cm.Blues,
#         node_size=150*len(G_stab)/154, alpha=.8, width=.3)
# plt.title("Stable MCI Population Graphical Model ADNI", fontsize=22)
# plt.show()

In [None]:
# Compute graph metrics for every bootstrap model of the stable cohort and
# collect one row per model. Requires `stable_norm` and `stab_models` from the
# cells above.

stab_metrics_data = []

# The integer -> region-name mapping is identical for every model; build it once.
stab_node_labels = dict(enumerate(stable_norm.columns))

# Models are numbered from 0 here (enumerate default).
for stab_model_num, stab_model in enumerate(stab_models):
    # Work on a copy: np.fill_diagonal writes in place and would otherwise
    # zero the diagonal of the fitted estimator's own precision_ attribute.
    precision_matrix_stab = stab_model.precision_.copy()
    np.fill_diagonal(precision_matrix_stab, 0)  # remove self connections

    # Treat the precision matrix as an adjacency matrix, then name the nodes
    # after the region columns.
    G_stab = nx.relabel_nodes(nx.Graph(precision_matrix_stab), stab_node_labels)

    stab_gpe = nx.global_efficiency(G_stab)
    stab_clustering_coefficient = nx.average_clustering(G_stab)
    stab_density = nx.density(G_stab)
    stab_ass_coeff = nx.degree_assortativity_coefficient(G_stab)

    stab_metrics_data.append({
        'model_num': stab_model_num,
        'gpe': stab_gpe,
        'clus_coeff': stab_clustering_coefficient,
        'density': stab_density,
        'ass_coeff': stab_ass_coeff,
    })

stab_metrics_data = pd.DataFrame(stab_metrics_data)
stab_metrics_data.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/50_bootstrap_models_stab_metrics.csv')
print(stab_metrics_data)


In [None]:
# Welch t-test (equal_var=False) of every graph metric: progressor bootstrap
# models vs. stable bootstrap models, plus a flag for p < 0.05.
# NOTE(review): the observations are bootstrap replicates rather than
# independent subjects, so the p-values depend on how many replicates were
# drawn -- confirm this is the intended test.

cols = []
p_values = []
t_stats = []

for col in metrics_data.columns:
    if col == 'model_num':
        continue  # identifier column, not a metric

    # Values of this metric across the bootstrap models of each cohort.
    progs_data = metrics_data[col]
    stab_data = stab_metrics_data[col]

    t_stat, p_val = stats.ttest_ind(progs_data, stab_data, equal_var=False)

    cols.append(col)
    t_stats.append(t_stat)
    p_values.append(p_val)

# Build the results table once, after the loop. Building it inside the loop
# recreated it on every pass and left it undefined when there were no columns.
results_df = pd.DataFrame({
    'Brain Region': cols,  # header kept as-is for the saved CSV; the values are metric names
    'T-Statistic': t_stats,
    'P-Value': p_values
})

results_df['Significant'] = results_df['P-Value'] < 0.05

print(results_df)

# A negative t-statistic means the progressor mean of that graph metric is
# lower than the stable mean.
results_df.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/50_bootstrap_models_ttest.csv')


Interpreting t test results:
1. GPE was lower for progressive MCI
- Global path efficiency is a measure of the efficiency of information exchange across the network; a lower efficiency indicates that the network is less efficiently structured and that information has to travel more indirectly between nodes.

2. Clustering coefficient was lower for progressive MCI
- The clustering coefficient is the extent to which nodes in a graph are organized into clusters.
- Specifically, the average clustering coefficient calculates the mean of local clustering coefficients of all the vertices in the network. A higher clustering coefficient indicates a higher degree of clustering in the network.

3. Graph density was lower for progressive MCI 
- density is a measure of how closely knit the network is (low density value is a more sparse network)

4. Assortativity coefficient was lower for progressive MCI
- It is the correlation coefficient between the degrees of connected nodes.
- Lower assortativity indicates that nodes with a high degree tend to connect with nodes of low degree (disassortative mixing).

In [None]:
# Load the combined NACC grey-matter volume table. The cells below read
# `comb_gmv_nacc`, so this load has to run; while it was commented out the
# notebook failed with NameError on a fresh kernel.
comb_gmv_nacc = pd.read_csv("/data2/MRI_PET_DATA/graph/csvs/graph_gmv_volumes/mike_gmv_nacc_combo.csv")
# Drop the unnamed column (presumably a saved index) if the file has one.
comb_gmv_nacc = comb_gmv_nacc.drop(columns=['Unnamed: 0'], errors='ignore')
print(comb_gmv_nacc.dtypes)

In [None]:
# NACC progressor and stable tables; only their `RID` column is used below to
# split the combined volume table into the two groups.
nacc_prog_rids = pd.read_csv("/data2/MRI_PET_DATA/graph/NACC/Morph/mri_atlas/morph/roi/NACC_progressors_vol.csv")
nacc_stab_rids = pd.read_csv("/data2/MRI_PET_DATA/graph/NACC/Morph/mri_atlas/morph/roi/NACC_stable_vol.csv")
# print(nacc_prog_rids)

In [None]:
# Remove the 'Unnamed: 0' column from both NACC tables.
unnamed_col = ['Unnamed: 0']
nacc_prog_rids = nacc_prog_rids.drop(columns=unnamed_col)
nacc_stab_rids = nacc_stab_rids.drop(columns=unnamed_col)

In [None]:
# Column dtypes of the NACC stable table (relevant for how `RID` values compare below).
print(nacc_stab_rids.dtypes)

In [None]:
# Split the combined NACC table into progressors and stable subjects by RID.
# Expects `comb_gmv_nacc`, `nacc_prog_rids` and `nacc_stab_rids` from earlier cells.
# TODO: double check the split against the stable tables from the other CSV files.
# The RID columns were observed to be object dtype; membership only matches
# when both sides hold the same kind of value. If needed, cast first, e.g.
#   nacc_prog_rids['RID'] = nacc_prog_rids['RID'].astype('int64')
#   comb_gmv_nacc['RID'] = comb_gmv_nacc['RID'].astype('int64')

rid_comb_gmv = set(comb_gmv_nacc["RID"])
rid_nacc_prog = set(nacc_prog_rids['RID'])
rid_nacc_stab = set(nacc_stab_rids['RID'])

# Boolean mask instead of row-by-row DataFrame.append: append was removed in
# pandas 2.0, and growing a frame one row at a time is quadratic and loses the
# column dtypes.
is_progressor = comb_gmv_nacc["RID"].isin(rid_nacc_prog)

nacc_gmv_prog = comb_gmv_nacc[is_progressor].reset_index(drop=True)
# Every subject that is not a progressor is treated as stable.
# NOTE(review): `rid_nacc_stab` is not consulted here -- confirm that
# "not a progressor" really is the intended definition of stable.
nacc_gmv_stab = comb_gmv_nacc[~is_progressor].reset_index(drop=True)

print(nacc_gmv_stab)

In [None]:
# Remove the non-volume columns from both NACC groups.
nacc_id_cols = ['RID', 'TIV', 'Dataset']
nacc_gmv_prog = nacc_gmv_prog.drop(columns=nacc_id_cols)
nacc_gmv_stab = nacc_gmv_stab.drop(columns=nacc_id_cols)

In [None]:
# Drop the four region columns with zero volume from both NACC groups.
nacc_zero_volume_cols = ['vol_lInfLatVen', 'vol_rInfLatVen', 'vol_lOC', 'vol_rOC']
nacc_gmv_prog = nacc_gmv_prog.drop(columns=nacc_zero_volume_cols)
nacc_gmv_stab = nacc_gmv_stab.drop(columns=nacc_zero_volume_cols)

In [None]:
# Count the exact-zero cells remaining in the NACC progressor table.
zero_values = nacc_gmv_prog.eq(0).to_numpy().sum()

print(f'Total zero values in the DataFrame: {zero_values}')



In [None]:
# print(nacc_gmv_stab)

In [None]:
# Standardise each NACC group column by column (zero mean, unit variance),
# as was done for ADNI. The single scaler is re-fitted for each group, so
# after this cell it holds the statistics of the stable group.
scaler = StandardScaler()
nacc_progs_norm = scaler.fit(nacc_gmv_prog).transform(nacc_gmv_prog)
nacc_stable_norm = scaler.fit(nacc_gmv_stab).transform(nacc_gmv_stab)
print(nacc_progs_norm.shape)
print(nacc_progs_norm)

In [None]:
# The scaler in the previous cell returns NumPy arrays, so wrap them back into
# DataFrames that carry the region column names.
nacc_progs_norm = pd.DataFrame(nacc_progs_norm, columns=nacc_gmv_prog.columns)
nacc_stable_norm = pd.DataFrame(nacc_stable_norm, columns=nacc_gmv_stab.columns)

# Show the NACC frame that was just built. The previous print showed the ADNI
# `progs_norm` table instead.
nacc_progs_norm.head(3)

In [None]:
# Number of NaN cells, then number of +/-inf cells, in the normalised NACC
# progressor table.
print(nacc_progs_norm.isna().to_numpy().sum())
print(nacc_progs_norm.isin([np.inf, -np.inf]).to_numpy().sum())


In [None]:
# Count exact-zero cells in the *normalised* NACC progressor table.
zero_values = nacc_progs_norm.eq(0).to_numpy().sum()

print(f'Total zero values in the DataFrame: {zero_values}')


In [None]:
# Total number of NaN cells in the normalised NACC progressor table.
nan_mask = nacc_progs_norm.isna()
nan_count = nan_mask.to_numpy().sum()
print(nan_count)

In [None]:
# Hard stop if the normalised NACC progressor table holds NaN or infinite values.
nacc_prog_values = nacc_progs_norm.to_numpy()
assert not nacc_progs_norm.isna().to_numpy().any(), "Initial DataFrame has NaN values"
assert not np.isinf(nacc_prog_values).any(), "Initial DataFrame has Inf values"


In [None]:
# Bootstrap (sampling with replacement) the NACC progressors before fitting
# the graphical lasso. No random_state is passed to sample(); the RNG is
# seeded once at the top of the notebook and not touched here.
# Note that this rebinds `num_bootstrap_samples`, which was 100 for ADNI.

num_bootstrap_samples = 50
nacc_bootstrap_precision_matrices = []
nacc_bootstrap_samples = []

n_nacc_prog_rows = len(nacc_progs_norm)
for _ in range(num_bootstrap_samples):
    # Resample rows with replacement; original index labels are kept.
    nacc_bootstrap_sample = nacc_progs_norm.sample(n=n_nacc_prog_rows, replace=True)
    nacc_bootstrap_samples.append(nacc_bootstrap_sample)


In [None]:
# Column dtypes of the last NACC bootstrap sample (the leftover loop variable).
print(nacc_bootstrap_sample.dtypes)


In [None]:
# Verify that every NACC bootstrap sample has the same per-column dtypes as
# the first one.
datatypes = nacc_bootstrap_samples[0].dtypes

same_datatypes = True
for sample in nacc_bootstrap_samples:
    if not (sample.dtypes == datatypes).all():
        same_datatypes = False
        break

print(f'All dataframes have the same datatypes for each column: {same_datatypes}')


In [None]:


# Report, for each NACC bootstrap sample, whether it contains any NaN.
for i, df in enumerate(nacc_bootstrap_samples):
    has_nan = df.isnull().to_numpy().any()
    print(
        f'NaN values found in DataFrame at index {i}'
        if has_nan
        else f'No NaN values found in DataFrame at index {i}'
    )

# Total number of NaN cells over all samples.
total_nan_count = 0
for df in nacc_bootstrap_samples:
    total_nan_count += df.isnull().sum().sum()
print(f'Total NaN count across all DataFrames: {total_nan_count}')


In [None]:
# Report, for each NACC bootstrap sample, whether it contains infinite values.
# pandas and numpy are already imported in the first cell, so they are not
# imported again here; each sample is scanned once and the per-sample counts
# are reused for the total.
inf_counts = [int(np.isinf(df.to_numpy()).sum()) for df in nacc_bootstrap_samples]

for i, inf_count in enumerate(inf_counts):
    if inf_count:
        print(f'Infinite values found in DataFrame at index {i}')
    else:
        print(f'No infinite values found in DataFrame at index {i}')

# Total number of infinite cells over all samples.
total_inf_count = sum(inf_counts)
print(f'Total infinite count across all DataFrames: {total_inf_count}')


In [None]:
# for index, value in enumerate(nacc_bootstrap_samples):
#     if math.isnan(value):
#         print(f'NaN found at index {index}')

In [None]:
# Fit one cross-validated graphical lasso (5 folds) per NACC progressor
# bootstrap sample.
# History: an earlier run failed with "must not contain infs or NaNs"; with
# the random seed set to 3 (and 100 samples) no sample contained inf/NaN
# values and the fits went through.
nacc_models = []
for nacc_bootstrap_sample in tqdm(nacc_bootstrap_samples):
    nacc_model = GraphicalLassoCV(cv=5, max_iter=50, tol=1e-3)
    # fit() returns the estimator itself.
    nacc_models.append(nacc_model.fit(nacc_bootstrap_sample))

In [None]:
# #check for nan or inf values - dont need anymore since I fixed it 

# #I don't need to use this because the 100 samples + random seed 3 is good without it 
# nacc_models = []
# for nacc_bootstrap_sample in tqdm(nacc_bootstrap_samples):
#     if nacc_bootstrap_sample.isnull().values.any() or np.isinf(nacc_bootstrap_sample).values.any():
#         print('NaN or Inf found')
#     nacc_model = GraphicalLassoCV(cv=2,max_iter=50, tol=1e-3)
#     nacc_model.fit(nacc_bootstrap_sample)
#     nacc_models.append(nacc_model)

# #try logging 
# import logging
# logging.basicConfig(level=logging.INFO)


# nacc_models = []
# for idx, nacc_bootstrap_sample in enumerate(tqdm(nacc_bootstrap_samples)):
#     # Check for NaN/Inf values
#     if nacc_bootstrap_sample.isnull().values.any() or np.isinf(nacc_bootstrap_sample).values.any():
#         logging.info(f'NaN or Inf found in sample {idx}')

#     # Try to fit the model
#     #try and except block structure 
#     try:
#         nacc_model = GraphicalLassoCV(cv=2, max_iter=50, tol=1e-3)
#         nacc_model.fit(nacc_bootstrap_sample)
#         nacc_models.append(nacc_model)
#     except ValueError as e:
#         logging.error(f'Error encountered in sample {idx}: {e}')
#         logging.error(f'Data: {nacc_bootstrap_sample}')



In [None]:
# # need anymore I fixed the issue 
# import logging
# from tqdm import tqdm
# import pandas as pd
# from sklearn.covariance import GraphicalLassoCV

# logging.basicConfig(level=logging.INFO)

# nacc_models = []
# for idx, nacc_bootstrap_sample in enumerate(tqdm(nacc_bootstrap_samples)):
#     # Locate NaN and Inf values
#     nan_locations = nacc_bootstrap_sample.isnull().stack()[lambda x: x]
#     inf_locations = nacc_bootstrap_sample.isin([np.inf, -np.inf]).stack()[lambda x: x]

#     if not nan_locations.empty or not inf_locations.empty:
#         logging.info(f'NaN or Inf found in sample {idx}')
#         for location in nan_locations.index:
#             logging.info(f'NaN found at row {location[0]}, column {location[1]}')
#         for location in inf_locations.index:
#             logging.info(f'Inf found at row {location[0]}, column {location[1]}')

#     # Try to fit the model
#     try:
#         nacc_model = GraphicalLassoCV(cv=2, max_iter=50, tol=1e-3)
#         nacc_model.fit(nacc_bootstrap_sample)
#         nacc_models.append(nacc_model)
#     except ValueError as e:
#         logging.error(f'Error encountered in sample {idx}: {e}')
#         logging.error(f'Data: {nacc_bootstrap_sample}')


In [None]:
# Number of NACC progressor models that were fitted and stored.
# (Earlier note: one bootstrap sample was making the whole fitting loop fail.)
print(len(nacc_models))

In [None]:
# Compute graph metrics for every bootstrap model of the NACC progressor
# cohort and collect one row per model in a DataFrame.
nacc_metrics_data = []

# The integer -> region-name mapping is identical for every model; build it once.
nacc_node_labels = dict(enumerate(nacc_progs_norm.columns))

# enumerate(..., start=1) numbers the models from 1.
for nacc_model_num, nacc_model in enumerate(nacc_models, start=1):
    # Work on a copy: np.fill_diagonal writes in place and would otherwise
    # zero the diagonal of the fitted estimator's own precision_ attribute.
    nacc_precision_matrix_p = nacc_model.precision_.copy()
    np.fill_diagonal(nacc_precision_matrix_p, 0)  # remove self connections

    # Treat the precision matrix as an adjacency matrix, then name the nodes
    # after the region columns.
    nacc_G_prog = nx.relabel_nodes(nx.Graph(nacc_precision_matrix_p), nacc_node_labels)

    nacc_gpe = nx.global_efficiency(nacc_G_prog)
    nacc_clustering_coefficient = nx.average_clustering(nacc_G_prog)
    nacc_density = nx.density(nacc_G_prog)
    nacc_ass_coeff = nx.degree_assortativity_coefficient(nacc_G_prog)
    # Not computed: eigenvector centrality, diameter, radius.

    nacc_metrics_data.append({
        'model_num': nacc_model_num,
        'gpe': nacc_gpe,
        'clus_coeff': nacc_clustering_coefficient,
        'density': nacc_density,
        'ass_coeff': nacc_ass_coeff,
    })

# Turn the list of per-model dictionaries into a DataFrame and persist it.
nacc_metrics_data = pd.DataFrame(nacc_metrics_data)
nacc_metrics_data.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/full_bootstrap_models_prog_metrics_nacc.csv')
print(nacc_metrics_data.shape)

In [None]:
# Bootstrap resamples of the NACC stable cohort.
#
# Each resample has as many rows as the cohort and is drawn with replacement.
# No `random_state` is passed, so pandas draws from NumPy's global RNG;
# reproducibility therefore relies on the single `np.random.seed(...)` call in the
# Setup section and on running the cells in order.
nacc_stab_num_bootstrap_samples = 50
# Unused in this cell; kept in case a later cell still references it.
nacc_stab_bootstrap_precision_matrices = []

# Resample `nacc_stable_norm` -- the same frame the NACC stable metrics cell uses
# for its node labels -- rather than the un-prefixed `stable_norm`, so the NACC
# models are fitted on NACC data.
nacc_stab_bootstrap_samples = [
    nacc_stable_norm.sample(n=len(nacc_stable_norm), replace=True)
    for _ in range(nacc_stab_num_bootstrap_samples)
]

In [None]:
# Fit one cross-validated graphical lasso per NACC stable bootstrap resample.
# Earlier observation: restricting the alphas to [1, 10] appeared to drop almost
# all of the connections, so no explicit alpha grid is passed here.
nacc_stab_models = []
for nacc_stab_bootstrap_sample in tqdm(nacc_stab_bootstrap_samples):
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    nacc_stab_model = GraphicalLassoCV(cv=5, max_iter=50, tol=1e-3).fit(nacc_stab_bootstrap_sample)
    nacc_stab_models.append(nacc_stab_model)

In [None]:
# Sanity check: number of fitted NACC stable models (one is appended per bootstrap
# resample, so this should match the number of resamples if every fit succeeded).
print(len(nacc_stab_models))

In [None]:
# Graph metrics for the bootstrapped NACC stable models.
#
# For every fitted model in `nacc_stab_models`, build an undirected graph from its
# precision matrix (an edge wherever an off-diagonal entry is non-zero), label the
# nodes with the column names of `nacc_stable_norm`, and record global efficiency,
# average clustering, density and degree assortativity. The result is one row per
# model, stored in `nacc_stab_metrics_data` and written to CSV.

# The column -> node-label mapping is identical for every model, so build it once.
naccstab_node_labels = dict(enumerate(nacc_stable_norm.columns))

# Collect plain records first and build the DataFrame once at the end.
nacc_stab_metrics_records = []

# Number the models from 1 so `model_num` matches the progressor metrics table.
for nacc_model_num, nacc_stab_model in enumerate(nacc_stab_models, start=1):
    # Work on a copy: np.fill_diagonal writes in place, and zeroing the diagonal of
    # `precision_` itself would silently alter the fitted estimator for later cells.
    nacc_precision_matrix_stab = nacc_stab_model.precision_.copy()
    np.fill_diagonal(nacc_precision_matrix_stab, 0)  # drop self-connections

    nacc_G_stab = nx.relabel_nodes(nx.Graph(nacc_precision_matrix_stab), naccstab_node_labels)

    nacc_stab_gpe = nx.global_efficiency(nacc_G_stab)
    nacc_stab_clustering_coefficient = nx.average_clustering(nacc_G_stab)
    nacc_stab_density = nx.density(nacc_G_stab)
    nacc_stab_ass_coeff = nx.degree_assortativity_coefficient(nacc_G_stab)

    nacc_stab_metrics_records.append({
        'model_num': nacc_model_num,
        'gpe': nacc_stab_gpe,
        'clus_coeff': nacc_stab_clustering_coefficient,
        'density': nacc_stab_density,
        'ass_coeff': nacc_stab_ass_coeff,
    })

# One row per bootstrap model.
nacc_stab_metrics_data = pd.DataFrame(nacc_stab_metrics_records)
nacc_stab_metrics_data.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/50_bootstrap_models_stab_metrics_nacc.csv')
print(nacc_stab_metrics_data)


In [None]:
# Compare the shapes of the progressor and stable metrics tables before running
# the t-tests on their columns.
print(nacc_metrics_data.shape)
print(nacc_stab_metrics_data.shape)

In [None]:
# NACC t-tests: progressors vs. stable, one test per bootstrapped graph metric.
#
# Every column of the metrics tables except the `model_num` identifier is compared
# using the per-model values directly (not pre-computed means). A boolean
# `Significant` column flags the metrics whose p-value is below 0.05.

cols = []
t_stats = []
p_values = []

for col in nacc_metrics_data.columns:
    if col == 'model_num':
        continue  # identifier column, not a metric

    # Per-model values of this metric in each cohort.
    nacc_progs_data = nacc_metrics_data[col]
    nacc_stab_data = nacc_stab_metrics_data[col]

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = stats.ttest_ind(nacc_progs_data, nacc_stab_data, equal_var=False)

    cols.append(col)
    t_stats.append(t_stat)
    p_values.append(p_val)

# Build the results table once, after every metric has been tested (previously it
# was rebuilt on each loop iteration and left undefined when there were no columns).
nacc_results_df = pd.DataFrame({
    # NOTE: this column holds graph-metric names, not brain regions; the header is
    # kept unchanged so previously saved CSVs and any readers of them stay compatible.
    'Brain Region': cols,
    'T-Statistic': t_stats,
    'P-Value': p_values,
})

# Flag metrics that differ significantly at the 0.05 level.
nacc_results_df['Significant'] = nacc_results_df['P-Value'] < 0.05

print("Nacc t test results between nacc prog and nacc stab")
print(nacc_results_df)

# A negative t-statistic means the progressor mean of that metric is lower than
# the stable mean (progressors are the first argument to ttest_ind).
# These results have been saved before, so they could also simply be reloaded.
nacc_results_df.to_csv('/data2/MRI_PET_DATA/graph/csvs/bootstrap_models_and_metrics/50_bootstrap_models_ttest_nacc.csv')


In [None]:
# Show the ADNI t-test table for comparison with the NACC table printed above.
# `results_df` is not defined in this part of the notebook; presumably it comes
# from the ADNI t-test cell earlier on -- TODO confirm it has not been overwritten.
print("Adni t test results between adni prog and adni stab")
print(results_df)
# TODO: the ADNI and NACC results are exactly the same; need to figure out why.