In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from adjustText import adjust_text
import numpy as np
from scipy import stats
import scipy

In [2]:
colors = ["#66CDAA","#4682B4","#AB63FA","#FFA15A"]
group_num = "group_5"
df_main = pd.read_excel("/Users/yilewang/workspaces/data4project/prediction_project/Multigroups.xlsx",sheet_name=group_num)

### Workflow

1. take the z-score
2. identify cases with z-score > 1.6
3. get the frequency of being an outlier in all features
4. get the final list

In [3]:
df_zscore = pd.read_excel("/Users/yilewang/workspaces/data4project/prediction_project/prediction_data_zscores.xlsx")
# for each variable, find the one larger than 1.6.
df_zscore_hyper = df_zscore.copy()
upper_bound = 1.6 # 90 percentile
# # ignition
for i in range(2, 18):
    df_zscore_hyper[df_zscore_hyper.columns[i]] = df_zscore[df_zscore.columns[i]].apply(lambda x: 1 if x > upper_bound else 0)
# TVB parameters and simulation
for i in range(18, 34):
    df_zscore_hyper[df_zscore_hyper.columns[i]] = df_zscore[df_zscore.columns[i]].apply(lambda x: 1 if x > upper_bound else 0)
# Lateralization Index
for i in range(34, 38):
    df_zscore_hyper[df_zscore_hyper.columns[i]] = df_zscore[df_zscore.columns[i]].apply(lambda x: 1 if x < -1*upper_bound else 0)
# # structural
for i in range(38, 64):
    df_zscore_hyper[df_zscore_hyper.columns[i]] = df_zscore[df_zscore.columns[i]].apply(lambda x: 1 if x > upper_bound else 0)


In [38]:
df_zscore_hyper["ignition_sum"] = df_zscore_hyper.iloc[:,2:18].sum(axis=1)
df_zscore_hyper["sc_sum"] = df_zscore_hyper.iloc[:,38:64].sum(axis=1)
df_zscore_hyper["rest_sum"] = df_zscore_hyper.iloc[:,18:38].sum(axis=1)
df_zscore_hyper['subgroups'] = df_zscore_hyper['sc_sum'].apply(lambda x: 1 if x > 0 else 0)

# df_zscore_hyper['subgroups'] = df_zscore_hyper['rest_sum']+ df_zscore_hyper['sc_sum'].apply(lambda x: 1 if x > 0 else 0) + df_zscore_hyper['ignition_sum'].apply(lambda x: 1 if x > 0 else 0)
df_zscore_hyper[df_zscore_hyper['group'] == "MCI"][['caseid', 'subgroups']]

Unnamed: 0,caseid,subgroups
26,0578A,1
27,0720A,0
28,0743A,0
29,1070A,0
30,1428A,1
31,1669A,0
32,1775A,0
33,2019A,0
34,2050A,1
35,2166A,0


### get the list of predicted converters in MCI

In [6]:
predicted_converters_MCI = ['0720A',
'1669A',
'2166A',
'3117A',
'4628A',
'5914A',
'6516A',
'7873A',
'2019A',
'2050A',
'4216A']

In [7]:
def get_overlap(pd_df, predicted_converters= predicted_converters_MCI):
    # get the overlap between predicted converters and the subgroups
    for case in pd_df.caseid:
        if case in predicted_converters:
            print(case)

In [8]:
high_hyperexcitability = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][df_zscore_hyper['subgroups'] > 0][['caseid', 'subgroups']]
print(high_hyperexcitability)
get_overlap(high_hyperexcitability)

   caseid  subgroups
27  0720A          3
30  1428A          3
31  1669A          3
32  1775A          3
33  2019A          1
34  2050A          1
36  2385A          3
37  2391A          1
45  4602A          1
46  4628A          2
49  5781A          2
51  6516A          1
54  7827A          1
55  7841A          1
60  9270A          2
0720A
1669A
2019A
2050A
4628A
6516A


  high_hyperexcitability = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][df_zscore_hyper['subgroups'] > 0][['caseid', 'subgroups']]


### Summary

Metrics: 
- Gc: trends increase from SNC to AD. 
- Go: increase from SNC to AD. 
- G_max: quadratic trend. 
- G_max - Gc: quadratic. 
- K21: bump in the NC and MCI
- Gamma_Freq: quadratic trend.
- Theta_Freq: quadratic trend.
- Gamma_Amp: quadratic trend.
- Theta_Amp: quadratic trend.
- Delay: increase from SNC to AD.
- Gamma/Theta: clean quadratic trend.
- LI (Lateralization Index): quadratic

Reasons not including the following features:
- Ignition: ignition is already down.
- Structural: structural connectomes don't have hyperexcitability.

If groups = 5, then
- SNC
- NC
- MCI-hypoexcitability
- MCI
- AD

If groups = 6, then
- SNC
- NC
- MCI-converter_hypoexcitability
- MCI-converter_hyperexcitability
- MCI
- AD

If groups = 7, then
- SNC
- NC
- MCI-converter_hypoexcitability
- MCI-converter_hyperexcitability_high
- MCI-converter_hyperexcitability_low
- MCI
- AD

In [13]:
### If group = 5
# create a pd dataframe for MCI_converters
MCI_converters = pd.DataFrame(predicted_converters_MCI, columns = ["caseid"])
MCI_converters['group'] = "MCI_converters"

# get the list of caseid in MCI excluding the group_5
_MCI = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][['caseid', 'subgroups']]
MCI_nonconverters = _MCI[~_MCI['caseid'].isin(MCI_converters["caseid"])]

MCI_nonconverters['group'] = "MCI"
MCI_nonconverters = MCI_nonconverters[['caseid', 'group']]

# put them together
AD = df_zscore_hyper[df_zscore_hyper['group'] == 'AD'][['caseid', 'group']][:-1]
NC = df_zscore_hyper[df_zscore_hyper['group'] == 'NC'][['caseid', 'group']]
SNC = df_zscore_hyper[df_zscore_hyper['group'] == 'SNC'][['caseid', 'group']]

# concatenate them
df_group5 = pd.concat([SNC, NC, MCI_converters, MCI_nonconverters, AD], ignore_index=True)

# df_group5.to_excel
df_group5.to_excel("./group5.xlsx",sheet_name="group_5", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MCI_nonconverters['group'] = "MCI"


In [14]:

### If group = 6

# get the list in high_hyperexcitability excluding the predicted converters
MCI_hyperexcitability = high_hyperexcitability[~high_hyperexcitability['caseid'].isin(predicted_converters_MCI)]
MCI_hyperexcitability['group'] = "MCI_hyperexcitability"
MCI_hyperexcitability = MCI_hyperexcitability[['caseid', 'group']]

# get the hypoexcitability
MCI_hypoexcitability = pd.DataFrame({"caseid": predicted_converters_MCI, "group": "MCI_hypoexcitability"})

# get the list of caseid in MCI excluding the MCI_hyperexcitability and MCI_hypoexcitability
_MCI = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][['caseid', 'subgroups']]
MCI_nonconverters = _MCI[~_MCI['caseid'].isin(MCI_hyperexcitability['caseid'])]
MCI_nonconverters = MCI_nonconverters[~MCI_nonconverters['caseid'].isin(MCI_hypoexcitability['caseid'])]
MCI_nonconverters['group'] = "MCI"
MCI_nonconverters = MCI_nonconverters[['caseid', 'group']]

# put them together
AD = df_zscore_hyper[df_zscore_hyper['group'] == 'AD'][['caseid', 'group']][:-1]
NC = df_zscore_hyper[df_zscore_hyper['group'] == 'NC'][['caseid', 'group']]
SNC = df_zscore_hyper[df_zscore_hyper['group'] == 'SNC'][['caseid', 'group']]

# concatenate them
df_group6 = pd.concat([SNC, NC, MCI_hyperexcitability, MCI_hypoexcitability, MCI_nonconverters, AD], ignore_index=True)

df_group6.to_excel("./group6.xlsx",sheet_name="group_6", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MCI_hyperexcitability['group'] = "MCI_hyperexcitability"


In [35]:
### If group = 7


# get the hypoexcitability
MCI_hypoexcitability = pd.DataFrame({"caseid": predicted_converters_MCI, "group": "MCI_hypoexcitability"})

### get the high and low hyperexcitability
MCI_hyperexcitability_high = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][df_zscore_hyper['subgroups'] > 2][['caseid', 'subgroups']][~df_zscore_hyper['caseid'].isin(MCI_hypoexcitability['caseid'])]
MCI_hyperexcitability_low = df_zscore_hyper[(df_zscore_hyper['group'] == 'MCI') & (df_zscore_hyper['subgroups'] <= 2) & (df_zscore_hyper['subgroups'] > 0)][['caseid', 'subgroups']][~df_zscore_hyper['caseid'].isin(MCI_hypoexcitability['caseid'])]

### For MCI hyperexcitability high and low cases.
MCI_hyperexcitability_high['group'] = "MCI_hyperexcitability_high"
MCI_hyperexcitability_high = MCI_hyperexcitability_high[['caseid', 'group']]
MCI_hyperexcitability_low['group'] = "MCI_hyperexcitability_low"
MCI_hyperexcitability_low = MCI_hyperexcitability_low[['caseid', 'group']]

# get all the cases not in previous groups
_MCI = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][['caseid', 'subgroups']]
high_cases = MCI_hyperexcitability_high['caseid']
low_cases = MCI_hyperexcitability_low['caseid']

MCI_nonconverters = _MCI[~_MCI['caseid'].isin(high_cases)]
MCI_nonconverters = MCI_nonconverters[~MCI_nonconverters['caseid'].isin(low_cases)]
MCI_nonconverters = MCI_nonconverters[~MCI_nonconverters['caseid'].isin(MCI_hypoexcitability['caseid'])]

MCI_nonconverters['group'] = "MCI"
MCI_nonconverters = MCI_nonconverters[['caseid', 'group']]

# concatenate them
df_group7 = pd.concat([SNC, NC, MCI_hyperexcitability_high, MCI_hyperexcitability_low, MCI_hypoexcitability, MCI_nonconverters, AD], ignore_index=True)
df_group7 = df_group7[['caseid', 'group']]

df_group7.to_excel("./group7.xlsx",sheet_name="group_7", index=False)

  MCI_hyperexcitability_high = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][df_zscore_hyper['subgroups'] > 2][['caseid', 'subgroups']][~df_zscore_hyper['caseid'].isin(MCI_hypoexcitability['caseid'])]
  MCI_hyperexcitability_high = df_zscore_hyper[df_zscore_hyper['group'] == 'MCI'][df_zscore_hyper['subgroups'] > 2][['caseid', 'subgroups']][~df_zscore_hyper['caseid'].isin(MCI_hypoexcitability['caseid'])]
  MCI_hyperexcitability_low = df_zscore_hyper[(df_zscore_hyper['group'] == 'MCI') & (df_zscore_hyper['subgroups'] <= 2) & (df_zscore_hyper['subgroups'] > 0)][['caseid', 'subgroups']][~df_zscore_hyper['caseid'].isin(MCI_hypoexcitability['caseid'])]
