# Getting Data

First, we want to grab some graphs and subject covariates from a web-accessible URL. In the interest of saving time, we've given this to you on Google Drive rather than having you set up AWS S3 credentials. The original data is hosted at m2g.io.

Below, you will be getting the following dataset:

| Property | Value |
|:--------:|:-----:|
| Dataset  | SWU4  |
| N-Subjects  | 454   |
| Scans-per-subject | 2 |
| Atlases | Desikan, CPAC200 |
| Desikan Nodes | 70 |
| CPAC200 Nodes | 200 |

The covariates you have are: `SUBID, SESSION, AGE_AT_SCAN_1, SEX, RESTING_STATE_INSTRUCTION, TIME_OF_DAY, SEASON, SATIETY, LMP`. There are other columns in the `.csv` file (downloaded in the next step), but they are populated with a `#`, meaning that the value was not recorded.

There are several other atlases available; you can change which one you use by editing the `atlas` variable in the code below.
Running the cell below will get you the data. **Please note, you only have to run these two cells once!!!**

## Loading Graphs + Covariates
Run the following cells of code to load the graphs into your computer, as well as the covariates.

In [1]:
!pip install networkx==1.9 #networkx broke backwards compatibility with these graph files
import numpy as np
import networkx as nx
import scipy as sp
import matplotlib.pyplot as plt
import os
import csv


from collections import OrderedDict



In [2]:
# Name of the dataset to analyse (also the key used in `fs` below)
dataset_names = 'SWU4'

# Root folder that holds the downloaded data
basepath = 'data'

# Pick the atlas here: 'desikan', 'CPAC200', or 'Talairach'
atlas = 'desikan'

# Graph files are searched for under <basepath>/<dataset>/<atlas>
dir_names = '/'.join([basepath, dataset_names, atlas])
print(dir_names)

# Walk the atlas folder recursively and keep every .gpickle file found
fs = OrderedDict([
    (dataset_names, [root + "/" + fl
                     for root, _, files in os.walk(dir_names)
                     for fl in files
                     if fl.endswith(".gpickle")]),
])

# CSV file holding the subject covariates
ps = "data/SWU4/SWU4.csv"

# Report how many graph files were found per dataset and in total
per_dataset = [fkey + " (" + str(len(fs[fkey])) + ")" for fkey in fs]
print("Datasets: " + ", ".join(per_dataset))
print("Total Subjects: %d" % sum(len(paths) for paths in fs.values()))

data/SWU4/desikan
Datasets: SWU4 (454)
Total Subjects: 454


In [3]:
def loadGraphs(filenames, verb=False):
    """
    Reads a collection of graph files into memory.

    Required parameters:
        filenames:
            - List of paths to the graph files
    Optional parameters:
        verb:
            - When True, prints each path before it is loaded

    Returns an OrderedDict that maps the base name of every file to the
    graph read from it, in the order the files were given.
    """
    loaded = OrderedDict()
    for path in filenames:
        if verb:
            print("Loading: " + path)
        # Only the base name is used as key, so two files that share a base
        # name in different folders would overwrite one another.
        key = os.path.basename(path)
        # NOTE(review): gpickle files are pickles -- only load trusted files.
        loaded[key] = nx.read_gpickle(path)
    return loaded

def constructGraphDict(names, fs, verb=False):
    """
    Loads the graphs of one or more datasets into memory.

    Required parameters:
        names:
            - Name of the dataset to load, or a list of dataset names
        fs:
            - Dictionary of lists of files in each dataset
    Optional parameters:
        verb:
            - Toggles verbose output statements

    Returns an OrderedDict that maps every requested dataset name to the
    dictionary of graphs produced by loadGraphs for that dataset's files.

    Raises KeyError when a requested dataset name is not a key of fs.
    """
    # A list is read as several dataset names; any other value is one key.
    dataset_list = names if isinstance(names, list) else [names]

    #  Loads graphs into memory for all datasets
    graphs = OrderedDict()
    for name in dataset_list:
        if verb:
            print("Loading Dataset: " + name)
        # The key for the dictionary of graphs is the dataset name
        graphs[name] = loadGraphs(fs[name], verb=verb)
    return graphs

In [4]:
# Load every graph file of the selected dataset; the result is keyed by
# dataset name first and by graph file name second.
graphs = constructGraphDict(names=dataset_names, fs=fs, verb=False)

In [5]:
# Build a lookup from subject ID to [age, sex] out of the phenotype file.
# Columns read: 0 = subject ID, 2 = age, 3 = sex.
# Sex is recoded to 0/1: female=1->0, male=2->1.  Age is kept as the string
# found in the file.
pheno = OrderedDict()
with open(ps, newline='') as pheno_file:  # the file is closed on exit
    rows = csv.reader(pheno_file)  # this is the whole phenotype file
    next(rows, None)  # skip the header row (no-op on an empty file)
    # Keep only rows where both age and sex were recorded ('#' = missing)
    triple = [[row[0].strip(), row[2], int(row[3] == '2')]
              for row in rows
              if row[3] != '#' and row[2] != '#']

for subid, age_at_scan, sex_code in triple:
    pheno[subid] = [age_at_scan, sex_code]

In [6]:
# Graph file names of the dataset, in sorted order
k = sorted(graphs['SWU4'])

# Keep every second file name (assumes the two scans of a subject sit next
# to each other after sorting -- TODO confirm against the file naming).
k_g1 = k[::2]

# Characters 6-10 of a file name are used as the key into `pheno`
# (presumably the subject ID -- confirm against the file naming scheme).
k_id = [name[6:11] for name in k_g1]

# Graphs that go with the kept file names
g1 = [graphs['SWU4'][name] for name in k_g1]

#Create vectors of labels, aligned with g1; each pheno entry is [age, sex]
age = [pheno[subid][0] for subid in k_id]
sex = [pheno[subid][1] for subid in k_id]

## ASSIGNMENT:  
(The code above is used to get the data into the correct format. Below is a simple example test with some rather silly features.)

In [14]:
# Build the feature matrix X: one feature vector per graph in g1

def _graph_features(graph):
    """
    Builds the feature vector of a single graph.

    The vector holds, in this order: log10(1 + column sum) of the
    adjacency matrix for every node, the clustering value of every node,
    the node connectivity of the graph, the triangle count of every node,
    and the sum of all adjacency-matrix entries.  Per-node values follow
    sorted node order, so a given column refers to the same node in every
    graph that shares the same node labels.
    """
    nodes = sorted(graph.nodes())
    matrix = nx.to_numpy_matrix(graph, nodelist=nodes) #this is how you go to a matrix

    featvec = []

    # Column sums of the adjacency matrix, log-scaled
    featvec.extend(np.ravel(np.log10(np.sum(matrix, 0) + 1)))

    # Index the per-node dictionaries by sorted node so that the column order
    # does not depend on the iteration order of the returned dictionaries.
    clustering = nx.clustering(graph)
    featvec.extend(clustering[node] for node in nodes)

    featvec.append(nx.node_connectivity(graph))

    triangles = nx.triangles(graph)
    featvec.extend(triangles[node] for node in nodes)

    # Plain float instead of a 0-d array keeps the vector a flat list of scalars
    featvec.append(float(np.sum(matrix)))

    return featvec


X = [_graph_features(graph) for graph in g1]
    


In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Samples 0-99 are used for training, samples 100-199 for testing
train_rows, test_rows = slice(0, 100), slice(100, 200)
X_train, Y_train = X[train_rows], sex[train_rows]
X_test, Y_test = X[test_rows], sex[test_rows]

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train and score ten separately fitted forests, then average the scores
accuracy = []
for _ in range(10): #performance will change over time
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X_train, Y_train)
    predicted = clf.predict(X_test)
    n_correct = np.sum(predicted == Y_test)
    accval = float(n_correct) / float(len(Y_test))
    accuracy.append(accval)
    print('Accuracy:',accval)

print('Overall Accuracy:',str(np.mean(accuracy)))

Accuracy: 0.56
Accuracy: 0.6
Accuracy: 0.58
Accuracy: 0.58
Accuracy: 0.57
Accuracy: 0.58
Accuracy: 0.57
Accuracy: 0.59
Accuracy: 0.61
Accuracy: 0.58
Overall Accuracy: 0.582


In [30]:
from sklearn import svm

# Fit an RBF-kernel support vector classifier once and report the fraction
# of test samples it labels correctly.
accuracy = []
for _ in range(1): #performance will change over time
    clf = svm.SVC(kernel='rbf')
    clf.fit(X_train, Y_train)
    predicted = clf.predict(X_test)
    n_correct = np.sum(predicted == Y_test)
    accval = float(n_correct) / float(len(Y_test))
    accuracy.append(accval)
    print('Accuracy:',accval)

print('Overall Accuracy:',str(np.mean(accuracy)))

Accuracy: 0.66
Overall Accuracy: 0.66


In [17]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained with SGD once and report the fraction of
# test samples it labels correctly.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss';
# 'log' is kept as written -- confirm against the installed version.
accuracy = []
for _ in range(1): #performance will change over time
    clf = SGDClassifier(loss='log')
    clf.fit(X_train, Y_train)
    predicted = clf.predict(X_test)
    n_correct = np.sum(predicted == Y_test)
    accval = float(n_correct) / float(len(Y_test))
    accuracy.append(accval)
    print('Accuracy:',accval)

print('Overall Accuracy:',str(np.mean(accuracy)))

Accuracy: 0.56
Overall Accuracy: 0.56


In [29]:
from sklearn.ensemble import GradientBoostingClassifier

# Train and score ten separately fitted gradient-boosting models, then
# average the scores.
accuracy = []
for _ in range(10): #performance will change over time
    clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=1)
    clf.fit(X_train, Y_train)
    predicted = clf.predict(X_test)
    n_correct = np.sum(predicted == Y_test)
    accval = float(n_correct) / float(len(Y_test))
    accuracy.append(accval)
    print('Accuracy:',accval)

print('Overall Accuracy:',str(np.mean(accuracy)))

Accuracy: 0.59
Accuracy: 0.56
Accuracy: 0.61
Accuracy: 0.61
Accuracy: 0.59
Accuracy: 0.6
Accuracy: 0.63
Accuracy: 0.63
Accuracy: 0.61
Accuracy: 0.6
Overall Accuracy: 0.603


In [24]:
from sklearn.neural_network import MLPClassifier

# Train and score ten separately fitted multi-layer perceptrons (two hidden
# layers of 100 units each), then average the scores.
accuracy = []
for _ in range(10): #performance will change over time
    clf = MLPClassifier(hidden_layer_sizes=(100,100), solver='adam', activation='relu')
    clf.fit(X_train, Y_train)
    predicted = clf.predict(X_test)
    n_correct = np.sum(predicted == Y_test)
    accval = float(n_correct) / float(len(Y_test))
    accuracy.append(accval)
    print('Accuracy:',accval)

print('Overall Accuracy:',str(np.mean(accuracy)))

Accuracy: 0.61
Accuracy: 0.64
Accuracy: 0.63
Accuracy: 0.6
Accuracy: 0.61
Accuracy: 0.65
Accuracy: 0.58
Accuracy: 0.64
Accuracy: 0.61
Accuracy: 0.66
Overall Accuracy: 0.623


In [None]:
# plot a graph
import matplotlib.pyplot as plt
%matplotlib inline

# convert to numpy
matrix = nx.to_numpy_matrix(g1[15], nodelist=sorted(g1[15].nodes())) #this is how you go to a matrix

plt.imshow(np.log10(matrix+1))
plt.colorbar()
plt.title('connectome example')
plt.show()

#compute mean connectome
mean_matrix = np.zeros((70,70),dtype=np.int16)
for i in range(0,len(d)):
    mean_matrix = mean_matrix + nx.to_numpy_matrix(g1[i], nodelist=sorted(g1[i].nodes()))
mean_matrix = mean_matrix / len(g1)

plt.imshow(np.log10(mean_matrix+1))
plt.colorbar()
plt.title('mean connectome')
plt.show()

#compute sex-conditional connectomes
male_matrix = np.zeros((70,70),dtype=np.int16)
male_count = 0
for i in range(0,len(d)):
    if(sex[i] == 0):
        male_matrix = male_matrix + nx.to_numpy_matrix(g1[i], nodelist=sorted(g1[i].nodes()))
        male_count = male_count + 1
male_matrix = male_matrix / male_count

plt.imshow(np.log10(male_matrix+1))
plt.colorbar()
plt.title('male mean connectome')
plt.show()

female_matrix = np.zeros((70,70),dtype=np.int16)
female_count = 0
for i in range(0,len(d)):
    if(sex[i] == 1):
        female_matrix = female_matrix + nx.to_numpy_matrix(g1[i], nodelist=sorted(g1[i].nodes()))
        female_count = female_count + 1
female_matrix = female_matrix / female_count

plt.imshow(np.log10(female_matrix+1))
plt.colorbar()
plt.title('female mean connectome')
plt.show()

print(male_count)
print(female_count)