In [1]:
import os

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
import dionysus as dion
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from pt_activation.models.ccff_fashion import CCFF

%load_ext autoreload
%autoreload 2

In [2]:
def create_filtrations(model, batch_size, up_to):
    device = torch.device("cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True}
    transform = transforms.Compose([transforms.ToTensor()])
    
    testset = datasets.FashionMNIST(root='../data/fashion', train=False,
                                           download=True, transform=transform)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                             shuffle=False, num_workers=2)

    model.eval()
    test_loss = 0
    correct = 0
    t = 0
    res_df = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output, hiddens = model(data, hiddens=True)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            for s in range(data.shape[0]):
                # check if this makes sense
                num_hids = len(hiddens)
                this_hiddens = [hiddens[i][s] for i in range(num_hids)]
                print('Filtration: {}'.format(s+t))
                f = model.compute_dynamic_filtration(data[s], this_hiddens)
                #
                row = {'filtration':f, 'loss':output.cpu().numpy()[s][0], 'class':target.cpu().numpy()[s], 'prediction':pred.cpu().numpy()[s][0]}
                res_df.append(row)

            t += batch_size
            if t >= up_to:
                break

    return pd.DataFrame(res_df)

In [3]:
model_location = '/home/tgebhart/projects/pt_activation/logdir/models/ccff_fashion.pt'
model = CCFF()
model.load_state_dict(torch.load(model_location))

In [4]:
res_df = create_filtrations(model, 50, 100)

Filtration: 0
filtration size 654439
Sorting filtration...
Filtration: 1
filtration size 722710
Sorting filtration...
Filtration: 2
filtration size 523027
Sorting filtration...
Filtration: 3
filtration size 530834
Sorting filtration...
Filtration: 4
filtration size 795514
Sorting filtration...
Filtration: 5
filtration size 546249
Sorting filtration...
Filtration: 6
filtration size 629207
Sorting filtration...
Filtration: 7
filtration size 731640
Sorting filtration...
Filtration: 8
filtration size 568319
Sorting filtration...
Filtration: 9
filtration size 659182
Sorting filtration...
Filtration: 10
filtration size 722299
Sorting filtration...
Filtration: 11
filtration size 613026
Sorting filtration...
Filtration: 12
filtration size 652230
Sorting filtration...
Filtration: 13
filtration size 544377
Sorting filtration...
Filtration: 14
filtration size 719386
Sorting filtration...
Filtration: 15
filtration size 554634
Sorting filtration...
Filtration: 16
filtration size 805184
Sorting filt

Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7fe08418e780>>
Traceback (most recent call last):
  File "/home/tgebhart/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
    self._shutdown_workers()
  File "/home/tgebhart/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
    self.worker_result_queue.get()
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/tgebhart/.local/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process

In [5]:
sample_graphs = []
for s in range(res_df.shape[0]):
    print(s)
    subgraphs = {}
    f = res_df['filtration'].iloc[s]
    m = dion.homology_persistence(f)
    dgms = dion.init_diagrams(m,f)
#     dion.plot.plot_diagram(dgms[0], show=True)
    for i,c in enumerate(m):
        if len(c) == 2:
            if f[c[0].index][0] in subgraphs:
                subgraphs[f[c[0].index][0]].add_edge(f[c[0].index][0],f[c[1].index][0],weight=f[i].data)
            else:
                eaten = False
                for k, v in subgraphs.items():
                    if v.has_node(f[c[0].index][0]):
                        v.add_edge(f[c[0].index][0], f[c[1].index][0], weight=f[i].data)
                        eaten = True
                        break
                if not eaten:
                    g = nx.Graph()
                    g.add_edge(f[c[0].index][0], f[c[1].index][0], weight=f[i].data)
                    subgraphs[f[c[0].index][0]] = g
                    
    sample_graphs.append(subgraphs)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [6]:
res_df.head()

Unnamed: 0,class,filtration,loss,prediction
0,9,"[(7147), (7316), (7147, 7316), (7317), (7147, ...",-2.570372,9
1,2,"[(3637), (18666), (3637, 18666), (3589), (1862...",5.579642,2
2,1,"[(7137, 7310), (7137, 7309), (3626), (18655), ...",3.290555,1
3,1,"[(7137), (7137, 7310), (3631), (18660), (3631,...",1.889922,1
4,6,"[(7143, 7308), (3374), (18425), (3374, 18425),...",5.580797,6


In [7]:
res_df[res_df['prediction'] != res_df['class']]

Unnamed: 0,class,filtration,loss,prediction
23,9,"[(3471), (18029), (3471, 18029), (3399), (1796...",-1.834304,5
25,4,"[(3357), (18408), (3357, 18408), (3501), (1854...",0.187749,2
29,3,"[(3322), (18377), (3322, 18377), (3298), (1835...",0.593333,4
40,6,"[(7143, 7308), (3586), (18619), (3586, 18619),...",9.498599,0
42,3,"[(3562), (18597), (3562, 18597), (3538), (1857...",3.927394,6
44,6,"[(3636), (18665), (3636, 18665), (18180), (363...",1.262733,4
48,2,"[(3492), (18533), (3492, 18533), (18555), (349...",-0.280827,4
49,2,"[(3637), (18666), (3637, 18666), (18181), (363...",1.411828,6
50,4,"[(3469), (18512), (3469, 18512), (3493), (1853...",4.850835,2
51,4,"[(3596), (18627), (3596, 18627), (3620), (1864...",0.322314,2


In [None]:
# options = {
#     'node_color': 'red',
#     'node_size': 2,
#     'width': 3,
#     'with_labels':True}
# nx.draw_random(subgraphs[243], **options)

In [None]:
ids = model.layerwise_ids()
ids

In [8]:
goi = 1

In [9]:
print(sample_graphs[goi].keys())

dict_keys([3637, 3589, 3613, 3565, 3541, 3517, 3349, 3493, 3325, 3420, 3396, 3421, 3469, 3301, 3445, 3397, 3262, 3238, 3372, 3277, 3204, 3348, 3180, 3252, 3324, 3228, 3276, 3300, 3214, 3179, 3253, 3203, 7313, 3155, 3661, 3428, 3452, 3190, 3464, 3488, 3404, 3194, 3156, 7316, 3620, 3166, 3229, 3440, 3380, 3590, 3170, 3614, 3499, 3523, 3638, 3585, 3571, 3566, 3595, 3237, 3475, 3542, 3356, 3416, 3596, 3287, 3644, 3154, 3518, 3213, 3561, 3205, 3451, 3188, 3392, 3263, 3366, 3494, 3408, 3572, 3212, 3384, 3129, 3128, 3537, 3236, 3189, 3332, 3489, 3260, 3513, 3284, 3470, 3165, 3427, 3335, 3393, 3323, 3308, 3465, 3446, 3164, 3575, 3131, 3551, 3142, 3127, 3124, 3130, 3441, 3146, 3360, 3125, 3367, 3122, 3422, 3126, 3662, 3643, 3381, 3357, 3558, 3123, 3534, 3527, 3510, 3403, 3386, 3181, 3322, 3388, 3552, 3387, 3486, 3434, 3436, 3554, 3557, 3458, 3435, 3460, 3482, 3555, 3508, 3507, 3553, 3556, 3484, 3459, 3509, 3506, 3437, 3483, 3530, 3485, 3531, 3533, 3411, 3532, 3461, 3576, 3413, 3412, 3462, 3389,

In [None]:
options = {
    'node_color': 'red',
    'node_size': 2,
    'width': 3,
    'with_labels':True}
nx.draw_spring(sample_graphs[goi][7317], **options)

In [None]:
gois = [sample_graphs[goi][k] for k in sample_graphs[goi].keys()]

In [None]:
all_goi = nx.compose_all(gois)

In [None]:
options = {
    'node_color': 'red',
    'node_size': 2,
    'width': 2,
    'with_labels':True}
nx.draw_spring(all_goi, **options)

In [None]:
from node2vec import Node2Vec

In [None]:
take = 10
embedding_info = []
for i in range(len(sample_graphs)):
    print('Sample: {}/{}'.format(i,len(sample_graphs)))
    subs = []
    for k in list(sample_graphs[i].keys())[:take]:
        node2vec = Node2Vec(sample_graphs[i][k], dimensions=64, walk_length=5, num_walks=5, workers=4, quiet=True)
        model = node2vec.fit(window=10, min_count=1, batch_words=4)
        model.wv.save_word2vec_format('../data/node2vec_temp')
        t = pd.read_csv('../data/node2vec_temp', delimiter=' ', names=['node']+list(range(64))).drop(0, axis=0)
        srt = sorted(sample_graphs[i][k].edges(data=True),key= lambda x: x[2]['weight'],reverse=True)
        life = srt[0][2]['weight'] - srt[-1][2]['weight']
        subs.append({'sample':i, 'generator':k, 'lifetime':life,'embedding':t})
    embedding_info.append(subs)

In [None]:
embedding_info[0][5]['embedding'][list(range(64))].values.mean(axis=0)

In [None]:
embedding_info[0][5]

In [None]:
dim = 64
avg_embds = np.zeros(shape=(len(embedding_info),dim))
for i in range(len(embedding_info)):
    print('{}/{}'.format(i, len(embedding_info)))
    gs = embedding_info[i]
    ws = np.zeros(len(gs))
    embds = np.zeros((len(gs),dim))
    for j in range(len(gs)):
        ws[j] = gs[j]['lifetime']
        embds[j,:] = gs[j]['embedding'][list(range(dim))].values.mean(axis=0)
    avg_embds[i,:] = np.dot(ws,embds)/np.sum(ws)

In [None]:
from sklearn.cluster import KMeans
centers = KMeans(n_clusters=10, random_state=0).fit_predict(avg_embds)

In [None]:
res_df.append

In [None]:
cluster_df = res_df.join(pd.DataFrame(centers))
cluster_df.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
X = avg_embds
y = res_df['class'].values

X_train = avg_embds[:-100,:]
y_train = res_df.iloc[:-100]['class'].values
X_test = avg_embds[-100:,:]
y_test = res_df.iloc[-100:]['class'].values

clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')


In [None]:
cross_val_score(clf, X, y, cv=10)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

In [None]:
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(clf, X, y, cv=10)

In [None]:
h = 0.2
reduced_data = PCA(n_components=2).fit_transform(X)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)
kmeans.fit(reduced_data)
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

In [None]:
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

In [10]:
take = -1
edges = set()
for i in range(len(sample_graphs)):
    for k in list(sample_graphs[i].keys())[:take]:
        for x in sample_graphs[i][k].edges(data=True):
            edge_name = str(x[0])+'-'+str(x[1])
            edges.add(edge_name)

In [11]:
edf = pd.DataFrame(np.zeros((len(sample_graphs),len(edges))), columns=list(edges))
for i in range(len(sample_graphs)):
    print('Sample: {}/{}'.format(i,len(sample_graphs)))
    for k in list(sample_graphs[i].keys())[:take]:
        for x in sample_graphs[i][k].edges(data=True):
            edge_name = str(x[0])+'-'+str(x[1])
            edf.iloc[i][edge_name] = x[2]['weight']

Sample: 0/100
Sample: 1/100
Sample: 2/100
Sample: 3/100
Sample: 4/100
Sample: 5/100
Sample: 6/100
Sample: 7/100
Sample: 8/100
Sample: 9/100
Sample: 10/100
Sample: 11/100
Sample: 12/100
Sample: 13/100
Sample: 14/100
Sample: 15/100
Sample: 16/100
Sample: 17/100
Sample: 18/100
Sample: 19/100
Sample: 20/100
Sample: 21/100
Sample: 22/100
Sample: 23/100
Sample: 24/100
Sample: 25/100
Sample: 26/100
Sample: 27/100
Sample: 28/100
Sample: 29/100
Sample: 30/100
Sample: 31/100
Sample: 32/100
Sample: 33/100
Sample: 34/100
Sample: 35/100
Sample: 36/100
Sample: 37/100
Sample: 38/100
Sample: 39/100
Sample: 40/100
Sample: 41/100
Sample: 42/100
Sample: 43/100
Sample: 44/100
Sample: 45/100
Sample: 46/100
Sample: 47/100
Sample: 48/100
Sample: 49/100
Sample: 50/100
Sample: 51/100
Sample: 52/100
Sample: 53/100
Sample: 54/100
Sample: 55/100
Sample: 56/100
Sample: 57/100
Sample: 58/100
Sample: 59/100
Sample: 60/100
Sample: 61/100
Sample: 62/100
Sample: 63/100
Sample: 64/100
Sample: 65/100
Sample: 66/100
Sampl

In [12]:
edf.head()

Unnamed: 0,2087-11985,4289-7236,2722-14989,1614-7770,2109-10555,1054-6286,2619-2572,1200-5960,1427-8545,438-1710,...,2283-11220,18325-3266,6425-6403,2347-12223,1291-5579,4900-5432,2377-2088,2262-11222,2893-15630,1517-8604
0,0.0,0.0,0.063712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.196058,0.0,0.0,0.084518,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.195629,0.0,0.199189,0.0,0.0,0.0,0.0,0.0,...,0.387696,1.551071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.250307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006034,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.086977,0.0,0.0,0.0,0.0,0.0,...,0.16792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X = edf.values
y = res_df['class'].values

In [14]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [15]:
clf = svm.SVC(gamma='scale', decision_function_shape='ovo')
cross_val_score(clf, X, y, cv=5)

array([0.29166667, 0.36363636, 0.28571429, 0.27777778, 0.26666667])

In [None]:
y_incorrect = res_df[res_df['class'] != res_df['prediction']]
y_correct = res_df[res_df['class'] == res_df['prediction']]
y_incorrect

In [None]:
X_correct = edf.iloc[y_correct.index]
X_incorrect = edf.iloc[y_incorrect.index]

In [None]:
clf.fit(X_correct.values, y_correct['class'].values) 

In [None]:
inc_preds = clf.predict(X_incorrect.values)

In [None]:
inc_preds

In [None]:
clf.support_vectors_.shape