/
plot.py
94 lines (65 loc) · 2.43 KB
/
plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import altair as alt
import pandas as pd
import numpy as np
# Row cap used by plot_points when sampling a frame for display, and the
# default t-SNE sample size in plot_tsne.
MAX_POINTS = 2500
def activate():
    """No-op hook called at the start of every plotting helper in this module.

    It currently does nothing and returns ``None``.
    NOTE(review): presumably a placeholder for renderer/backend setup -- confirm.
    """
    return None
def plot_points(df, **kwargs):
    """Draw a random subset of *df* as an interactive Altair point chart.

    Args:
        df: DataFrame to plot.
        **kwargs: Forwarded unchanged to ``alt.Chart.encode`` (e.g. ``x=``,
            ``y=``, ``color=``).

    Returns:
        The interactive Altair chart built from at most ``MAX_POINTS`` rows
        of *df*. The rows are drawn with ``DataFrame.sample`` and no fixed
        random state, so the subset can differ between calls.
    """
    activate()
    # Never ask for more rows than the frame holds.
    row_count = min(len(df), MAX_POINTS)
    subset = df.sample(row_count)
    chart = alt.Chart(subset)
    chart = chart.encode(**kwargs)
    chart = chart.mark_point()
    return chart.interactive()
def plot_pca(df, input_column, **kwargs):
    """Project one column of *df* to two dimensions with PCA and plot it.

    PCA is fitted on every row of *df*; any down-sampling for display is
    done afterwards by ``plot_points``.

    Args:
        df: DataFrame containing the data.
        input_column: Name of the column whose values are fed to PCA, after
            ``func`` has been applied to each of them.
        **kwargs: ``func`` (optional) is consumed here: a callable applied to
            every value of ``input_column`` before stacking them into an
            array; it defaults to the identity. All remaining keyword
            arguments are forwarded to ``plot_points``.

    Returns:
        The chart returned by ``plot_points``. The plotted frame is *df* with
        its index reset (the old index becomes a column) plus the two PCA
        components in columns ``"x"`` and ``"y"``.
    """
    import sklearn.decomposition

    DIMENSIONS = 2
    activate()
    # Remove "func" so it is not passed on as a chart encoding.
    func = kwargs.pop("func", lambda value: value)
    vectors = np.array([func(value) for value in df[input_column].values])
    projected = sklearn.decomposition.PCA(DIMENSIONS).fit_transform(vectors)
    # reset_index() gives df the same 0..n-1 index as the projection frame,
    # so the column-wise concat lines the rows up one to one.
    pca_data = pd.concat(
        [df.reset_index(), pd.DataFrame(projected, columns=["x", "y"])],
        axis=1,
    )
    return plot_points(pca_data, **kwargs)
def plot_tsne(df, input_column, **kwargs):
    """Embed a random sample of one column of *df* with t-SNE and plot it.

    Args:
        df: DataFrame containing the data.
        input_column: Name of the column whose values are fed to t-SNE, after
            ``func`` has been applied to each of them.
        **kwargs: Two optional keys are consumed here; everything else is
            forwarded to ``plot_points``.

            * ``tsne_sample``: number of rows to sample before running t-SNE
              (defaults to ``MAX_POINTS``). Values larger than ``len(df)``
              are clamped to ``len(df)``.
            * ``func``: callable applied to every value of ``input_column``
              before stacking them into an array; defaults to the identity.

    Returns:
        The chart returned by ``plot_points``. The plotted frame is the
        sampled rows with their index reset (the old index becomes a column)
        plus the embedding in columns ``"x"`` and ``"y"``.
    """
    import sklearn.manifold

    activate()
    samples = kwargs.pop("tsne_sample", MAX_POINTS)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp the request the same way
    # plot_points does.
    sdf = df.sample(min(len(df), samples))
    # Remove "func" so it is not passed on as a chart encoding.
    func = kwargs.pop("func", lambda value: value)
    vectors = np.array([func(value) for value in sdf[input_column].values])
    tsne = sklearn.manifold.TSNE()
    embedded = tsne.fit_transform(vectors)
    # The sample keeps its original (shuffled) index; reset it so the
    # column-wise concat lines the rows up with the 0..n-1 embedding frame.
    tsne_plot_data = pd.concat(
        [sdf.reset_index(), pd.DataFrame(embedded, columns=["x", "y"])],
        axis=1,
    )
    return plot_points(tsne_plot_data, **kwargs)
def confusion_matrix(actuals, predictions, labels=None, width=215, height=215):
    """Build a confusion-matrix table and an Altair heat-map for it.

    Args:
        actuals: Ground-truth labels.
        predictions: Predicted labels, in the same order as *actuals*.
        labels: Labels to include, in matrix order. When ``None`` they are
            derived from the values present in *predictions* and *actuals*.
        width: Chart width passed to ``Chart.properties``.
        height: Chart height passed to ``Chart.properties``.

    Returns:
        A ``(cmdf, chart)`` tuple. ``cmdf`` has one row per
        (actual, predicted) pair with columns ``predicted``, ``actual``,
        ``raw_count`` (number of samples) and ``value`` (``raw_count``
        divided by the total number of samples of that actual label, or 0
        when the label has no samples). ``chart`` is a rect chart of
        ``value`` with ``raw_count`` as tooltip.
    """
    activate()
    # Imported under an alias because this function shadows the sklearn name.
    from sklearn.metrics import confusion_matrix as sk_cm

    if labels is None:
        from sklearn.utils.multiclass import unique_labels

        labels = unique_labels(predictions, actuals)
    lc = len(labels)

    # sklearn convention: rows are actual labels, columns are predicted ones.
    ccm = sk_cm(actuals, predictions, labels=labels)

    # Normalise each row by its total; rows without any samples stay at 0
    # instead of producing NaN and a divide-by-zero warning.
    row_totals = ccm.sum(axis=1, keepdims=True)
    ncm = np.divide(
        ccm.astype(float),
        row_totals,
        out=np.zeros(ccm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Row index -> actual label, column index -> predicted label.
    records = [
        {
            'predicted': labels[col],
            'actual': labels[row],
            'raw_count': ccm[row, col],
            'value': ncm[row, col],
        }
        for row in range(lc)
        for col in range(lc)
    ]
    cmdf = pd.DataFrame(records)

    c = alt.Chart(cmdf).mark_rect().encode(
        x='predicted:O',
        y='actual:O',
        color='value:Q',
        tooltip=["raw_count:Q"]
    ).properties(width=width, height=height)
    return (cmdf, c)