<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
<title>Think your Data Different</title>
<meta name="HandheldFriendly" content="True" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="referrer" content="origin" />
<meta name="generator" content="Pelican" />
<link href="https://anotherdatum.com/node2vec.html" rel="canonical" />
<!-- Feed -->
<link href="https://anotherdatum.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate" title="Another Datum Full Atom Feed" />
<link href="https://anotherdatum.com/theme/css/style.css" type="text/css" rel="stylesheet" />
<!-- Code highlight color scheme -->
<link href="https://anotherdatum.com/theme/css/code_blocks/tomorrow.css" rel="stylesheet">
<!-- CSS specified by the user -->
<link href="https://anotherdatum.com/css/overrides.css" type="text/css" rel="stylesheet" />
<!-- Custom fonts -->
<link href='https://fonts.googleapis.com/css?family=Montserrat:400,300' rel='stylesheet' type='text/css' />
<link href="https://fonts.googleapis.com/css?family=Lato" rel="stylesheet" type="text/css" />
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet" type="text/css">
<link href='https://fonts.googleapis.com/css?family=Lora:400,700,400italic,700italic' rel='stylesheet' type='text/css'>
<link href='https://fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800' rel='stylesheet' type='text/css'>
<!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
<script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
<![endif]-->
<meta name="description" content="Learn how node2vec works, and what kind of information it captures that word2vec doesn’t — includes case study.">
<meta name="author" content="Yoel Zeldes">
<meta name="tags" content="deep learning">
<meta name="tags" content="word2vec">
<meta name="tags" content="node2vec">
<!-- Open Graph -->
<meta property="og:site_name" content="Another Datum"/>
<meta property="og:title" content="Think your Data Different"/>
<meta property="og:description" content="Learn how node2vec works, and what kind of information it captures that word2vec doesn’t — includes case study."/>
<meta property="og:locale" content="en_US"/>
<meta property="og:url" content="https://anotherdatum.com/node2vec.html"/>
<meta property="og:type" content="article"/>
<meta property="article:published_time" content="2019-01-21T23:00:00+02:00"/>
<meta property="article:modified_time" content=""/>
<meta property="article:author" content="https://anotherdatum.com/author/yoel-zeldes.html">
<meta property="article:publisher" content="https://www.facebook.com/yoel.zeldes" />
<meta property="article:section" content="node2vec"/>
<meta property="article:tag" content="deep learning"/>
<meta property="article:tag" content="word2vec"/>
<meta property="article:tag" content="node2vec"/>
<meta property="og:image" content="https://anotherdatum.com/images/node2vec/cover.png">
<!-- Twitter Card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:site" content="@YZeldes">
<meta name="twitter:title" content="Think your Data Different">
<meta name="twitter:url" content="https://anotherdatum.com/node2vec.html">
<meta name="twitter:image:src" content="https://anotherdatum.com/images/node2vec/cover.png">
<meta name="twitter:description" content="Learn how node2vec works, and what kind of information it captures that word2vec doesn’t — includes case study.">
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": "Article",
"name": "Think your Data Different",
"headline": "Think your Data Different",
"datePublished": "2019-01-21T23:00:00+02:00",
"dateModified": "",
"author": {
"@type": "Person",
"name": "Yoel Zeldes",
"url": "https://anotherdatum.com/author/yoel-zeldes.html"
},
"image": "https://anotherdatum.com/images/node2vec/cover.png",
"url": "https://anotherdatum.com/node2vec.html",
"description": "Learn how node2vec works, and what kind of information it captures that word2vec doesn\u2019t \u2014 includes case study."
}
</script>
</head>
<!-- TODO : Body class -->
<body class="home-template">
<nav id="menu">
<a class="close-button">Close</a>
<div class="nav-wrapper">
<p class="nav-label">Menu</p>
<ul>
<li><a href="https://anotherdatum.com">Posts</a></li>
<li><a href="https://anotherdatum.com/pages/about.html">about me</a></li>
<li><a href="https://anotherdatum.com/pages/resources.html">Resources</a></li>
</ul>
</div>
</nav>
<!-- Progressbar -->
<div class="progress-container">
<span class="progress-bar"></span>
</div>
<!-- Page Header -->
<!-- Set your background image for this header on the line below. -->
<header id="post-header" class="has-cover">
<div class="inner">
<nav id="navigation">
<span id="home-button" class="nav-button">
<a class="home-button" href="https://anotherdatum.com/" title="Home"><i class="ic ic-arrow-left"></i> Home</a>
</span>
<span id="menu-button" class="nav-button">
<a class="menu-button"><i class="ic ic-menu"></i> Menu</a>
</span>
</nav>
<h1 class="post-title">Think your Data Different</h1>
<!-- TODO : Proper class for headline -->
<span class="post-meta">
<time datetime="2019-01-21">21 January 2019</time>
</span>
<!-- TODO : Modified check -->
<div class="post-cover cover" style="background-image: url('https://anotherdatum.com/images/node2vec/cover.png')">
</div>
</header>
<section id="wrapper">
<a class="hidden-close"></a>
<!-- Post content -->
<main class="content" role="main">
<article class="post">
<div class="inner">
<section class="post-content">
<p>In the last couple of years deep learning (DL) has become a main enabler for
applications in many domains such as vision, NLP, audio, click stream data etc.
Recently researchers started to successfully apply deep learning methods to
graph datasets in domains like social networks, recommender systems and biology,
where data is inherently structured in a graphical way.</p>
<p>So how do Graph Neural Networks work? Why do we need them?</p>
<h1>The Premise of Deep Learning</h1>
<p>In machine learning tasks involving graphical data, we usually want to describe
each node in the graph in a way that allows us to feed it into some machine
learning algorithm. Without DL, one would have to manually extract features,
such as the number of neighbors a node has. But this is a laborious job.</p>
<p>This is where DL shines. It automatically exploits the structure of the graph in
order to extract features for each node. These features are called embeddings.</p>
<p>The interesting thing is, that even if you have absolutely no information about
the nodes, you can still use DL to extract embeddings. The structure of the
graph — that is, the connectivity patterns — holds valuable information.</p>
<p>So how can we use the structure to extract information? Can the context of each
node within the graph really help us?</p>
<h1>Learning from Context</h1>
<p>One well known algorithm that extracts information about entities using context
alone is
<a href="https://www.tensorflow.org/tutorials/representation/word2vec">word2vec</a>. The
input to word2vec is a set of sentences, and the output is an embedding for each
word. Similarly to the way text describes the context of each word via the words
surrounding it, graphs describe the context of each node via neighbor nodes.</p>
<p>While in text words appear in linear order, in graphs it’s not the case. There’s
no natural order between neighbor nodes. So we can’t use word2vec... Or can we?</p>
<h1>Reduction like a Badass Mathematician</h1>
<p>We can apply reduction from the graphical structure of our data into a linear
structure such that the information encoded in the graphical structure isn’t
lost. Doing so, we’ll be able to use good old word2vec.</p>
<p>The key point is to perform random walks in the graph. Each walk starts at a
random node, and performs a series of steps, where each step goes to a random
neighbor. Each random walk forms a sentence that can be fed into word2vec. This
algorithm is called <a href="https://snap.stanford.edu/node2vec/">node2vec</a>. There are
more details in the process, which you can read about in the <a href="https://arxiv.org/abs/1607.00653">original
paper</a>.</p>
<hr>
<h1>Case study</h1>
<p>Taboola’s content recommender system gathers lots of data, some of which can be
represented in a graphical manner. Let’s inspect one type of data as a case
study for using node2vec.</p>
<p>Taboola recommends articles in a widget shown in publishers’ websites:</p>
<p><img alt="A Taboola widget recommending articles on a publisher’s website" src="images/node2vec/widget.png"></p>
<p>Each article has named entities — the entities described by the title. For
example, the item “the cutest dogs on the planet” contains the entities “dog”
and “planet”. Each named entity can appear in many different items.</p>
<p>We can describe this relationship using a graph in the following way: each node
will be a named entity, and there will be an edge between two nodes if the two
named entities appear in the same item:</p>
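<p><img alt="Graph whose nodes are named entities, with an edge between two entities that appear in the same item" src="images/node2vec/named-entities-graph.png"></p>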
<p>Now that we are able to describe our data in a graphical manner, let’s run
node2vec to see what insights we can learn out of the data. You can find the
working code <a href="https://github.com/taboola/node2vec-example">here</a>.</p>
<p>After learning node embeddings, we can use them as features for a downstream
task, e.g. CTR (Click Through Rate) prediction. Although it could benefit the
model, it’ll be hard to understand the qualities learned by node2vec.</p>
<p>Another option would be to cluster similar embeddings together using
<a href="https://en.wikipedia.org/wiki/K-means_clustering">K-means</a>, and color the nodes
according to their associated cluster:</p>
<p><img alt="The named-entity graph with nodes colored by the K-means cluster of their node2vec embeddings" src="images/node2vec/node2vec-clusters.png"></p>
<p>Cool! The clusters captured by node2vec seem to be homogeneous. In other words,
nodes that are close to each other in the graph are also close to each other in
the embedding space. Take for instance the orange cluster — all of its named
entities are related to basketball.</p>
<p>You might wonder what is the benefit of using node2vec over classical graphical
algorithms, such as community detection algorithms (e.g., the <a href="https://arxiv.org/abs/cond-mat/0308217">Girvan-Newman
algorithm</a>). Capturing the community
each node belongs to can definitely be done using such algorithms, there’s
nothing wrong with it. Actually, that’s exactly feature engineering. And we
already know that DL can save you the time of carefully handcrafting such
features. So why not enjoy this benefit? We should also keep in mind that
node2vec learns high dimensional embeddings. These embeddings are much richer
than merely community belonging.</p>
<h1>Taking Another Approach</h1>
<p>Using node2vec in this use case might not be the first idea that comes to mind.
One might suggest to simply use word2vec, where each sentence is the sequence of
named entities inside a single item. In this approach we don’t treat the data as
having a graphical structure. So what’s the difference between this approach —
which is valid — and node2vec?</p>
<p>If we think about it, each sentence we generate in the word2vec approach is a
walk in the graph we’ve defined earlier. node2vec also defines walks on the same
graph. So they are the same, right? Let’s have a look at the clusters we get by
the word2vec approach:</p>
<p><img alt="The named-entity graph with nodes colored by the cluster of their word2vec embeddings" src="images/node2vec/word2vec-clusters.png"></p>
<p>Now the “basketball” cluster is less homogeneous — it contains both orange and
blue nodes. The named entity “Basketball” for example was colored orange, while
the basketball players “Lebron James” and “Kobe Bryant” were colored blue!</p>
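<p><img alt="Labeled word2vec clusters: Basketball is colored orange, while Lebron James and Kobe Bryant are colored blue" src="images/node2vec/word2vec-labeled.png"></p>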
<p>But why did this happen?</p>
<p>In this approach each walk in the graph is composed only of named entities that
appear together in a single item. It means we are limited to walks that don’t go
further than distance 1 from the starting node. In node2vec, we don’t have that
limit.<br>
Since each approach uses a different kind of walks, the learned
embeddings capture a different kind of information.</p>
<p>To make it more concrete, consider the following example: say we have two items
— one with named entities A, B, C and another with D, B, E. These items induce
the following graph:</p>
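<p><img alt="Graph induced by the two items: A, B and C are connected to each other, D, B and E are connected to each other, and B is the only shared node" src="images/node2vec/graph.png"></p>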
<p>In the simple word2vec approach we’ll generate the following sentences: [A, B,
C] and [D, B, E]. In the node2vec approach we could also get sentences like [A,
B, E]. If we feed the latter into the training process, we’ll learn that E and
C are interchangeable: the prefix [A, B] will be able to predict both C and E.
Therefore, C and E will get similar embeddings, and will be clustered together.</p>
<hr>
<h1>Takeaway</h1>
<p>Using the right data structure to represent your data is important. Each data
structure implies a different learning algorithm, or in other words — introduces
a different inductive bias.</p>
<p>Identifying that your data has a certain structure, so that you can use the
right tool for the job, might be challenging.</p>
<p>Since so many real world datasets are naturally represented as graphs, we think
Graph Neural Networks are a must-have in our tool box as data scientists.</p>
<hr>
<p><em>Originally published at
</em><a href="https://engineering.taboola.com/think-data-different">engineering.taboola.com</a>
by me and <a href="https://medium.com/@kozohar">Zohar Komarovsky</a>.</p>
</section>
<section class="post-info">
<div class="post-share">
<a class="twitter" href="https://twitter.com/share?text=Think%20your%20Data%20Different&amp;url=https://anotherdatum.com/node2vec.html" onclick="window.open(this.href, 'twitter-share', 'width=550,height=235');return false;">
<i class="ic ic-twitter"></i><span class="hidden">Twitter</span>
</a>
<a class="facebook" href="https://www.facebook.com/sharer/sharer.php?u=https://anotherdatum.com/node2vec.html" onclick="window.open(this.href, 'facebook-share','width=580,height=296');return false;">
<i class="ic ic-facebook"></i><span class="hidden">Facebook</span>
</a>
<div class="clear"></div>
</div>
<aside class="post-tags">
<a href="https://anotherdatum.com/tag/deep-learning.html">deep learning</a><a href="https://anotherdatum.com/tag/word2vec.html">word2vec</a><a href="https://anotherdatum.com/tag/node2vec.html">node2vec</a> </aside>
<div class="clear"></div>
</section>
<!-- Begin MailChimp Signup Form -->
<link href="https://cdn-images.mailchimp.com/embedcode/classic-10_7.css" rel="stylesheet" type="text/css">
<style type="text/css">
#mc_embed_signup{background:#fff; clear:left; font:14px Helvetica,Arial,sans-serif; width:300px;}
#mc_embed_signup form{padding: 0;}
/* Add your own MailChimp form style overrides in your site stylesheet or in this style block.
We recommend moving this block and the preceding CSS link to the HEAD of your HTML file. */
</style>
<div id="mc_embed_signup">
<form action="https://anotherdatum.us14.list-manage.com/subscribe/post?u=6894d7badcfb253606fa3fb54&id=c6f34ad6b7" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate" target="_blank" novalidate>
<div id="mc_embed_signup_scroll">
<h2>Get notified of new posts</h2>
<div class="mc-field-group">
<label for="mce-EMAIL">Email Address </label>
<input type="email" value="" name="EMAIL" class="required email" id="mce-EMAIL">
</div>
<div id="mce-responses" class="clear">
<div class="response" id="mce-error-response" style="display:none"></div>
<div class="response" id="mce-success-response" style="display:none"></div>
</div> <!-- real people should not fill this in and expect good things - do not remove this or risk form bot signups-->
<div style="position: absolute; left: -5000px;" aria-hidden="true"><input type="text" name="b_6894d7badcfb253606fa3fb54_c6f34ad6b7" tabindex="-1" value=""></div>
<div class="clear"><input type="submit" value="Subscribe" name="subscribe" id="mc-embedded-subscribe" class="button"></div>
</div>
</form>
</div>
<!-- Mailchimp form validation. Loaded over explicit https:// rather than a
     protocol-relative URL, so it also resolves when the page is not served over https. -->
<script type='text/javascript' src='https://s3.amazonaws.com/downloads.mailchimp.com/js/mc-validate.js'></script>
<script type='text/javascript'>
// Publish the merge-field names and types of the signup form as window-level arrays.
// NOTE(review): `jQuery` here is presumably the copy bundled with mc-validate.js,
// since the page's own jQuery <script> only appears further down — confirm.
(function($) {
  window.fnames = new Array();
  window.ftypes = new Array();
  fnames[0]='EMAIL';ftypes[0]='email';
  fnames[1]='FNAME';ftypes[1]='text';
  fnames[2]='LNAME';ftypes[2]='text';
}(jQuery));
// Keep a private handle in $mcj and release the global `$` / `jQuery` names.
var $mcj = jQuery.noConflict(true);
</script>
<!--End mc_embed_signup-->
<hr />
<aside class="post-nav">
<a class="post-nav-next" href="https://anotherdatum.com/preparing-for-the-unexpected.html">
<section class="post-nav-teaser">
<i class="ic ic-arrow-left"></i>
<h2 class="post-nav-title">Preparing for the Unexpected</h2>
<p class="post-nav-excerpt">How to apply your model to input it has never seen before.</p>
</section>
</a>
<a class="post-nav-prev" href="https://anotherdatum.com/branding.html">
<section class="post-nav-teaser">
<i class="ic ic-arrow-right"></i>
<h2 class="post-nav-title">How to Build Your Personal Brand as a Data Scientist</h2>
<p class="post-nav-excerpt">A couple of months ago I embarked on a journey to build my personal brand as a data...</p>
</section>
</a>
<div class="clear"></div>
</aside>
<div class="comments">
<h2>Comments !</h2>
<div id="disqus_thread"></div>
<script type="text/javascript">
// Page-level Disqus settings, declared as globals before the embed script is injected.
// NOTE(review): presumably read by embed.js to pick the forum and thread — see Disqus docs.
var disqus_shortname = 'anotherdatum';
var disqus_identifier = 'node2vec.html';
var disqus_url = 'https://anotherdatum.com/node2vec.html';
// Inject the Disqus embed script asynchronously; it is appended to <head>,
// falling back to <body> if no <head> element is found.
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
// Explicit https:// (not protocol-relative) and built from disqus_shortname,
// the same way the comment-count loader at the end of the page builds its URL.
dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
<noscript>Please enable JavaScript to view the comments.</noscript>
</div>
</div>
</article>
</main>
<!-- TODO : Body class -->
<div id="body-class" style="display: none;" class=""></div>
<footer id="footer">
<div class="social">
<a href="https://il.linkedin.com/in/yoelzeldes">
<span class="fa-stack fa-lg">
<i class="fa fa-circle fa-stack-2x"></i>
<i class="fa fa-linkedin fa-stack-1x fa-inverse"></i>
</span>
</a>
<a href="https://github.com/yoel-zeldes">
<span class="fa-stack fa-lg">
<i class="fa fa-circle fa-stack-2x"></i>
<i class="fa fa-github fa-stack-1x fa-inverse"></i>
</span>
</a>
<a href="https://www.facebook.com/yoel.zeldes">
<span class="fa-stack fa-lg">
<i class="fa fa-circle fa-stack-2x"></i>
<i class="fa fa-facebook fa-stack-1x fa-inverse"></i>
</span>
</a>
<a href="https://twitter.com/YZeldes">
<span class="fa-stack fa-lg">
<i class="fa fa-circle fa-stack-2x"></i>
<i class="fa fa-twitter fa-stack-1x fa-inverse"></i>
</span>
</a>
</div>
<div class="inner">
<section class="credits">
<span class="credits-theme">Have a look at <a href="https://github.com/yoel-zeldes/yoel-zeldes.github.io/tree/source">the source code</a> of this blog.</span>
</section>
</div>
</footer>
</section>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
<script type="text/javascript" src="https://anotherdatum.com/theme/js/script.js"></script>
<!-- Global Site Tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-83684090-1"></script>
<script>
// Bootstrap for the gtag.js loader included by the <script async> tag just above.
// Reuse window.dataLayer if it already exists, otherwise start with an empty array.
window.dataLayer = window.dataLayer || [];
// gtag() queues each call by pushing its raw `arguments` object onto dataLayer.
function gtag(){dataLayer.push(arguments);}
// Queue a 'js' entry carrying the current time.
gtag('js', new Date());
// Queue the config entry for property UA-83684090-1 with the anonymize_ip flag set.
gtag('config', 'UA-83684090-1', { 'anonymize_ip': true });
</script>
<script type="text/javascript">
// Disqus forum shortname; used below to build the count.js URL.
var disqus_shortname = 'anotherdatum';
// Inject the Disqus comment-count script asynchronously; it is appended to <head>,
// falling back to <body> if no <head> element is found.
(function () {
var s = document.createElement('script'); s.async = true;
s.type = 'text/javascript';
// Explicit https:// instead of a protocol-relative URL, so the script also
// resolves when the page is not served over https.
s.src = 'https://' + disqus_shortname + '.disqus.com/count.js';
(document.getElementsByTagName('HEAD')[0] || document.getElementsByTagName('BODY')[0]).appendChild(s);
}());
</script>
</body>
</html>