Skip to content
Browse files

Cleaned up and organized old network examples; added a new example to normalize the numeric column of a bag of tuples
  • Loading branch information...
1 parent 4606b96 commit d7b966c8b006e18fb41d0692cfbe49688ee1ac1e @thedatachef committed Jun 29, 2010
View
0 seinfeld_network.tsv → data/seinfeld_network.tsv
File renamed without changes.
View
8 data/timeseries.tsv
@@ -0,0 +1,8 @@
+200912 12
+201001 9
+201002 2
+201003 7
+201004 13
+201005 10
+201006 5
+201007 8
View
24 degree_dist_bug.pig
@@ -1,24 +0,0 @@
-%default NETWORK 'seinfeld_network.tsv'
-%default GRAPHCOUNTS 'seinfeld_degree_dist.tsv'
-
-links = LOAD '$NETWORK' AS (node_a:chararray, node_b:chararray);
--- get out degree distribution
-out_list = GROUP links BY node_a;
-out_counts = FOREACH out_list GENERATE group AS node, COUNT(links) AS num_out_links;
-
--- get in degree distribution
-in_list = GROUP links BY node_b;
-in_counts = FOREACH in_list GENERATE group AS node, COUNT(links) AS num_in_links;
-
--- join together
-joined = JOIN out_counts BY node, in_counts BY node;
-node_counts = FOREACH joined GENERATE
- out_counts::node AS node,
- out_counts::num_out_links AS num_out_links,
- in_counts::num_in_links AS num_in_links,
- ((float)num_out_links/(float)num_in_links) AS ratio
- ;
-ordered = ORDER node_counts BY ratio DESC;
-
-rmf $GRAPHCOUNTS;
-STORE ordered INTO '$GRAPHCOUNTS';
View
22 degree_distribution.pig
@@ -1,5 +1,5 @@
-%default PAIRS 'seinfeld_network.tsv'
-%default GRAPHCOUNTS 'seinfeld_network_deg_dist.tsv'
+%default PAIRS 'data/seinfeld_network.tsv'
+%default GRAPHCOUNTS 'data/seinfeld_network_deg_dist.tsv'
pairs = LOAD '$PAIRS' AS (node_a:chararray, node_b:chararray);
out_list = GROUP pairs BY node_a;
@@ -8,15 +8,15 @@ in_list = GROUP pairs BY node_b;
in_counts = FOREACH in_list GENERATE group AS node, COUNT(pairs) AS num_in_links;
joined = JOIN out_counts BY node FULL OUTER, in_counts BY node;
node_counts = FOREACH joined
- {
- node_name = (out_counts::node IS NOT NULL?out_counts::node:in_counts::node);
- GENERATE
- node_name AS node,
- out_counts::num_out_links AS num_out_links,
- in_counts::num_in_links AS num_in_links,
- ((float)num_out_links/(float)num_in_links) AS ratio
- ;
-};
+ {
+ node_name = (out_counts::node IS NOT NULL?out_counts::node:in_counts::node);
+ GENERATE
+ node_name AS node,
+ out_counts::num_out_links AS num_out_links,
+ in_counts::num_in_links AS num_in_links,
+ ((float)num_out_links/(float)num_in_links) AS ratio
+ ;
+ };
ordered = ORDER node_counts BY ratio DESC;
rmf $GRAPHCOUNTS;
View
0 pig-mode.el → emacs/pig-mode.el
File renamed without changes.
View
4 extract_n1_clique.pig
@@ -1,5 +1,5 @@
-%default NETWORK 'seinfeld_network.tsv'
-%default OUT_CLIQUE 'seinfeld_n1_clique'
+%default NETWORK 'data/seinfeld_network.tsv'
+%default OUT_CLIQUE 'data/seinfeld_n1_clique'
%default N0_SEED 'kramer'
links = LOAD '$NETWORK' AS (node_a:chararray, node_b:chararray);
View
11 normalize_databag.pig
@@ -0,0 +1,11 @@
+-- normalize_databag.pig
+-- Read in a bag of tuples (timeseries for this example) and divide the
+-- numeric column by its maximum.
+-- Pass the DATABAG parameter to override the input path; rows are loaded as (month, count).
+%default DATABAG 'data/timeseries.tsv'
+
+data = LOAD '$DATABAG' AS (month:chararray, count:int);
+accumulate = GROUP data ALL; -- put every row into a single group so MAX can see the whole column
+calc_max = FOREACH accumulate GENERATE FLATTEN(data), MAX(data.count) AS max_count; -- back to one row per input tuple, each carrying the overall maximum
+normalize = FOREACH calc_max GENERATE data::month AS month, data::count AS count, (float)data::count / (float)max_count AS normed_count; -- cast both ints to float so the ratio is not an integer division
+DUMP normalize; -- print the normalized rows rather than storing them
View
0 talk.odp → notes/talk.odp
File renamed without changes.
View
5 symmetrize_links.pig
@@ -1,5 +1,6 @@
-%default NETWORK 'seinfeld_network.tsv'
-%default SYM 'seinfeld_network_symmetric.tsv'
+%default NETWORK 'data/seinfeld_network.tsv'
+%default SYM 'data/seinfeld_network_symmetric.tsv'
+
links = LOAD '$NETWORK' AS (node_a:chararray, node_b:chararray);
ordered = FOREACH links -- order pairs alphabetically
{

0 comments on commit d7b966c

Please sign in to comment.
Something went wrong with that request. Please try again.