Updating pig scripts

commit 121b57828b04c5c9b8cf61b63bae2fb77050a7c5 (1 parent: a482ab2), committed by @xstevens
src/main/pig/generate_ngram_feature_index.pig (21 lines changed)
@@ -1,22 +1,25 @@
-register './akela-0.1.jar'
+register './akela-0.2-SNAPSHOT.jar'
+register './grouperfish-0.3-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'
SET default_parallel 7;
-SET pig.splitCombination 'false';
-%default INPUT 'opinions.tsv'
+%default INPUT 'hbase://grouperfish'
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'false'
%default FREQ_OUTPUT 'ngram-feature-freq'
%default OUTPUT 'ngram-feature-index'
-raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,language:chararray,text:chararray);
+raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
+genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
+/*raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,language:chararray,text:chararray);*/
+
grouped_raw = GROUP raw ALL;
ndocs = FOREACH grouped_raw GENERATE COUNT(raw);
/* Get all of the unigrams */
-tokenized = FOREACH raw GENERATE doc_id,FLATTEN(com.mozilla.pig.eval.text.Tokenize(text, '$STOPWORDS', '$STEM')) AS token:chararray;
+tokenized = FOREACH genmap GENERATE doc_id,FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text', '$STOPWORDS', '$STEM')) AS token:chararray;
grouped_words = GROUP tokenized BY token;
word_freq = FOREACH grouped_words GENERATE FLATTEN($0) AS word:chararray, COUNT($1) as count;
/* filter on minDF = (count) > 10 AND maxDF % = (count/ndocs) < 0.9 */
@@ -24,7 +27,7 @@ filtered_freq = FILTER word_freq BY SIZE(word) > 1 AND count > 10 AND ((double)c
unigram_index = FOREACH filtered_freq GENERATE word;
/* Get all of the bi-grams */
-bigram_tokenized = FOREACH raw GENERATE doc_id,FLATTEN(com.mozilla.pig.eval.text.NGramTokenize(text, '$STOPWORDS', '$STEM', 'false', '2', '2')) AS token:chararray;
+bigram_tokenized = FOREACH genmap GENERATE doc_id,FLATTEN(com.mozilla.grouperfish.pig.eval.text.NGramTokenize(json_map#'text', '$STOPWORDS', '$STEM', 'false', '2', '2')) AS token:chararray;
bigram_grouped_words = GROUP bigram_tokenized BY token;
bigram_word_freq = FOREACH bigram_grouped_words GENERATE FLATTEN($0) AS ngram:chararray, COUNT($1) as count;
/* filter on minDF = (count) > 100 AND maxDF % = (count/ndocs) < 0.9 */
@@ -32,7 +35,7 @@ bigram_filtered_freq = FILTER bigram_word_freq BY SIZE(ngram) > 3 AND count > 10
bigram_index = FOREACH bigram_filtered_freq GENERATE ngram;
/* Get all of the tri-grams */
-trigram_tokenized = FOREACH raw GENERATE doc_id,FLATTEN(com.mozilla.pig.eval.text.NGramTokenize(text, '$STOPWORDS', '$STEM', 'false', '3', '3')) AS token:chararray;
+trigram_tokenized = FOREACH genmap GENERATE doc_id,FLATTEN(com.mozilla.grouperfish.pig.eval.text.NGramTokenize(json_map#'text', '$STOPWORDS', '$STEM', 'false', '3', '3')) AS token:chararray;
trigram_grouped_words = GROUP trigram_tokenized BY token;
trigram_word_freq = FOREACH trigram_grouped_words GENERATE FLATTEN($0) AS ngram:chararray, COUNT($1) as count;
/* filter on minDF = (count) > 100 AND maxDF % = (count/ndocs) < 0.9 */
@@ -40,7 +43,7 @@ trigram_filtered_freq = FILTER trigram_word_freq BY SIZE(ngram) > 5 AND count >
trigram_index = FOREACH trigram_filtered_freq GENERATE ngram;
/* Discard bigrams that are represented by trigrams */
-bigrams_from_trigrams = FOREACH trigram_index GENERATE FLATTEN(com.mozilla.pig.eval.text.NGramTokenize(ngram, '$STOPWORDS', '$STEM', 'false', '2', '2')) AS token:chararray;
+bigrams_from_trigrams = FOREACH trigram_index GENERATE FLATTEN(com.mozilla.grouperfish.pig.eval.text.NGramTokenize(ngram, '$STOPWORDS', '$STEM', 'false', '2', '2')) AS token:chararray;
filtered_bigrams_from_trigrams = FILTER bigrams_from_trigrams BY SIZE(token) > 3;
uniq_bigrams_from_trigrams = DISTINCT filtered_bigrams_from_trigrams;
@@ -52,7 +55,7 @@ final_bigram_index = FOREACH symm_diff_bigrams GENERATE group AS ngram:chararray
/* Discard unigrams that are represented by trigrams or bigrams */
bigrams_and_trigrams = UNION final_bigram_index, trigram_index;
-unigrams_from_ngrams = FOREACH bigrams_and_trigrams GENERATE FLATTEN(com.mozilla.pig.eval.text.Tokenize(ngram, '$STOPWORDS', '$STEM')) AS word:chararray;
+unigrams_from_ngrams = FOREACH bigrams_and_trigrams GENERATE FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(ngram, '$STOPWORDS', '$STEM')) AS word:chararray;
filtered_unigrams_from_ngrams = FILTER unigrams_from_ngrams BY SIZE(word) > 1;
uniq_unigrams_from_ngrams = DISTINCT filtered_unigrams_from_ngrams;
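Note on the new load path in this script: after the switch to HBaseStorage, genmap only projects the json_map field, so the doc_id referenced in the tokenize statements has to come out of the JSON document itself. A minimal sketch of that extraction, assuming the stored JSON carries an 'id' key (that key name is an assumption, not something this commit defines):

/* Hypothetical sketch: project the document id and text out of the JSON map
   before tokenizing. The 'id' key is assumed, not defined by this commit. */
raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
docs = FOREACH genmap GENERATE (chararray)json_map#'id' AS doc_id, (chararray)json_map#'text' AS text;
tokenized = FOREACH docs GENERATE doc_id, FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(text, '$STOPWORDS', '$STEM')) AS token:chararray;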
src/main/pig/generate_tf_document_vectors.pig (17 lines changed)
@@ -1,4 +1,4 @@
-register './akela-0.1.jar'
+register './akela-0.2-SNAPSHOT.jar'
register './lib/lucene-core-3.1.0.jar'
register './lib/lucene-analyzers-3.1.0.jar'
register './lib/mahout-core-0.5.jar'
@@ -7,25 +7,26 @@ register './lib/mahout-utils-0.5.jar'
register './lib/mahout-collections-1.0.jar'
SET default_parallel 7;
-SET pig.splitCombination 'false';
-%default INPUT 'opinions.tsv'
+%default INPUT 'hbase://grouperfish'
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'true'
%default FEATUREINDEX 'feature-index'
%default OUTPUT 'document-vectors-tf'
-raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray);
-filtered_raw = FILTER raw BY locale == 'en-US' AND praise_issue == 'issue' AND version == '5.0';
-tokenized = FOREACH filtered_raw GENERATE doc_id,com.mozilla.pig.eval.text.Tokenize(text,'$STOPWORDS', '$STEM') AS token_bag;
+raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
+genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
+/*raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray);*/
+
+tokenized = FOREACH genmap GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text','$STOPWORDS', '$STEM') AS token_bag;
/* Comment out the line above and uncomment the line below if you are using an ngram feature-index */
/*tokenized = FOREACH filtered_raw GENERATE doc_id,com.mozilla.pig.eval.text.NGramTokenize(text,'$STOPWORDS', '$STEM', 'true') AS token_bag;*/
filtered_tokenized = FILTER tokenized BY SIZE(token_bag) > 1;
-doc_vectors = FOREACH filtered_tokenized GENERATE doc_id,com.mozilla.pig.eval.text.TermFrequency(token_bag) AS tf_bag;
+doc_vectors = FOREACH filtered_tokenized GENERATE doc_id,com.mozilla.grouperfish.pig.eval.text.TermFrequency(token_bag) AS tf_bag;
/* Put things back into document vector form before storing in Mahout's vector format */
feature_vectors = FOREACH doc_vectors GENERATE (chararray)doc_id,com.mozilla.pig.eval.ml.TFVectorizer('$FEATUREINDEX', tf_bag) AS vec;
-STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.pig.storage.DocumentVectorStorage('$NFEATURES');
+STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.grouperfish.pig.storage.DocumentVectorStorage('$NFEATURES');
/* Run Mahout's Clustering on this output */
/*
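One detail worth flagging in generate_tf_document_vectors.pig: DocumentVectorStorage is driven by '$NFEATURES', but no %default for it appears in the script, so the value has to be supplied at launch via parameter substitution. A sketch of what a script-level default could look like (the value 1000 below is a made-up placeholder, not from this commit):

/* Illustration only: give $NFEATURES a default so the STORE line resolves;
   1000 is a placeholder value, not part of this commit. */
%default NFEATURES '1000'
STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.grouperfish.pig.storage.DocumentVectorStorage('$NFEATURES');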
src/main/pig/generate_tfidf_document_vectors.pig (19 lines changed)
@@ -7,19 +7,20 @@ register './lib/mahout-utils-0.5.jar'
register './lib/mahout-collections-1.0.jar'
SET default_parallel 7;
-SET pig.splitCombination 'false';
-%default INPUT 'opinions.tsv'
+%default INPUT 'hbase://grouperfish'
%default STOPWORDS 'stopwords-en.txt'
%default STEM 'true'
%default FEATUREINDEX 'feature-index'
%default OUTPUT 'document-vectors-tfidf'
-raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray);
-filtered_raw = FILTER raw BY locale == 'en-US' AND praise_issue == 'issue' AND version == '5.0';
-group_filtered = GROUP filtered_raw all;
-ndocs = FOREACH group_filtered GENERATE COUNT(filtered_raw);
-tokenized = FOREACH filtered_raw GENERATE doc_id,FLATTEN(com.mozilla.pig.eval.text.Tokenize(text,'$STOPWORDS', '$STEM')) AS token:chararray;
+raw = LOAD 'hbase://grouperfish' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:json') AS json:chararray;
+genmap = FOREACH raw GENERATE com.mozilla.pig.eval.json.JsonMap(json) AS json_map:map[];
+/*raw = LOAD '$INPUT' USING PigStorage('\t') AS (doc_id:int,datetime:long,praise_issue:chararray,product:chararray,version:chararray,os:chararray,locale:chararray,text:chararray);*/
+
+grouped_raw = GROUP raw all;
+ndocs = FOREACH grouped_raw GENERATE COUNT(raw);
+tokenized = FOREACH genmap GENERATE doc_id,FLATTEN(com.mozilla.grouperfish.pig.eval.text.Tokenize(json_map#'text','$STOPWORDS', '$STEM')) AS token:chararray;
/* Comment out the line above and uncomment the line below if you are using an ngram feature-index */
/*tokenized = FOREACH filtered_raw GENERATE doc_id,FLATTEN(com.mozilla.pig.eval.text.NGramTokenize(text,'$STOPWORDS', '$STEM', 'true')) AS token:chararray;*/
@@ -58,9 +59,9 @@ tfidf_all = FOREACH token_usages {
/* Put things back into document vector form before storing in Mahout's vector format */
doc_vectors = GROUP tfidf_all BY doc_id;
-feature_vectors = FOREACH doc_vectors GENERATE (chararray)group AS doc_id,com.mozilla.pig.eval.ml.TFIDFVectorizer('$FEATUREINDEX', $1) AS vec;
+feature_vectors = FOREACH doc_vectors GENERATE (chararray)group AS doc_id,com.mozilla.grouperfish.pig.eval.ml.TFIDFVectorizer('$FEATUREINDEX', $1) AS vec;
-STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.pig.storage.DocumentVectorStorage('$NFEATURES');
+STORE feature_vectors INTO '$OUTPUT' USING com.mozilla.grouperfish.pig.storage.DocumentVectorStorage('$NFEATURES');
/* Run Mahout's Clustering on this output */
/*
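The tfidf_all statement whose body is cut off by the hunk above is where the TF-IDF weights are computed. For reference only, a generic Pig sketch of that weighting (tf * log(ndocs / df)) is shown below; the field names term_freq and doc_freq are assumptions, this is not the body the script actually uses, and LOG is a builtin only in Pig 0.9+ (on older Pig it comes from piggybank):

/* Generic TF-IDF sketch, not the script's actual nested FOREACH.
   Assumes token_usages carries doc_id, token, term_freq, doc_freq. */
tfidf_all = FOREACH token_usages GENERATE
    doc_id,
    token,
    (double)term_freq * LOG((double)ndocs.$0 / (double)doc_freq) AS tfidf;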
