Browse files

cluster_documents.pig now actually works, clusterer.sh runs

  • Loading branch information...
1 parent b690699, commit e3fdfb33f9b9ca102050b168abd5b97505186315; @thedatachef committed Apr 27, 2011
Showing with 2 additions and 7 deletions.
  1. +1 −1 scripts/document_clustering/cluster_documents.pig
  2. +1 −6 scripts/document_clustering/clusterer.sh
View
2 scripts/document_clustering/cluster_documents.pig
@@ -61,7 +61,7 @@ cut_clusters = FOREACH clusters GENERATE group AS center_id, cut_nearest.vect
-- centers in the next iteration.
--
centroids = FOREACH cut_clusters GENERATE
- group AS center_id,
+ center_id AS center_id,
varaha.text.TermVectorCentroid(vector_collection)
;
View
7 scripts/document_clustering/clusterer.sh
@@ -2,11 +2,6 @@
work_dir=$1 ; shift
-if [ "x$VARAHA_HOME" == "x" ]; then
- echo "Error: VARAHA_HOME is not set."
- exit 1
-fi
-
if [ "$work_dir" == '' ] ; then echo "Please specify the directory containing the K initial centers and tfidf vectors: $0 work_dir [number_of_iterations] [start_iteration]" ; exit ; fi
# How many rounds to run: default 10
@@ -25,5 +20,5 @@ for (( iter=0 ; "$iter" < "$n_iters" ; iter++ )) ; do
echo -e "\n****************************\n"
echo -e "Iteration $(( $iter + 1 )) / $n_iters:\t `basename $curr_iter_file` => `basename $next_iter_file`"
echo -e "\n****************************"
- $VARAHA_HOME/bin/varaha -p TFIDF=$tfidf -p CURR_CENTERS=$curr_iter_file -p NEXT_CENTERS=$next_iter_file $script_dir/cluster_documents.pig
+ pig -p TFIDF=$tfidf -p CURR_CENTERS=$curr_iter_file -p NEXT_CENTERS=$next_iter_file $script_dir/cluster_documents.pig
done

0 comments on commit e3fdfb3

Please sign in to comment.