Added note to README and smaller changes

commit d430956df2533bcfce19570c1d24e17b0f69ad3b (1 parent: 0ee95f6), authored by Gabor Szabo
4 README.md
@@ -1,3 +1,7 @@
+**NOTE AS OF 4/7/2012: We are in the process of making significant changes to
+the code base, revamping the documentation, and adding a tutorial. Please check
+back in a few days for these as well.**
+
PyCascading
===========
6 examples/cache.py
@@ -59,7 +59,11 @@ def main():
# Select the lines beginning with 'A', and save this intermediate result
# in the cache so that we can call the script several times with
# different separator characters
- p = flow.cache('line_begins') | (input | find_lines_with_beginning('A'))
+ p = input | find_lines_with_beginning('A')
+ # Checkpoint the results from 'p' into a cache folder named 'line_begins'
+ # The caches are in the user's HDFS folder, under pycascading.cache/
+ p = flow.cache('line_begins') | p
+ # Everything goes to one reducer
p | GroupBy(Fields.VALUES) | concat_all(sys.argv[1]) | output
flow.run()
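The revised example separates the expensive computation from the checkpointing step. Below is a minimal sketch of the same pattern in isolation, reusing find_lines_with_beginning from the example file (its definition is not shown in this hunk); the checkpoint name 'my_cache' and the input path are hypothetical, and the sketch assumes flow.cache() behaves as the comments above describe: the first run materializes the intermediate result, and later runs read it back instead of recomputing.

from pycascading.helpers import *

def main():
    flow = Flow()
    input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt'))
    output = flow.tsv_sink('pycascading_data/out')

    # Build the expensive part of the pipeline as usual
    p = input | find_lines_with_beginning('A')
    # Wrap it in a checkpoint: the first run writes the intermediate result
    # under pycascading.cache/my_cache in the user's HDFS folder; subsequent
    # runs read it back instead of recomputing
    p = flow.cache('my_cache') | p
    p | output
    flow.run()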
34 examples/groups.py
@@ -1,34 +0,0 @@
-#
-# Copyright 2011 Twitter, Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-
-The data is expected in the pycascading_data/ folder if run in local mode,
-and in the pycascading_data/ folder in the user's HDFS home if run with Hadoop.
-"""
-
-from pycascading.helpers import *
-
-
-def main():
- flow = Flow()
- repeats = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ',
- [String, Integer]),
- 'pycascading_data/repeats.txt'))
- output = flow.tsv_sink('pycascading_data/out')
-
- repeats | GroupBy('col1') | Every(aggregator=Count(), output_selector=Fields.) | output
-
- flow.run()
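Note that the deleted example's Every call is cut off at output_selector=Fields., so the file would not have run as-is, which may be why it was removed. For reference, a working version of the same grouping pipeline might look like the sketch below; Fields.ALL is an assumption, since the deleted line does not show which selector was intended.

from pycascading.helpers import *

def main():
    flow = Flow()
    repeats = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ',
                                            [String, Integer]),
                              'pycascading_data/repeats.txt'))
    output = flow.tsv_sink('pycascading_data/out')

    # Count the occurrences of each col1 value; Fields.ALL is assumed,
    # as the original output_selector was truncated
    repeats | GroupBy('col1') | Every(aggregator=Count(),
                                      output_selector=Fields.ALL) | output
    flow.run()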
6 examples/reduce.py
@@ -5,7 +5,7 @@
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -53,8 +53,8 @@ def main():
flow = Flow()
input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt'))
output = flow.tsv_sink('pycascading_data/out')
-
+
p = input | starts_with_letter('A') | word_count
p | GroupBy('word_count') | count | output
-
+
flow.run()
1 examples/udf_contexts.py
@@ -40,6 +40,7 @@ def main():
input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt'))
output = flow.tsv_sink('pycascading_data/out')
+ # Retain only lines that start with an 'A' or 'T'
input | starts_with_letters(set(['A', 'T'])) | SelectFields('line') | output
flow.run()
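The filter above is parameterized by a set of letters. A minimal sketch of how starts_with_letters might be defined follows; it assumes PyCascading's @udf_filter decorator, and that arguments bound at pipe-construction time (here the set of letters) are passed to the UDF after the current tuple.

from pycascading.helpers import *

@udf_filter
def starts_with_letters(tuple, letters):
    # Keep the line if its first character is in the given set;
    # 'letters' is the set bound when the pipe was built
    try:
        return tuple.get('line')[0].upper() in letters
    except Exception:
        return False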
6 python/pycascading/bootstrap.py
@@ -37,6 +37,8 @@
# mode, and the PyCascading tarball in Hadoop mode
python_dir = sys.argv[2]
+ # Remove the first two arguments so that sys.argv looks as if it
+ # came from a simple command-line invocation
# The further parameters are the command line parameters to the script
sys.argv = sys.argv[3:]
@@ -83,10 +85,6 @@
pycascading.pipe.config['pycascading.running_mode'] = running_mode
pycascading.pipe.config['pycascading.main_file'] = args[0]
- # Remove the running mode argument so that sys.argv will look like as
- # if it was coming from a simple command line execution
- sys.argv = args[2:]
-
_main_module_ = imp.load_source('__main__', \
pycascading.pipe.config['pycascading.main_file'])
_main_module_.main()
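The moved comment describes the argv-rewriting trick: the bootstrap strips its own bookkeeping arguments before loading the user script, so the script sees the same sys.argv it would get from a direct invocation. A standalone sketch of the idea, independent of PyCascading's internals (it assumes the loaded script defines a main() function, as the examples in this repository do):

import imp
import sys

# Suppose the wrapper is invoked as:
#   bootstrap.py <running_mode> <python_dir> <script.py> [script args...]
running_mode = sys.argv[1]
python_dir = sys.argv[2]

# Drop the wrapper's own arguments: the loaded script now sees
# sys.argv == ['<script.py>', 'script args...'], as if run directly
sys.argv = sys.argv[3:]

# Load and run the user script as if it were __main__
_main_module_ = imp.load_source('__main__', sys.argv[0])
_main_module_.main()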