Commit 4374d21
initial commit
zygmuntz committed Aug 22, 2013
0 parents
Showing 17 changed files with 888 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .gitattributes
@@ -0,0 +1,22 @@
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp
*.sln merge=union
*.csproj merge=union
*.vbproj merge=union
*.fsproj merge=union
*.dbproj merge=union

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
215 changes: 215 additions & 0 deletions .gitignore
@@ -0,0 +1,215 @@
#################
## Eclipse
#################

*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath


#################
## Visual Studio
#################

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

# User-specific files
*.suo
*.user
*.sln.docstates

# Build results

[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile

# Visual Studio profiler
*.psess
*.vsp
*.vspx

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# NCrunch
*.ncrunch*
.*crunch*.local.xml

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.Publish.xml
*.pubxml

# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/

# Windows Azure Build Output
csx
*.build.csdef

# Windows Store app package directory
AppPackages/

# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
App_Data/*.mdf
App_Data/*.ldf

#############
## Windows detritus
#############

# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Mac crap
.DS_Store


#############
## Python
#############

*.py[co]

# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

#Translations
*.mo

#Mr Developer
.mr.developer.cfg
92 changes: 92 additions & 0 deletions README.md
@@ -0,0 +1,92 @@
phraug2
=======

A new version of [phraug](https://github.com/zygmuntz/phraug) with improved command-line argument parsing, thanks to jofusa.


A set of simple Python scripts for pre-processing large files: things like splitting and format conversion. The name _phraug_ comes from a great book, _Made to Stick_, by Chip and Dan Heath.

See [http://fastml.com/processing-large-files-line-by-line/](http://fastml.com/processing-large-files-line-by-line/) for the basic idea.

There's always at least one input file and usually one or more output files. An input file always stays unchanged.

TODO: documentation

<!--
Format conversion
-----------------
`csv2libsvm.py <input file> <output file> [<label index = 0>] [<skip headers = 0>]`
Convert CSV to LIBSVM format. If there are no labels in the input file, specify _label index_ = -1. If there are headers in the input file, specify _skip headers_ = 1.
`csv2vw.py <input file> <output file> [<label index = 0>] [<skip headers = 0>]`
Convert CSV to VW format. Arguments as above.
`libsvm2csv.py <input file> <output file> <input file dimensionality>`
Convert LIBSVM to CSV. You need to specify the dimensionality, that is, the number of columns (not counting the label).
`libsvm2vw.py <input file> <output file>`
Convert LIBSVM to VW.
`tsv2csv.py <input file> <output file>`
Convert tab-separated file to comma-separated file.
Column means, standard deviations and normalization
--------------------------------------------------
How do you normalize (or _standardize_ or _shift and scale_) your data if it doesn't fit into memory? With these two scripts.
`colstats.py <input file> <output file> [<label index>]`
Compute column means and standard deviations from data in csv file. Can skip label if present. Numbers only. The first line of the output file contains means, the second one standard deviations.
This script uses the f_is_headers module, which contains the is_headers() function. Its purpose is to automatically detect whether the first line of the file contains headers.
`normalize.py <stats file> <input file> <output file> [<label index>]`
Normalize (shift and scale to zero mean and unit standard deviation) data from csv file. Meant to be used with column stats file produced by colstats.py. Numbers only.
Other operations
----------------
`chunk.py <input file> <number of output files> [<random seed>]`
Split a file randomly, line by line, into a number of smaller files. Might be useful for preparing cross-validation. Output files will have the base name suffixed with a chunk number, for example `data.csv` will be chunked into `data_0.csv`, `data_1.csv` etc.
`count.py <input file>`
Count lines in a file. On Unix you can do it with `wc -l`.
`delete_cols.py <input file> <output_file> <indices of columns to delete>`
`delete_cols.py train.csv train_del.csv 0 2 3`
Delete some columns from a CSV file. Indices start at 0. Separate them with whitespace.
`sample.py <input file> <output file> [<P = 0.5>]`
Sample lines from an input file with probability P. Similar to `split.py`, but there's only one output file. Useful for sampling large datasets.
`split.py <input file> <output file 1> <output file 2> [<P = 0.9>] [<random seed>]`
Split a file into two randomly. Default P (probability of writing to the first file) is 0.9. You can specify any string as a seed for the random number generator.
`subset.py <input file> <output file> [<offset = 0>] [<lines = 100>]`
Save a subset of lines from an input file to an output file. Start at _offset_ (default 0), save _lines_ (default 100).
-->
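The CSV-to-LIBSVM conversion described in the commented-out docs above can be sketched as a per-line transform. This is a minimal illustration, not the actual `csv2libsvm.py` implementation; the function name and the choice to keep zero-valued features are assumptions:

```python
def csv_line_to_libsvm(line, label_index=0):
    # Convert one CSV line to LIBSVM format: "<label> <i>:<value> ...".
    # Feature indices are 1-based; label_index = -1 means no label column,
    # in which case a placeholder label is emitted (assumption for this sketch).
    values = line.strip().split(',')
    if label_index >= 0:
        label = values.pop(label_index)
    else:
        label = '1'
    features = ['%s:%s' % (i + 1, v) for i, v in enumerate(values)]
    return label + ' ' + ' '.join(features)
```

For example, the row `1,0.5,2.0` with the label in column 0 becomes `1 1:0.5 2:2.0`.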


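The column-statistics step that `colstats.py` performs (per the docs above) can be done in a single streaming pass by accumulating sums and sums of squares, so the data never has to fit into memory. A minimal sketch, assuming purely numeric rows and population standard deviation; the real script may differ in both respects:

```python
import math

def column_stats(rows):
    # One-pass column means and (population) standard deviations.
    # rows: iterable of lists of floats, all the same length.
    n, sums, sq_sums = 0, None, None
    for row in rows:
        if sums is None:
            sums = [0.0] * len(row)
            sq_sums = [0.0] * len(row)
        n += 1
        for j, x in enumerate(row):
            sums[j] += x
            sq_sums[j] += x * x
    means = [s / n for s in sums]
    # var = E[x^2] - (E[x])^2
    stds = [math.sqrt(sq / n - m * m) for sq, m in zip(sq_sums, means)]
    return means, stds
```

`normalize.py` would then apply `(x - mean) / std` per column using the two output lines (means, then standard deviations).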
47 changes: 47 additions & 0 deletions chunk.py
@@ -0,0 +1,47 @@
'''
Split a file into a given number of chunks randomly, line by line.
Usage: chunk.py <input file> <number of chunks> [<seed>]
'''

import sys, random, os

input_file = sys.argv[1]
num_chunks = int( sys.argv[2] )

try:
	seed = sys.argv[3]
except IndexError:
	seed = None

if seed:
	print "seeding: %s" % ( seed )
	random.seed( seed )

basename = os.path.basename( input_file )
basename, ext = os.path.splitext( basename )

input_stream = open( input_file )

# open one output file per chunk; use a name that doesn't shadow the os module
output_files = {}
for n in range( num_chunks ):
	output_file = "%s_%s%s" % ( basename, n, ext )
	output_files[n] = open( output_file, 'wb' )

counter = 0

# assign each line to a uniformly random chunk
for line in input_stream:
	n = random.randint( 0, num_chunks - 1 )
	output_files[n].write( line )

	counter += 1
	if counter % 100000 == 0:
		print counter

input_stream.close()
for f in output_files.values():
	f.close()
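The output naming scheme used by chunk.py can be isolated into a small helper; the function name here is illustrative, not part of the script:

```python
import os

def chunk_filename(input_file, n):
    # Mirror chunk.py's naming: data.csv -> data_0.csv, data_1.csv, ...
    basename, ext = os.path.splitext(os.path.basename(input_file))
    return "%s_%s%s" % (basename, n, ext)
```

Note that because `os.path.basename` is applied first, chunks are written to the current working directory regardless of where the input file lives.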