Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 47 lines (42 sloc) 1.5 KB
# divide data into 10 buckets
import random
def buckets(filename, bucketName, separator, classColumn):
"""the original data is in the file named filename
bucketName is the prefix for all the bucket names
separator is the character that divides the columns
(for ex., a tab or comma and classColumn is the column
that indicates the class"""
# put the data in 10 buckets
numberOfBuckets = 10
data = {}
# first read in the data and divide by category
with open(filename) as f:
lines = f.readlines()
for line in lines:
if separator != '\t':
line = line.replace(separator, '\t')
# first get the category
category = line.split()[classColumn]
data.setdefault(category, [])
data[category].append(line)
# initialize the buckets
buckets = []
for i in range(numberOfBuckets):
buckets.append([])
# now for each category put the data into the buckets
for k in data.keys():
#randomize order of instances for each class
random.shuffle(data[k])
bNum = 0
# divide into buckets
for item in data[k]:
buckets[bNum].append(item)
bNum = (bNum + 1) % numberOfBuckets
# write to file
for bNum in range(numberOfBuckets):
f = open("%s-%02i" % (bucketName, bNum + 1), 'w')
for item in buckets[bNum]:
f.write(item)
f.close()
# example of how to use this code
buckets("pimaSmall.txt", 'pimaSmall',',',8)