In [7]:
from pyspark import SparkContext
# Reuse the SparkContext that is already running in this kernel, or create one if none exists.
sc = SparkContext.getOrCreate()

In [8]:
import csv

try:
    import StringIO  # Python 2
except ImportError:
    # Python 3 has no StringIO module; io provides an equivalent StringIO class,
    # so alias the module to keep `StringIO.StringIO(...)` working unchanged.
    import io as StringIO

In [9]:
# Load the supervisor CSV as an RDD of raw text lines.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the notebook.
input = sc.textFile("../Data/supervisor_sf.csv")

In [71]:
# csv.DictReader : https://docs.python.org/2/library/csv.html
# StringIO       : https://docs.python.org/2/library/stringio.html
# DictReader() reads from a file-like object, so each line (a plain string) is first
# wrapped in a StringIO.StringIO string buffer.
# If the "fieldnames" parameter is omitted, the values in the first row are used as the
# field names; they are passed explicitly here because every line is parsed on its own.
def csvLoader(line):
    """Parse one CSV-formatted line into a dict keyed by "Zip" and "Supervisor".

    Args:
        line: a single line of CSV text, e.g. "94102,8".

    Returns:
        A dict mapping "Zip" and "Supervisor" to the corresponding string values.
        Fields missing from a short line are set to None (DictReader's default
        ``restval``); a completely blank line therefore yields a dict whose values
        are all None. Extra fields on an over-long line are collected in a list
        under the key None (DictReader's default ``restkey``).
    """
    fieldnames = ["Zip", "Supervisor"]
    line_buffer = StringIO.StringIO(line)  # file-like wrapper around the string
    reader = csv.DictReader(line_buffer, fieldnames=fieldnames)
    # The built-in next() works on both Python 2 and 3 (reader.next() is Python 2 only).
    # The default covers blank lines: without it StopIteration would escape from the
    # function, and inside RDD.map() that aborts the job or may be mistaken for the
    # normal end of iteration.
    return next(reader, dict.fromkeys(fieldnames))

In [73]:
# Parse every raw text line into a dict via csvLoader.
csv_data = input.map(csvLoader)

In [76]:
# Preview the first five parsed rows.
csv_data.take(5)

[{'Supervisor': '8', 'Zip': '94102'},
 {'Supervisor': '6', 'Zip': '94102'},
 {'Supervisor': '3', 'Zip': '94102'},
 {'Supervisor': '5', 'Zip': '94102'},
 {'Supervisor': '8', 'Zip': '94103'}]

In [75]:
# Keep only the rows whose Zip value begins with "94".
sf_supervisor = csv_data.filter(lambda row: row['Zip'].startswith('94'))

In [None]:
# Return every filtered row to the driver as a Python list.
# NOTE(review): collect() materialises the whole RDD on the driver; prefer take(n) if the data can be large.
sf_supervisor.collect()

In [124]:
# csvWriter : Save in a format of "Supervisor,Zip".
# csv.DictWriter : https://docs.python.org/2/library/csv.html#csv.DictWriter
# Values in the dictionary passed to .writerow() are written to "output".
def csvWriter(rdd):
    """Serialise one row dict as a single "Supervisor,Zip" line of CSV text.

    Args:
        rdd: despite its name, a single row dict (one element of the RDD, as
            produced by csvLoader), not an RDD. The name is kept so existing
            callers are unaffected.

    Returns:
        The CSV text for the row without the trailing line terminator,
        e.g. "8,94102". Keys missing from the dict are written as empty fields.

    Raises:
        ValueError: if the dict contains keys other than "Supervisor" and "Zip"
            (DictWriter's default ``extrasaction='raise'``).
    """
    output = StringIO.StringIO()  # start as an empty string buffer
    writer = csv.DictWriter(output, fieldnames=["Supervisor", "Zip"])
    writer.writerow(rdd)
    # Remove only the "\r\n" terminator appended by the writer. A bare strip() would
    # also eat leading/trailing whitespace that belongs to the field values.
    return output.getvalue().rstrip("\r\n")

In [126]:
# Preview the first five rows after serialising them back to CSV text.
csv_data.map(csvWriter).take(5)

['8,94102', '6,94102', '3,94102', '5,94102', '8,94103']

In [118]:
# Serialise every row to CSV text and save the result under the "CSV_folder" path.
# NOTE(review): saveAsTextFile presumably fails if "CSV_folder" already exists, so re-running this cell needs the folder removed first — confirm.
csv_data.map(csvWriter).saveAsTextFile("CSV_folder")