In [1]:
# Environment setup: point PySpark at a Java 8 installation and obtain a SparkContext.
import os

import pyspark
from pyspark import SparkContext

# JAVA_HOME is exported before the context is requested so that the JVM started
# for Spark is the Java 8 one.
java8_location = '/usr/lib/jvm/java-8-openjdk-amd64'  # Set your own
os.environ['JAVA_HOME'] = java8_location

# getOrCreate() reuses the already-running context when this cell is executed again;
# a bare SparkContext() raises ValueError if a context is already active in the kernel.
sc = SparkContext.getOrCreate()

# 41

In [8]:
sc.textFile('Working Files/07/sample_inputs').collect()
# textFile returns an RDD in which each element is one line of text;
# when the path is a directory, the lines of all files in it end up in the same RDD.
# collect() then brings those lines back to the driver as a plain Python list.

['This is some test data.',
 'That takes more than one line.',
 'This is a second test file.',
 'Is it also',
 'a multiline file.']

In [9]:
sc.wholeTextFiles('Working Files/07/sample_inputs').collect()
# wholeTextFiles yields one (path, content) pair per file: the first element of each
# tuple is the file's full URI and the second is the entire content of that file.

[('file:/home/victor/Desktop/Spark/Working Files/07/sample_inputs/test.txt',
  'This is some test data.\nThat takes more than one line.\n'),
 ('file:/home/victor/Desktop/Spark/Working Files/07/sample_inputs/test2.txt',
  'This is a second test file.\nIs it also\na multiline file.\n')]

wholeTextFiles works best with small files;

textFile allows better parallelization (as we can use more partitions);

the optional minPartitions argument can be specified;

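the optional use_unicode argument can be specified; setting it to False can make processing slightly faster, because the strings are then kept as UTF-8 encoded byte strings instead of being decoded (but you should be sure that your documents do not contain non-ASCII characters)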

# 42

In [11]:
import shutil

# Pickle round-trip demo: store an RDD of mixed Python objects (str, int, dict) on disk.
pickled_path = 'Working Files/07/pickled_file'

# Spark refuses to write into an output directory that already exists, so remove any
# output left by a previous run; this keeps the cell re-runnable.
# NOTE(review): assumes the path is on the local filesystem — confirm for your setup.
shutil.rmtree(pickled_path, ignore_errors=True)

sc.parallelize(['a', 1, {'some_key': 'some_value'}]).saveAsPickleFile(pickled_path)

In [13]:
sc.pickleFile('Working Files/07/pickled_file').collect()
# The elements come back as the original Python objects (a str, an int and a dict),
# not as strings or JSON text, so they do not require any additional parsing.

['a', 1, {'some_key': 'some_value'}]

In [15]:
# Even classes can be pickled, but only those defined at the top level of their module.
# Nested or dynamically created classes cannot be pickled.

The more data we serialize, the longer it takes to save and read.

In [16]:
# Thrift can be used for better performance, but it requires specifying a schema beforehand.
# Thrift-serialized data can be read from different languages.

# 43

org.apache.hadoop.mapred -> old API

org.apache.hadoop.mapreduce -> new API

if old API:

- sc.hadoopFile

- sc.hadoopRDD

if new API:

- sc.newAPIHadoopFile

- sc.newAPIHadoopRDD

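The hadoopFile / newAPIHadoopFile methods are used when we provide a file path to open; the hadoopRDD / newAPIHadoopRDD methods instead take the input description as a configuration dict (conf).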

In [18]:
# Read a text file through the new Hadoop API (org.apache.hadoop.mapreduce).
# The four leading arguments are, in order: the path, the InputFormat class,
# the key class and the value class.
rdd = sc.newAPIHadoopFile(
    'Working Files/07/test.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    keyConverter=None,
    valueConverter=None,
    conf={},
    # batchSize -> how many Python objects should be represented as a single Java object;
    # 0 -> the decision will be made automatically
    batchSize=0,
)

In [19]:
# Each element is a (key, value) pair; judging by the output, the key is the offset at
# which the line starts in the file and the value is the text of that line.
rdd.collect()

[(0, 'This is some test data.'), (24, 'That takes more than one line.')]

# 44

Old API -> org.apache.hadoop.mapred

- saveAsHadoopDataset
- saveAsHadoopFile

New API -> org.apache.hadoop.mapreduce

- saveAsNewAPIHadoopDataset
- saveAsNewAPIHadoopFile

In [None]:
# Signature sketch for the old-API dataset writer: conf, keyConverter and valueConverter
# are placeholder names that are not defined in the cells shown here, so substitute
# real values before running this cell.
rdd.saveAsHadoopDataset(conf, keyConverter, valueConverter)

In [None]:
# Signature sketch for the old-API file writer: every argument below is a placeholder
# name (none is defined in the cells shown here), to be replaced with a real value.
rdd.saveAsHadoopFile(path,                   # output location
                     outputFormatClass,      # presumably a fully qualified Hadoop OutputFormat class name — confirm in the API docs
                     keyClass,               # presumably the Writable class of the keys — confirm
                     valueClass,             # presumably the Writable class of the values — confirm
                     keyConverter,           # optional converter applied to keys
                     valueConverter,         # optional converter applied to values
                     conf,                   # Hadoop job configuration
                     compressionCodecClass)  # presumably a fully qualified compression codec class name — confirm