# Initialize SparkContext

In [1]:
%%time
from pyspark import SparkContext

sc = SparkContext()

CPU times: user 180 ms, sys: 24 ms, total: 204 ms
Wall time: 14.4 s


In [6]:
conf=sc.getConf()
# Conf.getAll()
conf.get("spark.executor.instances")
conf.get("spark.executor.cores")

u'1'

# Local files

### Download and Uplaod

Download and upload files via the Spark Notebook interface.

### Access Local Files

The file path to local files requires `file://` prefix.

In [7]:
ls /etc/passwd

/etc/passwd


# s3helper

The object `s3helper` is a tool to transfer files between local filesystem, HDFS and S3.

Run `s3helper.help()` to learn all its methods.

In [8]:
# coding: utf-8
from boto.exception import S3ResponseError
from boto.s3.connection import S3Connection
import os
import subprocess


def _run_command(command, detail=False):
    proc = subprocess.Popen(command.split(),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if detail:
        while True:
            line = proc.stdout.readline()
            if not line:
                break
            print(line)
    return proc.communicate()


def _list_hdfs(path):
    command = "/usr/bin/hdfs dfs -ls %s" % path
    out, err = _run_command(command)
    if out:
        print(out)


def _s3_to_hdfs(files, tgt):
    out, err1 = _run_command("/usr/bin/hdfs dfs -mkdir -p %s" % tgt)
    if err1:
        print(err1)
        return
    out, err2 = _run_command("/usr/bin/hdfs dfs -cp %s %s" % (files, tgt))
    if err2:
        print(err2)


def _hdfs_to_s3(files, tgt):
    out, err = _run_command("/usr/bin/hadoop distcp %s %s" % (files, tgt), True)
    if err:
        print(err)


def _local_to_hdfs(src, tgt):
    if tgt[0] != '/':
        tgt = '/' + tgt
    out, err1 = _run_command("/usr/bin/hdfs dfs -mkdir -p %s" % tgt)
    if err1:
        print(err1)
        return
    out, err2 = _run_command("/usr/bin/hdfs dfs -cp %s %s" % ("file://" + os.path.join(src, '*'),
                                                              tgt))
    if err2:
        print(err2)


class S3Helper:
    """A helper function to access S3 files"""
    def __init__(self):
        self.conn = None
        self.bucket_name = None
        self.bucket = None

    @staticmethod
    def help():
        print('''
        s3helper is a helper object to move files and directory between
        local filesystem, AWS S3 and local HDFS.

        Usage:

        1. Open a S3 bucket under your account
            s3helper.open_bucket(<bucket_name>)
        2. List all files under the opened S3 bucket
            s3helper.ls() or s3helper.ls_s3()
        Or optionally,
            s3helper.ls(<file_path>) or s3helper.ls_s3(<file_path>)
        3. List all files on HDFS
            s3helper.ls_hdfs()
        Or optionally,
            s3helper.ls_hdfs(<file_path>)
        where <file_path> is an absolute path in the opened S3 bucket.

        Now you can access your S3 files.

        1. Transfer files between S3 and HDFS
          a. To download all S3 files under a directory to HDFS, please call
                s3helper.s3_to_hdfs(<s3_directory_path>, <HDFS_directory_path>)
          b. To upload a directory on HDFS to S3, please call
                s3helper.hdfs_to_s3(<HDFS_directory_path>, <s3_directory_path>)

        2. Transfer files between S3 and local filesystem (not HDFS)
          a. To download one single S3 file to local filesystem, please call
                s3helper.s3_to_local(<s3_file_path>, <local_file_path>)
          b. To upload a file on local filesystem to S3, please call
                s3helper.local_to_s3(<local_file_path>, <s3_directory_path>)

        3. Transfer files between local filesystem and HDFS
          a. To upload a directory on local filesystem to HDFS, please call
                s3helper.local_to_hdfs(<local_dir_path>, <HDFS_dir_path>)

        4. Get S3 file paths without data transfer
          a. To get the URLs of S3 files under a directory, please call
                s3helper.get_path(<s3_directory_path>)
          Note this method do nothing on your local HDFS.
        ''')

    def open_bucket(self, bucket_name, region=None):
        """Open a S3 bucket.

            Args:
                bucket_name
            Returns:
                None
        """
        if bucket_name.startswith('s3n://') or bucket_name.startswith('s3://'):
            raise ValueError('bucket_name must NOT contain any prefix '
                             '(e.g. s3:// or s3n://)')

        while bucket_name[-1] == '/':
            bucket_name = bucket_name[:-1]
        if region is None:
            print("Warning: S3 region is not defined. Default region is set to 'us-east-1'.")
            region = "us-east-1"
        self.bucket_name = bucket_name
        self.conn = S3Connection(host="s3.{}.amazonaws.com".format(region))
        try:
            self.bucket = self.conn.get_bucket(self.bucket_name)
        except S3ResponseError as e:
            self.bucket = None
            self.bucket_name = None
            print('Open S3 bucket "%s" failed.\n' % bucket_name + str(e))
            print(e.message)

    def ls(self, path=''):
        """same as ls_s3"""
        return self.ls_s3(path)

    def ls_s3(self, path=''):
        """List all files in `path` on S3.

            Args:
                path
            Returns:
                an array of files in `path`
        """
        if not self.bucket:
            raise Exception('No bucket is opened. '
                            'Please use open_bucket method first.')

        path = path.strip()
        if len(path) and path[0] == '/':
            path = path[1:]
        files = self.bucket.list(prefix=path)

        if path == '':
            k = 1
        else:
            k = len(path.split('/')) + 1
        return sorted(list(set(
            ['/'.join(t.key.split('/')[:k]) for t in files])))

    @staticmethod
    def ls_hdfs(path='/'):
        """List all files in `path` on HDFS."""
        if not path or path[0] != '/':
            path = '/' + path
        return _list_hdfs(path)

    def get_path(self, path=''):
        """Get paths of all files in `path` with s3 prefix,
           which can be passed to Spark.

            Args:
                path
            Returns:
                an array of file paths with s3 prefix
        """
        if not self.bucket:
            raise Exception('no bucket is opened.')

        path = path.strip()
        if len(path) and path[0] == '/':
            path = path[1:]
        files = self.bucket.list(prefix=path)
        prefix = "s3n://%s/" % self.bucket_name
        return [prefix + t.key for t in files]

    def s3_to_hdfs(self, src, tgt):
        """Load all files in `src` to the directory `tgt` in HDFS.

            Args:
                src, tgt
            Returns:
                an array of file paths in HDFS
        """
        if not self.bucket:
            raise Exception('no bucket is opened.')

        src, tgt = src.strip(), tgt.strip()
        if len(src) and src[0] == '/':
            src = src[1:]
        if tgt == '' or (tgt[0] != '/' and not tgt.startswith('hdfs://')):
            tgt = '/' + tgt
        if tgt[-1] != '/':
            tgt = tgt + '/'
        files = self.bucket.list(prefix=src)
        prefix = "s3n://%s/" % self.bucket_name
        _s3_to_hdfs(' '.join([prefix + t.key for t in files]), tgt)
        self.ls_hdfs(tgt)

    def hdfs_to_s3(self, src, tgt):
        """Upload a directory `src` on HDFS to a directory `tgt` on S3.

           Args:
                src, tgt
           Returns:
                file list of the `tgt` directory on S3 after uploading
        """
        if not self.bucket:
            raise Exception('no bucket is opened. '
                            'See help() method for more info')

        src, tgt = src.strip(), tgt.strip()
        if src == '' or (src[0] != '/' and not src.startswith('hdfs://')):
            src = '/' + src
        if src[-1] != '/':
            src = src + '/'
        if len(tgt) and tgt[0] == '/':
            tgt = tgt[1:]
        tgt = "s3n://%s/" % self.bucket_name + tgt
        print("*NOTE*\n"
              "This method will create a MapReudce job to upload the content "
              "in HDFS to S3. The process may take a while.\n\n")
        _hdfs_to_s3(src, tgt)

    def local_to_s3(self, filename, tgt):
        """Save a local file `filename` to the directory `tgt` on S3.

            Args:
                filename, tgt
            Returns:
                None
        """
        if not self.bucket:
            raise Exception('no bucket is opened.')
        if not os.path.exists(filename):
            raise Exception("File does not exist.")
        if os.path.isdir(filename):
            raise Exception(
                "Transfer between S3 and local filesystem "
                "does not support directory.")

        tgt = tgt.strip()
        if len(tgt) and tgt[0] == '/':
            tgt = tgt[1:]
        if not tgt:
            tgt = filename.rsplit('/', 1)[-1]
        k = self.bucket.new_key(tgt)
        k.set_contents_from_filename(filename)

    def s3_to_local(self, src, tgt):
        """Download the remote file `key_name` on S3 to local.

            Args:
                src, tgt
            Returns:
                None
        """
        if not self.bucket:
            raise Exception('no bucket is opened.')

        key = src.strip()
        if key[0] == '/':
            key = key[1:]
        k = self.bucket.get_key(key)
        if not k:
            raise Exception(
                "File " + src + " doesn't exist.\n"
                "Note that the transfer between S3 and local filesystem "
                "do not support directory.")
        k.get_contents_to_filename(tgt)

    @staticmethod
    def local_to_hdfs(src, tgt):
        """Upload local directory to HDFS.

           Args:
               src - path to the local directory,
               tgt - path to the HDFS directory
           Returns:
               None
        """
        if src[0] != '/' or tgt[0] != '/':
            raise Exception("The directory path cannot be an relative path.")
        _local_to_hdfs(src, tgt)


s3helper = S3Helper()

In [9]:
s3helper.help()


        s3helper is a helper object to move files and directory between
        local filesystem, AWS S3 and local HDFS.

        Usage:

        1. Open a S3 bucket under your account
            s3helper.open_bucket(<bucket_name>)
        2. List all files under the opened S3 bucket
            s3helper.ls() or s3helper.ls_s3()
        Or optionally,
            s3helper.ls(<file_path>) or s3helper.ls_s3(<file_path>)
        3. List all files on HDFS
            s3helper.ls_hdfs()
        Or optionally,
            s3helper.ls_hdfs(<file_path>)
        where <file_path> is an absolute path in the opened S3 bucket.

        Now you can access your S3 files.

        1. Transfer files between S3 and HDFS
          a. To download all S3 files under a directory to HDFS, please call
                s3helper.s3_to_hdfs(<s3_directory_path>, <HDFS_directory_path>)
          b. To upload a directory on HDFS to S3, please call
                s3helper.hdfs_to_s3(<HDFS_directory_path>, <s3_dir

## (2) Open the bucket that has your files.

In [20]:
s3helper.open_bucket('dse-weather')
s3helper.ls()



[u'256_STAT',
 u'ALL.csv.gz',
 u'ALLBootstrap.sh',
 u'MasterBootstrap.sh',
 u'NY.parquet',
 u'PrivateBootstrap.sh',
 u'RunFromTerminal.sh',
 u'US_Weather_with_smoothed.parquet',
 u'US_Weather_with_smoothed.parquet_$folder$',
 u'US_stations.parquet',
 u'US_weather.parquet',
 u'info',
 u's3hook.sh',
 u'weather.parquet']

In [23]:
s3helper.open_bucket('dse-weather-west-2', region="us-west-2")
s3helper.ls('info/')

[u'info/US_stations.tsv.gz',
 u'info/all_stations.tsv.gz',
 u'info/stations.parquet/_SUCCESS',
 u'info/stations.parquet/_common_metadata',
 u'info/stations.parquet/_metadata',
 u'info/stations.parquet/part-r-00000-1fd04699-91d3-4a2a-9b36-e25c9c5f0376.gz.parquet',
 u'info/stations.parquet/part-r-00001-1fd04699-91d3-4a2a-9b36-e25c9c5f0376.gz.parquet']

## (3) List files in the S3 bucket and HDFS.

In [12]:
print("\n".join(s3helper.ls_s3()))  # By default, list all files in the root directory of the bucket
print(s3helper.ls_s3('fromHDFS'))

print(s3helper.ls_hdfs())

.aws
.bash_profile
.bashrc
.conda
.ipython
.jupyter
.local
.ssh
ALL.csv.gz
ALLBootstrap.sh
MasterBootstrap.sh
NY.parquet
PrivateBootstrap.sh
RunFromTerminal.sh
US_Weather_with_smoothed.parquet
US_Weather_with_smoothed.parquet_$folder$
US_stations.parquet
US_weather.parquet
fromLocal
info
s3helper.py
s3hook.sh
weather.parquet
[]
Found 3 items
drwxrwxrwt   - hdfs hadoop          0 2020-04-28 00:01 /tmp
drwxr-xr-x   - hdfs hadoop          0 2020-04-27 19:07 /user
drwxr-xr-x   - hdfs hadoop          0 2020-04-27 19:07 /var

None


## (4) Move files around local filesystem, HDFS and S3

As described in `s3helper.help()`, there are five methods for file transfers:

1. `s3helper.s3_to_hdfs(<s3_directory_path>, <HDFS_directory_path>)`
2. `s3helper.hdfs_to_s3(<HDFS_directory_path>, <s3_directory_path>)`
3. `s3helper.s3_to_local(<s3_file_path>, <local_file_path>)`
4. `s3helper.local_to_s3(<local_file_path>, <s3_directory_path>)`
5. `s3helper.local_to_hdfs(<local_dir_path>, <HDFS_dir_path>)`

In [13]:
!ls -l /home/hadoop/s3helper.py

-rw-rw-r-- 1 hadoop hadoop 9409 Apr 27 18:58 /home/hadoop/s3helper.py


In [14]:
s3helper.local_to_s3("/home/hadoop/s3helper.py", "fromLocal/s3helper.py")

In [15]:
print(s3helper.ls_s3("fromLocal"))

[u'fromLocal/s3helper.py']


In [16]:
%%time
s3helper.s3_to_hdfs("weather.parquet", "/tmp/weather.parquet")

20/04/28 00:18:12 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/weather.parquet/_SUCCESS' for reading
cp: `/tmp/weather.parquet/_SUCCESS': File exists
20/04/28 00:18:12 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/weather.parquet/part-00000-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet' for reading
cp: `/tmp/weather.parquet/part-00000-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet': File exists
20/04/28 00:18:12 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/weather.parquet/part-00001-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet' for reading
cp: `/tmp/weather.parquet/part-00001-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet': File exists
20/04/28 00:18:12 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/weather.parquet/part-00002-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet' for reading
cp: `/tmp/weather.parquet/part-00002-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.sna

Found 59 items
-rw-r--r--   2 hadoop hadoop          0 2020-04-28 00:01 /tmp/weather.parquet/_SUCCESS
-rw-r--r--   2 hadoop hadoop   40670401 2020-04-28 00:01 /tmp/weather.parquet/part-00000-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop hadoop   40307528 2020-04-28 00:01 /tmp/weather.parquet/part-00001-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop hadoop   40016618 2020-04-28 00:01 /tmp/weather.parquet/part-00002-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop hadoop   40377232 2020-04-28 00:01 /tmp/weather.parquet/part-00003-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop hadoop   40119938 2020-04-28 00:01 /tmp/weather.parquet/part-00004-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop hadoop   40278884 2020-04-28 00:01 /tmp/weather.parquet/part-00005-6cb19187-62a0-42ad-9516-e03e05ea0c40-c000.snappy.parquet
-rw-r--r--   2 hadoop 

In [18]:
%%bash
mkdir /home/hadoop/fromS3
mkdir /home/hadoop/fromHDFS

In [24]:
%%time 
s3helper.s3_to_hdfs("info/stations.parquet", "/tmp/stations.parquet")

20/04/28 00:24:44 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/info/stations.parquet/_SUCCESS' for reading
20/04/28 00:24:44 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/info/stations.parquet/_common_metadata' for reading
20/04/28 00:24:44 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/info/stations.parquet/_metadata' for reading
20/04/28 00:24:44 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/info/stations.parquet/part-r-00000-1fd04699-91d3-4a2a-9b36-e25c9c5f0376.gz.parquet' for reading
20/04/28 00:24:44 INFO s3n.S3NativeFileSystem: Opening 's3n://dse-weather-west-2/info/stations.parquet/part-r-00001-1fd04699-91d3-4a2a-9b36-e25c9c5f0376.gz.parquet' for reading

Found 5 items
-rw-r--r--   2 hadoop hadoop          0 2020-04-28 00:24 /tmp/stations.parquet/_SUCCESS
-rw-r--r--   2 hadoop hadoop        894 2020-04-28 00:24 /tmp/stations.parquet/_common_metadata
-rw-r--r--   2 hadoop hadoop       3107 2020-04-28 00:24 /tmp/s

In [25]:
print(s3helper.ls_hdfs("/tmp"))

Found 3 items
drwxrwxrwx   - mapred mapred          0 2020-04-27 19:07 /tmp/hadoop-yarn
drwxr-xr-x   - hadoop hadoop          0 2020-04-28 00:24 /tmp/stations.parquet
drwxr-xr-x   - hadoop hadoop          0 2020-04-28 00:02 /tmp/weather.parquet

None


# Parquet Files

To get a reasonable reading speed, please always load parquet files from S3 to HDFS before accessing them.

In [28]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

In [30]:
%%time
df = sqlContext.sql("SELECT * FROM parquet.`/tmp/weather.parquet`")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.4 s


In [33]:
%%time
stations=sqlContext.sql("SELECT * FROM parquet.`/tmp/stations.parquet`")

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.67 s


In [31]:
df.count()

9358394

In [32]:
df.show()

+-----------+-----------+----+--------------------+
|    Station|Measurement|Year|              Values|
+-----------+-----------+----+--------------------+
|USW00093819|       WDFM|1972|[38 58 EC 5C 38 5...|
|USW00093819|       WDFM|1973|[38 5C EC 5C 08 5...|
|USW00093819|       WDFM|1974|[EC 5C A0 51 EC 5...|
|USW00093819|       WDFM|1975|[EC 5C 38 58 38 5...|
|USW00093819|       WDFM|1976|[38 58 38 58 38 5...|
|USW00093819|       WDFM|1977|[EC 5C 38 5C A0 5...|
|USW00093819|       WDFM|1978|[38 5C 08 5B EC 5...|
|USW00093819|       WDFM|1979|[EC 5C 38 5C 08 5...|
|USW00093819|       WESD|1952|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1953|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1954|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1955|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1956|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1957|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1958|[00 7E 00 7E 00 7...|
|USW00093819|       WESD|1959|[00 7E 00 7E 00 7...|
|USW00093819

### Example

In [34]:
stations.show()

+-----------+--------+---------+---------+-----+-------------+-------+-------+-----+
|         ID|latitude|longitude|elevation|state|         name|GSNFLAG|HCNFLAG|WMOID|
+-----------+--------+---------+---------+-----+-------------+-------+-------+-----+
|US1COLR0185|  40.711|-105.1144|   1599.0|   CO|WELLINGTON 5.|       |       |     |
|US1COLR0186| 40.8135|-105.0963|   1601.7|   CO|BUCKEYE 0.9 S|       |       |     |
|US1COLR0187| 40.7689| -105.064|   1653.8|   CO|WELLINGTON 5.|       |       |     |
|US1COLR0189|  40.689|-105.0242|   1594.1|   CO|WELLINGTON 1.|       |       |     |
|US1COLR0193| 40.6711|-105.0639|   1584.0|   CO|WELLINGTON 3.|       |       |     |
|US1COLR0196|  40.691|-105.0157|   1581.9|   CO|WELLINGTON 0.|       |       |     |
|US1COLR0197|  40.625|-105.3403|   2464.0|   CO|    BLV 8.0 W|       |       |     |
|US1COLR0200| 40.3345|-105.5127|   2431.1|   CO|ESTES PARK 2.|       |       |     |
|US1COLR0201| 40.4701|-105.4493|   2303.1|   CO|GLEN HAVEN 1.|   

In [35]:
st_names=stations.select('ID').collect()

In [36]:
len(st_names)

85284

In [38]:
st_names1=[r.ID for r in st_names]

In [40]:
st_names1[-5:]

[u'US1COLR0171',
 u'US1COLR0172',
 u'US1COLR0173',
 u'US1COLR0181',
 u'US1COLR0183']

In [41]:
from collections import Counter

In [47]:
C=Counter([x[:3] for x in st_names1])

In [51]:
sorted(C.items(),key=lambda x:x[1],reverse=True)

[(u'USC', 22144),
 (u'US1', 20054),
 (u'ASN', 17081),
 (u'BR0', 5934),
 (u'CA0', 5255),
 (u'IN0', 3805),
 (u'USW', 1731),
 (u'USR', 1454),
 (u'SF0', 1154),
 (u'RSM', 1096),
 (u'USS', 668),
 (u'NOE', 399),
 (u'NLE', 353),
 (u'KZ0', 328),
 (u'WA0', 281),
 (u'MX0', 220),
 (u'UPM', 204),
 (u'CH0', 198),
 (u'JA0', 154),
 (u'RQC', 148),
 (u'UY0', 146),
 (u'GG0', 102),
 (u'SPE', 99),
 (u'VE0', 81),
 (u'UZM', 78),
 (u'KG0', 73),
 (u'GME', 69),
 (u'AJ0', 66),
 (u'TI0', 62),
 (u'TX0', 57),
 (u'AM0', 52),
 (u'BOM', 51),
 (u'KSW', 45),
 (u'TH0', 45),
 (u'FRE', 44),
 (u'VMW', 42),
 (u'GMW', 40),
 (u'AR0', 40),
 (u'SWE', 40),
 (u'JAW', 39),
 (u'VQC', 37),
 (u'LG0', 32),
 (u'FMC', 29),
 (u'CA1', 28),
 (u'EN0', 25),
 (u'ROE', 25),
 (u'UKW', 24),
 (u'LH0', 22),
 (u'GM0', 22),
 (u'ZI0', 20),
 (u'NO0', 20),
 (u'BC0', 20),
 (u'SIE', 19),
 (u'MI0', 19),
 (u'AQC', 19),
 (u'ZA0', 18),
 (u'MD0', 18),
 (u'ET0', 18),
 (u'SW0', 18),
 (u'MR0', 17),
 (u'ID0', 17),
 (u'SU0', 16),
 (u'GB0', 16),
 (u'SP0', 16),
 (u'G