## Move csv file from S3 to the head node and then to HDFS

In [16]:
#create directory to hold data on the head node
!mkdir Weather
%cd Weather/

In [18]:
#copy file from S3
!aws s3 cp s3://dse-weather/ALL.csv.gz ./ALL.csv.gz

download: s3://dse-weather/ALL.csv.gz to ./ALL.csv.gz              


In [19]:
#uncompress file
!gunzip ALL.csv.gz

In [20]:
!ls -l

total 7489152
-rw-rw-r-- 1 hadoop hadoop 7668890105 Feb  9  2016 ALL.csv


In [22]:
!head -2 ALL.csv

station,year,measurement,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271

In [27]:
!hadoop fs -mkdir /weather

In [28]:
#copy the csv file from the head node into the data directory on hdfs
!hadoop fs -copyFromLocal ALL.csv /weather/weather.csv

In [29]:
!hadoop fs -ls /weather

Found 1 items
-rw-r--r--   2 hadoop hadoop 7668890105 2018-02-15 21:21 /weather/weather.csv


### Read csv file into an RDD

In [1]:
from pyspark import SparkContext

# pyFiles lists python files that are shipped to the cluster and added to the workers' PYTHONPATH.
# NOTE(review): presumably lib/numpy_pack.py holds the same packArray/unpackArray helpers that are defined inline further down - confirm.
sc = SparkContext(pyFiles=['lib/numpy_pack.py'])

In [2]:
# Read the csv file from HDFS as an RDD of text lines (one element per line).
# NOTE(review): the namenode host name is hard-coded, here and in the parquet output path further down; it has to be edited when the notebook runs on a different cluster.
RDD=sc.textFile('hdfs://ec2-52-205-129-57.compute-1.amazonaws.com/weather/weather.csv')

### Code for packing and unpacking byte arrays

In [123]:
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   the array is flattened if it is not 1D.
   This is intended to be used as the interface for storing numpy arrays
   as fields in a dataframe and then storing the dataframes in a parquet file.
"""

def packArray(a):
    """
    pack a numpy array into a bytearray that can be stored as a single
    field in a spark DataFrame

    Only the raw element bytes returned by ``a.tobytes()`` are kept; the
    dtype and the shape of the array are NOT stored, so whoever decodes the
    bytes (e.g. with ``np.frombuffer``) has to know the dtype.

    :param a: a numpy ndarray (subclasses of ndarray are accepted)
    :returns: the bytes of the array elements
    :rtype: bytearray
    :raises TypeError: if ``a`` is not a numpy ndarray

    """
    # isinstance (rather than an exact type comparison) lets ndarray
    # subclasses through. TypeError is the conventional exception for a
    # wrong argument type and is still caught by ``except Exception``.
    if not isinstance(a, np.ndarray):
        raise TypeError("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())


def unpackArray(x,data_type=np.float16):
    """
    Decode a packed byte buffer back into a one dimensional numpy array.

    :param x: a bytearray holding the raw bytes of the array elements
    :param data_type: The dtype used to interpret the bytes. This is important because it determines how many bytes go into each entry in the array.
    :returns: the decoded array
    :rtype: a numpy ndarray of dtype data_type.

    """
    decoded = np.frombuffer(x, dtype=data_type)
    return decoded

In [7]:
lines=RDD.take(10) # used to debug the code

In [4]:
RDD.count()

9358395

### range values
Using code that was removed we find that the range of values is 

`-1000.0, 97892.0` 

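which means that as ints we would need 32 bits, but with floats we can use just 16. (Caveat: the largest finite `float16` value is 65504, so values above that overflow to `inf`, and `float16` keeps only about three significant decimal digits.)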

In [170]:
#main parsing code

import numpy as np
def parse_weather(line):
    """
    Parse one line of the weather csv file into a row for the DataFrame.

    A valid line has exactly 368 comma separated fields: three key fields
    followed by 365 values. The third key field is converted to int. The 365
    values are converted to float16 (an empty value becomes nan) and packed
    into a single bytearray with packArray.

    NOTE: float16 cannot represent finite values above 65504; larger
    values are stored as inf.

    :param line: one line of text from the csv file
    :returns: [field 0, field 1, int(field 2), packed values], or None if the
              line is malformed (wrong number of fields, or a field that
              cannot be parsed as a number). The header line is rejected
              this way.
    :rtype: list or None

    """
    n_keys = 3        # leading key fields
    n_fields = 368    # key fields + 365 values

    fields = line.split(',')
    # Explicit length check instead of ``assert``: asserts are stripped when
    # python runs with -O, which would let rows of the wrong length through.
    if len(fields) != n_fields:
        return None
    try:
        year = int(fields[n_keys-1])
        values = [np.float16(v) if v != '' else np.nan for v in fields[n_keys:]]
    except ValueError:
        # Only parse failures mark a bad row. Any other exception is a real
        # bug and is allowed to propagate instead of being silently swallowed.
        return None
    Out = fields[:n_keys-1]
    Out.append(year)
    Out.append(packArray(np.array(values, dtype=np.float16)))
    return Out

In [119]:
lines=RDD.take(10)

In [121]:
GG=parse_weather(lines[-2])

In [142]:
# filter out bad rows, which parse_weather maps to None
DATA=RDD.map(parse_weather).filter(lambda row: row is not None)

In [143]:
DATA.count()
#all lines: 9358395
# only the first line (the header) is bad.
# Good lines: 9358394

9358394

In [144]:
DATA.take(1)

[[u'ASN00054128',
  u'DAPR',
  1969,
  bytearray(b'\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00

## Transform RDD into a Spark DataFrame

In [132]:
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType, BinaryType, FloatType

# Just like using Spark requires having a SparkContext, using SQL requires an SQLContext
sqlContext = SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f44e41cf690>

In [171]:
### Defining the Schema explicitly
# Creating a DataFrame with a pre-defined schema allows the content of the RDD to be simple tuples or lists, rather than Rows.

# In this case we create the dataframe from an RDD of plain rows (rather than Rows) and provide the schema explicitly
# Schema with four fields - Station, Measurement, Year and Values.
# Values is binary: it holds the byte array produced by packArray.
schema = StructType([StructField("Station",     StringType(), True),
                     StructField("Measurement", StringType(), True),
                     StructField("Year",        IntegerType(),True),
                     StructField("Values",      BinaryType(),True)
                    ])
schema

StructType(List(StructField(Station,StringType,true),StructField(Measurement,StringType,true),StructField(Year,IntegerType,true),StructField(Values,BinaryType,true)))

In [172]:
# Create a DataFrame by applying the schema to the RDD and print the schema.
# Each row of DATA is a list [station, measurement, year, packed values]; the fields are expected
# to line up with the schema fields by position.
ALL_DataFrame = sqlContext.createDataFrame(DATA, schema)
ALL_DataFrame.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Measurement: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Values: binary (nullable = true)



### Write out data frame into Parquet directory

In [175]:
%%time
!hadoop fs -rm -r /weather/weather.parquet

18/02/17 05:26:48 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /weather/weather.parquet
CPU times: user 44 ms, sys: 20 ms, total: 64 ms
Wall time: 1.93 s


In [176]:
%%time 
# save() is called without an explicit format; the resulting directory (see the S3 listing further down) consists of snappy-compressed parquet part files.
# NOTE(review): no write mode is given either, so presumably the write fails if the target already exists, which would be why the previous cell removes it first - confirm.
outfilename="hdfs://ec2-52-205-129-57.compute-1.amazonaws.com/weather/weather.parquet"
ALL_DataFrame.write.save(outfilename)

CPU times: user 16 ms, sys: 8 ms, total: 24 ms
Wall time: 3min 42s


In [177]:
!hadoop fs -ls /weather/

Found 2 items
-rw-r--r--   2 hadoop hadoop 7668890105 2018-02-15 21:21 /weather/weather.csv
drwxr-xr-x   - hadoop hadoop          0 2018-02-17 05:30 /weather/weather.parquet


### Copy parquet directory to head node and then to S3

In [179]:
%cd /mnt/workspace/Weather/
!rm -rf weather.parquet/
!ls -lrt

/mnt/workspace/Weather
total 7489152
-rw-rw-r-- 1 hadoop hadoop 7668890105 Feb  9  2016 ALL.csv


In [180]:
%%time
!hadoop fs -copyToLocal /weather/weather.parquet weather.parquet

CPU times: user 124 ms, sys: 52 ms, total: 176 ms
Wall time: 6.26 s


In [181]:
!du *

7489152	ALL.csv
2248960	weather.parquet


In [183]:
%%time
#replace the parquet directory on S3 with the freshly written local copy
!aws s3 rm --recursive s3://dse-weather/weather.parquet
!aws s3 cp --recursive ./weather.parquet s3://dse-weather/weather.parquet

delete: s3://dse-weather/weather.parquet/_SUCCESS
delete: s3://dse-weather/weather.parquet/part-00000-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00001-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00006-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00004-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00002-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00003-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00009-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00007-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://dse-weather/weather.parquet/part-00010-bfceb3b0-dbb6-44da-bf98-1276f10339f0.snappy.parquet
delete: s3://d

upload: weather.parquet/part-00011-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00011-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00014-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00014-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00012-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00012-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00002-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00002-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00015-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00015-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00017-48cf40d4-1835-45ac-82b0-af26cdba417

upload: weather.parquet/part-00056-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00056-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00051-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00051-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
upload: weather.parquet/part-00055-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet to s3://dse-weather/weather.parquet/part-00055-48cf40d4-1835-45ac-82b0-af26cdba417c.snappy.parquet
CPU times: user 1.64 s, sys: 396 ms, total: 2.04 s
Wall time: 19.2 s
