## Load from CSV
The code in this notebook copies the CSV files for beaked whales (stored under the `CVS` directory) from S3 to HDFS
and then loads the data into a Spark DataFrame.


In [17]:
# Shut down the SparkContext bound to `sc` so that the next cell can construct a new one.
# NOTE(review): `sc` is not created anywhere above this cell -- presumably the
# cluster's notebook environment pre-defines it; confirm, otherwise a fresh
# kernel raises NameError here.
sc.stop()

In [19]:
from pyspark import SparkContext
# Create the SparkContext that is used further down to read the text files.
# NOTE(review): `master_url` is not defined in this notebook -- presumably it is
# injected by the cluster's notebook setup; confirm.
sc = SparkContext(master=master_url)

from pyspark.sql import Row, SQLContext,DataFrame
from pyspark.sql.types import *

# SQLContext built on top of `sc`; it is used later to turn the parsed RDD into a DataFrame.
sqlContext = SQLContext(sc)

%pylab inline

!pip install pandas
!pip install scipy

import pandas as pd
import datetime as dt

from scipy.io import loadmat,savemat,whosmat

from string import split
from collections import Counter
import re
import numpy as np
from numpy import shape

from glob import glob

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"




### Format of CSV files
|field name     | Description               | Data Type
|---------------|---------------------------|--------------
|0: time        | time of click             | string in datetime format `%Y-%m-%d %H:%M:%S.%f`
|1: species		| Initial species classification	        | 'str'
|2: site		| name of site		        | 'str'
|3: rec_no		| recording number		    | 'str'
|4: bout_i		| bout number		        | numpy.int64
|5: peak2peak	| peak to peak magnitude    | 			numpy.float64
|6: MSN	        |	wave form |		 an array of length 202
|208: MSP		|	spectra |	 an array of length 101  
|309: TPWS1     | 1 if click appears in TPWS1 | bool
|310: MD1       | 1 if click appears in MD1   | bool
|311: FD1       | 1 if click appears in FD1   | bool
|312: TPWS2     | 1 if click appears in TPWS2 | bool
|313: MD2       | 1 if click appears in MD2   | bool
|314: FD2       | 1 if click appears in FD2   | bool
|315: TPWS3     | 1 if click appears in TPWS3 | bool
|316: MD3       | 1 if click appears in MD3   | bool
|317: FD3       | 1 if click appears in FD3   | bool

Total number of fields = 318


### Check S3 contents

In [None]:
# Switch the working directory to the project checkout.
# NOTE(review): presumably required so that the relative 'lib' entry appended
# to sys.path in the "Read data into dataframe" cell resolves -- confirm.
%cd /root/ipython/BeakedWhaleClassification/

In [None]:
# Remember to set credentials under "setup S3" in the cluster setup page.
# NOTE(review): `s3helper` is not defined in this notebook -- presumably it is
# provided by the cluster's notebook environment; confirm.
# Select the bucket to work with, then (presumably) list its top-level contents.
s3helper.open_bucket('while-classification')
s3helper.ls_s3()

In [None]:
# Ask the helper for the listing under 'CVS' and preview at most the first ten entries.
# NOTE(review): assumes ls_s3 returns a sliceable sequence -- confirm against s3helper.
dirs=s3helper.ls_s3('CVS')
dirs[:10]

In [None]:
from time import time

### Copy from S3 to HDFS

In [None]:
# Time the S3 -> HDFS copy; the cell's displayed value is the elapsed wall-clock time in seconds.
# NOTE(review): the two 'CVS' arguments are presumably (source in S3, destination in HDFS) -- confirm against s3helper.
t1=time()
s3helper.s3_to_hdfs('CVS', 'CVS')
time()-t1

### Read data into dataframe

In [24]:
import sys
# Make the local 'lib' directory importable; the path is relative to the current working directory.
sys.path.append('lib')
from row_parser import *

# Parse_rules is iterated by parse() below, which reads each rule's 'start', 'end' and 'parser' entries;
# RowObject is the constructor parse() uses to build one record per line.
# NOTE(review): init_parser_parameters presumably comes from row_parser (star import) -- confirm.
Parse_rules,field_names,RowObject = init_parser_parameters()

from pyspark.sql import DataFrame

# Read everything under /CVS/ as an RDD of text lines and print the first line as a sanity check.
# NOTE(review): the AttributeError recorded in this cell's output ("'NoneType' object has no
# attribute 'sc'") is consistent with this cell having been run after `sc.stop()` (execution
# counts 23 -> 24); re-run the SparkContext cell before this one -- confirm.
CVS_Data=sc.textFile("/CVS/")
row=CVS_Data.first()
print row

def parse(row):
    """Turn one comma-separated text line into a RowObject.

    The line is split on ',' and each entry of the module-level Parse_rules
    is applied in order. A rule is a dict with 'start', 'end' and 'parser'
    keys: when the rule covers exactly one column the parser receives that
    single string, otherwise it receives the list slice items[start:end].
    The converted values are passed positionally to RowObject.
    """
    columns = row.split(',')

    def apply_rule(rule):
        # Look the three keys up in the same order on every rule.
        lo, hi, convert = rule['start'], rule['end'], rule['parser']
        # Width-1 rules get a scalar string; wider rules get a list of strings.
        raw = columns[lo] if hi - lo == 1 else columns[lo:hi]
        return convert(raw)

    return RowObject(*[apply_rule(rule) for rule in Parse_rules])

# Uncomment to try the parser on the single sample line read above:
#parse(row)

# Apply parse() to every text line, giving an RDD of RowObject records.
RDD=CVS_Data.map(parse)
# Uncomment to inspect the first three parsed records:
# RDD.take(3)

# Build a DataFrame from the parsed records and display its first rows.
df=sqlContext.createDataFrame(RDD)
df.show()

('time', 'datetime') {'start': 0, 'parser': <function parse_date at 0x7f8945c98b90>, 'end': 1, 'name': 'time'}
('species', 'str') {'start': 1, 'parser': <function parse_string at 0x7f8945972938>, 'end': 2, 'name': 'species'}
('site', 'str') {'start': 2, 'parser': <function parse_string at 0x7f8945972938>, 'end': 3, 'name': 'site'}
('rec_no', 'str') {'start': 3, 'parser': <function parse_string at 0x7f8945972938>, 'end': 4, 'name': 'rec_no'}
('bout_i', 'int') {'start': 4, 'parser': <function parse_int at 0x7f8945972aa0>, 'end': 5, 'name': 'bout_i'}
('peak2peak', 'float') {'start': 5, 'parser': <function parse_float at 0x7f8945972c80>, 'end': 6, 'name': 'peak2peak'}
('MSN', 'array', 202) {'start': 6, 'parser': <function parse_array at 0x7f89813686e0>, 'end': 208, 'name': 'MSN'}
('MSP', 'array', 101) {'start': 208, 'parser': <function parse_array at 0x7f89813686e0>, 'end': 309, 'name': 'MSP'}
('TPWS1', 'bool') {'start': 309, 'parser': <function parse_int at 0x7f8945972aa0>, 'end': 310, 'n

AttributeError: 'NoneType' object has no attribute 'sc'

In [None]:
# First pass: request caching of the DataFrame, count its rows, and print the elapsed seconds.
t0=time()
print df.cache().count()
print time()-t0

# Second pass: count the same DataFrame again; the cell's displayed value is the elapsed seconds.
# NOTE(review): presumably faster than the first pass because the data is now cached -- compare the two timings.
t0=time()
print df.count()
time()-t0

In [23]:
# Shut down the SparkContext once the work is finished.
# NOTE(review): cells that use `sc` must not be re-run after this without first
# re-creating the context; the AttributeError recorded in the "Read data into
# dataframe" cell looks like the result of doing exactly that -- confirm.
sc.stop()