In [7]:
"""
- DataFrame: a restricted form of RDD (the column types are constrained, which allows optimisation)
    => two-dimensional: rows / columns
        -> each column has a defined type
        -> each row holds one record
    => similar to the dataframes of R or pandas
"""
# By default, the number of partitions in the SparkContext equals the number of workers.
from time import time
from pyspark import SparkContext
# Spark context running in local mode with 4 workers
sc = SparkContext(master="local[4]")

# SQL context, built on top of the Spark context; used below to create and read DataFrames
from pyspark import Row
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
# DataFrame schema types (StructType, StructField, StringType, IntegerType are used below).
# NOTE(review): SQLContext is already imported above, so the next line is redundant,
# and the wildcard import hides where the type names come from.
from pyspark.sql import SQLContext
from pyspark.sql.types import *

In [34]:
"""
- Creation of a dataframe using the Row object
"""
# Build an RDD whose elements are Row objects: every (name, age) pair below
# is wrapped in a Row before the list is distributed with parallelize().
some_rdd = sc.parallelize([
    Row(name=person, age=years)
    for person, years in ((u"John", 28), (u"Dwayne", 36), (u"Adam", 21), (u"Andree", 40))
])

# Bring the rows back to the driver to display them
some_rdd.collect()

[Row(age=28, name='John'),
 Row(age=36, name='Dwayne'),
 Row(age=21, name='Adam'),
 Row(age=40, name='Andree')]

In [35]:
# Convert the RDD of Row objects into a DataFrame
some_df = sqlContext.createDataFrame(some_rdd)
# Print the schema: name and type of each column
some_df.printSchema()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [36]:
# Collecting an RDD and the DataFrame built from it gives the same list of Rows;
# only the type of the two objects differs.
print('Type of RDD : ',type(some_rdd),'\n Type of DataFrame : ', type(some_df))
print('RDD : ',some_rdd.collect())
# Label fixed: this line shows the rows of the DataFrame, not those of the RDD.
print('DataFrame : ',some_df.collect())

Type of RDD :  <class 'pyspark.rdd.RDD'> 
 Type of DataFrame :  <class 'pyspark.sql.dataframe.DataFrame'>
RDD :  [Row(age=28, name='John'), Row(age=36, name='Dwayne'), Row(age=21, name='Adam'), Row(age=40, name='Andree')]
RDD :  [Row(age=28, name='John'), Row(age=36, name='Dwayne'), Row(age=21, name='Adam'), Row(age=40, name='Andree')]


In [61]:
"""
- Creation of a DataFrame from an RDD of tuples and an explicit schema
"""

# RDD of plain (name, age) tuples; column names and types come from the schema below
rdd0 = sc.parallelize([('Pedro',24),('Hernesto',41),('Maria',29),('Javier',33),('Alan',49),('Luciana',21),])
# Schema definition: StructField(name, type, nullable)
schema = StructType([StructField('personName',StringType(),False),
                  StructField('personAge',IntegerType(),False)])
# Build the DataFrame from the tuple RDD and the explicit schema
df0 = sqlContext.createDataFrame(rdd0,schema)
df0.printSchema()

# Write the DataFrame to a parquet dataset.
# mode('overwrite') replaces the dataset left by a previous run: with the default
# save mode the write raises an error when the target path already exists, so the
# cell could only be executed once.
df0.write.mode('overwrite').parquet('data/people.parquet')

root
 |-- personName: string (nullable = false)
 |-- personAge: integer (nullable = false)



In [62]:
"""
- Loading dataframes from Disk :
    => Parquet
    => JSON
    => CSV
"""
# Directory of the parquet dataset written by the previous cell
parquetFileDir = 'data/people.parquet'
# No format is given to load(), so the session's default data source is used
# (presumably parquet, the Spark default -- confirm if spark.sql.sources.default was changed)
df1 = sqlContext.read.load(parquetFileDir)
df1.show()
# Project the DataFrame onto explicit columns with select()
df1_select = df1.select('personName','personAge')
df1_select.show()
# mode('overwrite') keeps the cell re-runnable: the default save mode raises an
# error when 'data/people2.parquet' already exists from a previous execution.
df1_select.write.mode('overwrite').parquet('data/people2.parquet')

+----------+---------+
|personName|personAge|
+----------+---------+
|  Hernesto|       41|
|     Maria|       29|
|      Alan|       49|
|   Luciana|       21|
|    Javier|       33|
|     Pedro|       24|
+----------+---------+

+----------+---------+
|personName|personAge|
+----------+---------+
|  Hernesto|       41|
|     Maria|       29|
|      Alan|       49|
|   Luciana|       21|
|    Javier|       33|
|     Pedro|       24|
+----------+---------+



In [4]:
"""
- Meteorological dataframe :
    => download the NY weather archive into the data/Weather directory
"""
from os.path import split,join,exists
from os import mkdir,getcwd,remove
from glob import glob

def makedir(rep):
    """
    Create the directory `rep` unless it already exists.

    Missing parent directories are created as well, so a nested path such as
    'data/Weather' can be created in one call even when 'data' is absent
    (a plain mkdir raises FileNotFoundError in that case).

    Arguments :
        => rep : path of the directory to create (type : String)

    Prints whether the directory was created or was already present.
    """
    # Local import: the file-level import only brings `mkdir` into scope.
    from os import makedirs

    if exists(rep):
        print('directory',rep,'already exists')
    else:
        print('making',rep)
        makedirs(rep)

# Create the target directory if needed.
# Paths are built from the current working directory: <cwd>/data/Weather

notebook_dir=getcwd()
print(notebook_dir)
data_dir=join(notebook_dir,'data')
print(data_dir)
weather_dir=join(data_dir,'Weather')
print(weather_dir)
# create the Weather directory
makedir(weather_dir)

# file_index selects which archive is fetched from the by_state/ folder of the bucket
file_index='NY'
zip_file='%s.tgz'%(file_index) # name of the .tgz archive to download
# zip_file[:-3] drops the trailing 'tgz' and keeps the dot ('NY.'), so the pattern
# matches every '<file_index>.*' entry in weather_dir left by a previous run
old_files='%s/%s*'%(weather_dir,zip_file[:-3])
for f in glob(old_files):
    print('removing',f)
    # NOTE(review): the path is interpolated unquoted into the shell command
    !rm -rf {f}

# Download the archive into weather_dir (wget -P sets the destination directory)
command="wget https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s -P %s "%(zip_file, weather_dir)
print(command)
!$command
# List the downloaded archive to check that it is present
!ls -lh $weather_dir/$zip_file


/home/ucsddse230/work/PA1_collinear_points/pa1/PySparkNoteBook
/home/ucsddse230/work/PA1_collinear_points/Data
making /home/ucsddse230/work/PA1_collinear_points/Data
/home/ucsddse230/work/PA1_collinear_points/Data/Weather
making /home/ucsddse230/work/PA1_collinear_points/Data/Weather
wget https://mas-dse-open.s3.amazonaws.com/Weather/by_state/NY.tgz -P /home/ucsddse230/work/PA1_collinear_points/Data/Weather 
--2019-10-28 16:39:19--  https://mas-dse-open.s3.amazonaws.com/Weather/by_state/NY.tgz
Resolving mas-dse-open.s3.amazonaws.com (mas-dse-open.s3.amazonaws.com)... 52.218.236.3
Connecting to mas-dse-open.s3.amazonaws.com (mas-dse-open.s3.amazonaws.com)|52.218.236.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66288146 (63M) [application/x-tar]
Saving to: ‘/home/ucsddse230/work/PA1_collinear_points/Data/Weather/NY.tgz’


2019-10-28 16:39:32 (5.80 MB/s) - ‘/home/ucsddse230/work/PA1_collinear_points/Data/Weather/NY.tgz’ saved [66288146/66288146]

-rwxr-xr-x 

"\n#trouver leurs fichier meteo:\ndf_meteo = ...\n#\ndf_meteo.select('station','year','measurment').show(5)\n#save pf_meteo dans un fichier parquet : \ndata_dir = 'data'\nfile_index = '1'\nfilename = '%s/US_weather_%s.parquet'%(data_dir,file_index)\n###!rm -rf $filename\ndf_meteo.write.save(filename)\n"

In [5]:
# Unpack the downloaded .tgz archive inside weather_dir
# (-C sets the extraction directory; v lists each extracted entry)
!tar zxvf {weather_dir}/{zip_file} -C {weather_dir}



NY.parquet/
NY.parquet/_SUCCESS
NY.parquet/part-00022-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00000-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00021-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00001-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00023-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00002-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00024-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00003-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00025-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00004-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00027-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-00005-89caf7c0-9733-40ec-a650-7f368529dd01-c000.snappy.parquet
NY.parquet/part-0000

In [9]:
# zip_file[:-3] drops the 'tgz' suffix and keeps the dot, so this yields
# '<weather_dir>/<file_index>.parquet' -- the directory extracted from the archive
weather_parquet = join(weather_dir, zip_file[:-3]+'parquet')
print(weather_parquet)
# No format is passed to load(); presumably the default data source is parquet -- confirm
df = sqlContext.read.load(weather_parquet)

# Display only the first row
df.show(1)


/home/ucsddse230/work/PA1_collinear_points/Data/Weather/NY.parquet
+-----------+-----------+----+--------------------+-----------------+--------------+------------------+-----------------+-----+-----------------+
|    Station|Measurement|Year|              Values|       dist_coast|      latitude|         longitude|        elevation|state|             name|
+-----------+-----------+----+--------------------+-----------------+--------------+------------------+-----------------+-----+-----------------+
|USW00094704|   PRCP_s20|1945|[00 00 00 00 00 0...|361.8320007324219|42.57080078125|-77.71330261230469|208.8000030517578|   NY|DANSVILLE MUNI AP|
+-----------+-----------+----+--------------------+-----------------+--------------+------------------+-----------------+-----+-----------------+
only showing top 1 row



"\n#trouver leurs fichier meteo:\ndf_meteo = ...\n#\ndf_meteo.select('station','year','measurment').show(5)\n#save pf_meteo dans un fichier parquet : \ndata_dir = 'data'\nfile_index = '1'\nfilename = '%s/US_weather_%s.parquet'%(data_dir,file_index)\n###!rm -rf $filename\ndf_meteo.write.save(filename)\n"

In [15]:

# Preview three columns of the weather DataFrame
df.select('Station','Year','Measurement').show(5)
print(file_index)

# Save the weather DataFrame as a parquet dataset under data_dir.

filename = '%s/US_weather_%s.parquet'%(data_dir,file_index)
# mode('overwrite') replaces any dataset left by a previous run. It supersedes the
# former `!rm -rf $filename`, which passed the path unquoted to the shell (unsafe
# when the path contains spaces) and only worked where `rm` is available.
df.write.mode('overwrite').save(filename)
print('Fichier écrit : ',data_dir,filename,file_index)


+-----------+----+-----------+
|    Station|Year|Measurement|
+-----------+----+-----------+
|USW00094704|1945|   PRCP_s20|
|USW00094704|1946|   PRCP_s20|
|USW00094704|1947|   PRCP_s20|
|USW00094704|1948|   PRCP_s20|
|USW00094704|1949|   PRCP_s20|
+-----------+----+-----------+
only showing top 5 rows

NY
Fichier écrit :  /home/ucsddse230/work/PA1_collinear_points/Data /home/ucsddse230/work/PA1_collinear_points/Data/US_weather_NY.parquet NY
