<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Spark-Session" data-toc-modified-id="Spark-Session-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Spark Session</a></span></li><li><span><a href="#Spark-Dataframes" data-toc-modified-id="Spark-Dataframes-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Spark Dataframes</a></span><ul class="toc-item"><li><span><a href="#Create-a-Dataframe-from-a-csv-file" data-toc-modified-id="Create-a-Dataframe-from-a-csv-file-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Create a Dataframe from a csv file</a></span></li><li><span><a href="#Create-a-Dataframe-from-a-pd.DataFrame" data-toc-modified-id="Create-a-Dataframe-from-a-pd.DataFrame-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Create a Dataframe from a pd.DataFrame</a></span></li><li><span><a href="#Saving-a-Dataframe" data-toc-modified-id="Saving-a-Dataframe-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Saving a Dataframe</a></span></li><li><span><a href="#Saving-as-a-Partitioned-Table" data-toc-modified-id="Saving-as-a-Partitioned-Table-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Saving as a Partitioned Table</a></span></li></ul></li></ul></div>

# Imports

In [1]:
from pyspark.sql import SparkSession
import pandas as pd

# Spark Session

In [2]:
# instantiating a spark session
spark = SparkSession.builder.getOrCreate()

# Options:
#conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])


# Spark Dataframes

## Create a Dataframe from a csv file

In [3]:
ship = spark.read.csv('trekdata.csv',header=True)

In [4]:
ship.show()

+---------+-----------+-----------------+-----+
| registry|  ship_name|            class|speed|
+---------+-----------+-----------------+-----+
| NCC-1680|     Hoover|  Fighter Carrier|  1.5|
|NCC-42254|      Curry|Thru-Deck Carrier|    5|
| NCC-1769|  Armstrong|     Fast Frigate|   10|
| NCC-1640|    Palomar|  Command Cruiser|   20|
|NCC-C1200|     Aurora|          Cruiser|   20|
|NCC-63646|      Akira|       Battleship|   15|
| NCC-1701| Enterprise|    Heavy Cruiser|    5|
| NCC-1700|Contitution|    Heavy Cruiser|    1|
|  NCC-325|      Woden|  Cargo Transport|    3|
|NCC-F1913|      Huron|        Freighter|    8|
+---------+-----------+-----------------+-----+



## Create a Dataframe from a pd.DataFrame

In [7]:
# creating pd.DataFrame
series_data = {'registry' :['NCC-1680','NCC-1769','NCC-1701','NCC-F1913','NCC-42254','NCC-C1200'],
               'series':['Discovery','Original','Original','New Generation','New Generation','New Generation']}
pd_series = pd.DataFrame(series_data)

# transforming pd,DataFrame to pySparkDataframe

series = spark.createDataFrame(pd_series)

## Saving a Dataframe

In [10]:
ship.write.saveAsTable('star_trek_ship_df', format='parquet',mode='overwrite')

## Saving as a Partitioned Table

In [None]:
ship.write.saveAsTable('star_trek_ship_df_part', partitionBy='speed',format='parquet')

# adding data to a partitioned table (previously created)

ship.write.insertInto('star_trek_ship_df_part')

In [12]:
ship_part = spark.sql("select * from star_trek_ship_df_part ")

In [14]:
ship_part.show(5)

+--------+-----------+---------------+-----+
|registry|  ship_name|          class|speed|
+--------+-----------+---------------+-----+
|NCC-1700|Contitution|  Heavy Cruiser|    1|
|NCC-1700|Contitution|  Heavy Cruiser|    1|
|NCC-1640|    Palomar|Command Cruiser|   20|
|NCC-1640|    Palomar|Command Cruiser|   20|
|NCC-1769|  Armstrong|   Fast Frigate|   10|
+--------+-----------+---------------+-----+
only showing top 5 rows



In [15]:
spark.sql("show partitions star_trek_ship_df_part").show()

+---------+
|partition|
+---------+
|speed= 20|
|  speed=1|
|speed=1.5|
| speed=10|
| speed=15|
| speed=20|
|  speed=3|
|  speed=5|
|  speed=8|
+---------+

