## DataFrame Basics

In [1]:
# Import findspark and initialize it before any pyspark import.
# findspark.init() locates the local Spark installation and makes the
# `pyspark` package importable from this kernel.
import findspark
findspark.init()

In [2]:
# Create the Spark session used by every cell below.
import pyspark
from pyspark.sql import SparkSession

# Name the application, then reuse a running session or start a new one.
spark = (
    SparkSession.builder
    .appName("DataFrame Basics")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/07 11:31:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Build a small DataFrame by hand: one tuple per row, plus the column names.
dataframe = spark.createDataFrame(
    data=[
        (0, "Here is our DataFrame"),
        (1, "We are making one from scratch"),
        (2, "This will look very similar to a Pandas DataFrame"),
    ],
    schema=["id", "words"],
)

# Display the rows; long strings are shown truncated in the output.
dataframe.show()

                                                                                

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|Here is our DataF...|
|  1|We are making one...|
|  2|This will look ve...|
+---+--------------------+



In [4]:
# Load the food price CSV hosted in an S3 bucket.
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/1/food.csv"

# Register the remote file with the Spark context, then read the copy that
# SparkFiles resolves by file name. No schema is passed here, so the columns
# come back as strings (see the printed schema in the following cell).
spark.sparkContext.addFile(url)
df = spark.read.csv(
    path=SparkFiles.get("food.csv"),
    sep=",",
    header=True,
)

# Display the loaded rows.
df.show()

+-------+-----+
|   food|price|
+-------+-----+
|  pizza|    0|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [5]:
# Print our schema


root
 |-- food: string (nullable = true)
 |-- price: string (nullable = true)



In [6]:
# Show the columns


['food', 'price']

In [7]:
# Describe our data


DataFrame[summary: string, food: string, price: string]

In [8]:
# Import struct fields that we can use
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [9]:
# Next we need to create the list of struct fields


[StructField(food,StringType,true), StructField(price,IntegerType,true)]

In [10]:
# Pass in our fields


StructType(List(StructField(food,StringType,true),StructField(price,IntegerType,true)))

In [11]:
# Read our data with our new schema


+-------+-----+
|   food|price|
+-------+-----+
|  pizza|    0|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [12]:
# Print it out


root
 |-- food: string (nullable = true)
 |-- price: integer (nullable = true)



### Accessing data

Column<'price'>

pyspark.sql.column.Column

DataFrame[price: int]

pyspark.sql.dataframe.DataFrame

+-----+
|price|
+-----+
|    0|
|   12|
|   10|
+-----+



### Manipulating Columns

In [18]:
# Add new column


+-------+-----+--------+
|   food|price|newprice|
+-------+-----+--------+
|  pizza|    0|       0|
|  sushi|   12|      12|
|chinese|   10|      10|
+-------+-----+--------+



In [19]:
# Update column name


+-------+----------+
|   food|newerprice|
+-------+----------+
|  pizza|         0|
|  sushi|        12|
|chinese|        10|
+-------+----------+



In [20]:
# Double the price


+-------+-----+-----------+
|   food|price|doubleprice|
+-------+-----+-----------+
|  pizza|    0|          0|
|  sushi|   12|         24|
|chinese|   10|         20|
+-------+-----+-----------+



In [21]:
# Add a dollar to the price


+-------+-----+--------------+
|   food|price|add_one_dollar|
+-------+-----+--------------+
|  pizza|    0|             1|
|  sushi|   12|            13|
|chinese|   10|            11|
+-------+-----+--------------+



In [22]:
# Halve the price


+-------+-----+----------+
|   food|price|half_price|
+-------+-----+----------+
|  pizza|    0|       0.0|
|  sushi|   12|       6.0|
|chinese|   10|       5.0|
+-------+-----+----------+



In [23]:
# Collecting a column as a list


[Row(price=0), Row(price=12), Row(price=10)]

### Converting a PySpark DataFrame to a Pandas DataFrame

In [24]:
import pandas as pd
# Convert the Spark DataFrame held in `dataframe` to a pandas DataFrame.
# NOTE(review): the last visible assignment to `dataframe` is the hand-built
# id/words frame, yet the output below shows food/price columns — presumably
# an unshown cell rebinds `dataframe` to the food data; confirm.
pandas_df = dataframe.toPandas() 

In [25]:
# Preview the first rows of the converted pandas DataFrame.
pandas_df.head()

Unnamed: 0,food,price
0,pizza,0
1,sushi,12
2,chinese,10
