In [1]:
from pyspark.sql import SparkSession

import getpass

# The OS login name is used to build a per-user Hive warehouse path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
# NOTE(review): spark.ui.port is set to '0', presumably so the UI binds to
# any free port on a shared gateway -- confirm.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [7]:
# Load the Yelp user CSV, taking column names from the header row and
# letting Spark infer each column's type from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/public/yelp-dataset/yelp_user.csv")
)

In [8]:
# Preview the first five rows of the loaded frame.
df.show(n=5)

+--------------------+------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|             user_id|  name|review_count|yelping_since|             friends|useful|funny|cool|fans|elite|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|
+--------------------+------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|JJ-aSuM4pCFPdkfoZ...| Chris|          10|   2013-09-24|0njfJmB-7n84DlIgU...|     0| 

In [9]:
# Print the column names and the types Spark inferred for the CSV.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



In [20]:
# Same load as before, but schema inference only parses a sample of the data:
# samplingRatio is the fraction of rows Spark reads to detect the column
# types (0.1 here, i.e. a random 10% sample).
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true", samplingRatio=0.1)
    .load("/public/yelp-dataset/yelp_user.csv")
)

In [21]:
# Print the schema inferred from the 10% sample; the recorded output lists
# the same column types as the earlier inferred schema.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



In [22]:
# Read the orders sample with no header, inferSchema, or schema options set.
df = (
    spark.read
    .format("csv")
    .load("/public/trendytech/datasets/orders_sample1.csv")
)

In [23]:
# Display the rows; with no header or schema the columns appear as _c0.._c3.
df.show()

+---+----------+-----+---------------+
|_c0|       _c1|  _c2|            _c3|
+---+----------+-----+---------------+
|  1|2013-07-25|11599|         CLOSED|
|  2|2013-07-25|  256|PENDING_PAYMENT|
|  3|2013-07-25|12111|       COMPLETE|
|  4|2013-07-25| 8827|         CLOSED|
|  5|2013-07-25|11318|       COMPLETE|
|  6|2013-07-25| 7130|       COMPLETE|
|  7|2013-07-25| 4530|       COMPLETE|
|  8|2013-07-25| 2911|     PROCESSING|
|  9|2013-07-25| 5657|PENDING_PAYMENT|
| 10|2013-07-25| 5648|PENDING_PAYMENT|
+---+----------+-----+---------------+



In [24]:
# Print the schema: with no schema or inference, every column is read as string.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [25]:
# Schema for the orders file as a DDL string: comma-separated
# "<column name> <type>" pairs, one per column in file order.
orders_schema = ", ".join([
    "order_id long",
    "order_date date",
    "cust_id long",
    "order_status string",
])

In [26]:
# Re-read the orders sample, this time applying the DDL-string schema so the
# columns get explicit names and types instead of string defaults.
df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/datasets/orders_sample1.csv")
)

In [27]:
# Print the schema to confirm the names and types from orders_schema were applied.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [28]:
# Preview the first five rows read with the DDL-string schema.
df.show(n=5)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
+--------+----------+-------+---------------+
only showing top 5 rows



In [29]:
from pyspark.sql.types import *

In [30]:
# Programmatic schema for the orders file, built field by field.
# Each field keeps StructField's default nullability.
# NOTE(review): custId is IntegerType here, while the DDL-string schema
# declares cust_id as long -- confirm which width is intended.
orders_schema_struct = (
    StructType()
    .add("orderId", LongType())
    .add("orderDate", DateType())
    .add("custId", IntegerType())
    .add("orderStatus", StringType())
)

In [31]:
# Read the orders sample again, now using the StructType schema object.
df = (
    spark.read
    .format("csv")
    .schema(orders_schema_struct)
    .load("/public/trendytech/datasets/orders_sample1.csv")
)

In [32]:
# Preview the first five rows read with the StructType schema.
df.show(n=5)

+-------+----------+------+---------------+
|orderId| orderDate|custId|    orderStatus|
+-------+----------+------+---------------+
|      1|2013-07-25| 11599|         CLOSED|
|      2|2013-07-25|   256|PENDING_PAYMENT|
|      3|2013-07-25| 12111|       COMPLETE|
|      4|2013-07-25|  8827|         CLOSED|
|      5|2013-07-25| 11318|       COMPLETE|
+-------+----------+------+---------------+
only showing top 5 rows



In [33]:
# Print the schema to confirm the StructType field names and types were applied.
df.printSchema()

root
 |-- orderId: long (nullable = true)
 |-- orderDate: date (nullable = true)
 |-- custId: integer (nullable = true)
 |-- orderStatus: string (nullable = true)

