In [1]:
# Run this before any pyspark import: findspark locates the local Spark
# installation and makes the pyspark package importable from this kernel.
import findspark
findspark.init()

In [2]:

"""
Since Spark 2.x, Spark unified Spark APIs, DF, Datasets, & SQL.
SparkSession uses SparkContext internally.
"""

from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session if there is one, so
# re-running this cell does not start a second session.
ss = (
    SparkSession.builder
    .master("local")
    .appName("sparkDataFrame")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
ss

In [7]:
# List the input files. `dir` and the backslash path are Windows-specific;
# on macOS/Linux the equivalent is `!ls -l ../data`.
!dir ..\data

 Volume in drive C has no label.
 Volume Serial Number is A88C-3222

 Directory of C:\Users\Administrator\veena\data

05/04/2021  08:41 AM    <DIR>          .
05/04/2021  08:41 AM    <DIR>          ..
04/30/2021  09:02 PM            38,049 all_us_counties.csv
04/30/2021  09:00 PM               656 all_us_states.csv
04/30/2021  09:02 PM         2,072,181 all_us_zipcodes.csv
               3 File(s)      2,110,886 bytes
               2 Dir(s)  449,245,831,168 bytes free


---

In [17]:

"""
Read data from CSV
"""

# load() defaults to the 'parquet' format, so the CSV format is named explicitly.
# No header option is set here, so the file's first line is read as a data row.
df_us_states = ss.read.load("../data/all_us_states.csv", format="csv")

In [20]:
# Show load()'s signature: path, format (defaults to 'parquet'), schema, and
# free-form keyword options.
help(ss.read.load)

Help on method load in module pyspark.sql.readwriter:

load(path=None, format=None, schema=None, **options) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads data from a data source and returns it as a :class`DataFrame`.
    
    :param path: optional string or a list of string for file-system backed data sources.
    :param format: optional string for format of the data source. Default to 'parquet'.
    :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema
                   or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
    :param options: all other string options
    
    >>> df = spark.read.format("parquet").load('python/test_support/sql/parquet_partitioned',
    ...     opt1=True, opt2=1, opt3='str')
    >>> df.dtypes
    [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
    
    >>> df = spark.read.format('json').load(['python/test_support/sql/people.json',
    ...     'python/test_support/sq

In [15]:
# Peek at the first row. Read without the header option, the header line
# ('abbr', 'name') comes back as ordinary data under the names _c0 / _c1.
df_us_states.first()

Row(_c0='abbr', _c1='name')

In [18]:
# Without a header the columns get the generated names _c0, _c1, all typed as string.
df_us_states.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [23]:
# Same information as printSchema(), but as a list of (column name, type name) tuples.
df_us_states.dtypes

[('_c0', 'string'), ('_c1', 'string')]

---

In [24]:
# Re-read the states file, this time taking the column names from its first line.
df_us_states = ss.read.csv("../data/all_us_states.csv", header=True)

In [25]:
# With the header option the columns are now named abbr / name (still string-typed).
df_us_states.printSchema()

root
 |-- abbr: string (nullable = true)
 |-- name: string (nullable = true)



---

In [36]:
# .option() returns the DataFrameReader itself, so this prints the reader's
# class help, including csv() and the keyword options it accepts.
help(ss.read.option("encoding", "utf-8"))

Help on DataFrameReader in module pyspark.sql.readwriter object:

class DataFrameReader(OptionUtils)
 |  DataFrameReader(spark)
 |  
 |  Interface used to load a :class:`DataFrame` from external storage systems
 |  (e.g. file systems, key-value stores, etc). Use :attr:`SparkSession.read`
 |  to access this.
 |  
 |  .. versionadded:: 1.4
 |  
 |  Method resolution order:
 |      DataFrameReader
 |      OptionUtils
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, spark)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLi

In [27]:
# Read the zipcodes file using only the header option: no schema is supplied
# and schema inference is left off.
df_us_zipcodes = (
    ss
    .read
    .format("csv")
    .option("header", True)
    .load("../data/all_us_zipcodes.csv")
)

"""
Notice that the CSV reader types every column as string when no schema is
supplied or inferred.
"""
df_us_zipcodes.printSchema()

root
 |-- code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- area_code: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lon: string (nullable = true)



In [28]:
"""
Enable Spark to infer schema *NOT GOOD practice*
"""
# Why this is discouraged: inferSchema makes Spark take an extra pass over the
# file to guess each column's type, and the guess depends on whatever values
# happen to be in the data.
df_us_zipcodes = (
    ss
    .read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("../data/all_us_zipcodes.csv")
)

"""
Notice that the columns are no longer all strings: with inferSchema enabled,
`code` is inferred as integer and `lat` / `lon` as double.
"""
df_us_zipcodes.printSchema()

root
 |-- code: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- area_code: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)



In [37]:
"""
Provide custom schema *GOOD practice*
"""
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType

# Explicit schema: one add() per CSV column, in file order, all nullable.
# NOTE(review): an integer `code` cannot keep leading zeros (e.g. "00501"
# becomes 501) -- confirm IntegerType is really wanted for ZIP codes.
schema_zipcode = (
    StructType()
    .add(field="code", data_type=IntegerType(), nullable=True)
    .add(field="city", data_type=StringType(), nullable=True)
    .add(field="state", data_type=StringType(), nullable=True)
    .add(field="county", data_type=StringType(), nullable=True)
    .add(field="area_code", data_type=StringType(), nullable=True)
    .add(field="lat", data_type=DoubleType(), nullable=True)
    .add(field="lon", data_type=DoubleType(), nullable=True)
)

# The header line is still skipped as data; column types come from the schema above.
df_us_zipcodes = ss.read.csv(
    "../data/all_us_zipcodes.csv",
    schema=schema_zipcode,
    header=True,
)

df_us_zipcodes.printSchema()

root
 |-- code: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- area_code: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)



In [43]:
"""
Access DF schema 
"""
print("-- DF Schema --")
# Walk the StructType's fields explicitly; each one prints as a StructField.
for struct_field in df_us_zipcodes.schema.fields:
    print(struct_field)

-- DF Schema --
StructField(code,IntegerType,true)
StructField(city,StringType,true)
StructField(state,StringType,true)
StructField(county,StringType,true)
StructField(area_code,StringType,true)
StructField(lat,DoubleType,true)
StructField(lon,DoubleType,true)


In [45]:
# Browse the data types available for building schemas. The output is very
# long; consider clearing it before sharing the notebook.
import pyspark
help(pyspark.sql.types)

Help on module pyspark.sql.types in pyspark.sql:

NAME
    pyspark.sql.types

DESCRIPTION
    # Licensed to the Apache Software Foundation (ASF) under one or more
    # contributor license agreements.  See the NOTICE file distributed with
    # this work for additional information regarding copyright ownership.
    # The ASF licenses this file to You under the Apache License, Version 2.0
    # (the "License"); you may not use this file except in compliance with
    # the License.  You may obtain a copy of the License at
    #
    #    http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

CLASSES
    builtins.object
        DataType
            ArrayType
