### Reading Data

In [1]:
import os

# List the entries of the data directory so we can confirm the input files
# (the CSV and JSON read later in this notebook) are present.
files = os.listdir('/users/shubh/Data Engineering/')
print(files)


['.ipynb_checkpoints', '1_Tutoral.ipynb', 'BigMart Sales.csv', 'drivers.json', 'Python', 'Spark_tutorial']


In [2]:
from pyspark.sql import SparkSession

# Build a Spark session named "MySparkApp" on the local master using all
# available cores ("local[*]"); getOrCreate() reuses a session if one exists.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs this session.
sc = spark.sparkContext

In [None]:
# Switch off the Jedi backend of IPython's tab completer.
%config Completer.use_jedi = False

In [3]:
# Load the BigMart sales CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    '/users/shubh/Data Engineering/BigMart Sales.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print a text-table preview of the freshly loaded DataFrame.
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [7]:
# DataFrame.display() is a Databricks notebook helper; open-source PySpark does
# not provide it and raises PySparkAttributeError (an AttributeError subclass).
# Catch and print the error so this demonstration does not break "Run All".
try:
    df.display()
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

PySparkAttributeError: [ATTRIBUTE_NOT_SUPPORTED] Attribute `display` is not supported.

### Reading JSON Files

In [None]:
# Read the drivers JSON file as line-delimited JSON (multiLine=False means one
# JSON record per line).
# The JSON source always infers the schema when none is supplied and has no
# 'header' concept, so the previous 'inferedSchema' (misspelt) and 'header'
# options were silently ignored and have been removed.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/users/shubh/Data Engineering/drivers.json')
)

In [None]:
# Preview the records parsed from the JSON file.
df_json.show()

### Schema Definition

In [None]:
# Print the column names and types currently attached to df.
df.printSchema()

### DDL Schema

In [None]:
# Schema written as a DDL string: comma-separated "column_name type" pairs,
# one per BigMart column.
# NOTE(review): Item_Weight is declared as String even though the CSV preview
# shows decimal values (e.g. 9.3, 5.92) — confirm this override is intentional.
my_ddl_schema= '''
                    Item_Identifier String,
                    Item_Weight String,
                    Item_Fat_Content String,
                    Item_Visibility Double,
                    Item_Type String,
                    Item_MRP Double,
                    Outlet_Identifier String,
                    Outlet_Establishment_Year Integer,
                    Outlet_Size String,
                    Outlet_Location_Type String,
                    Outlet_Type String,
                    Item_Outlet_Sales Double
'''

In [None]:
# Re-read the CSV, this time applying the DDL schema instead of inferring types.
# The relative path is resolved against the kernel's working directory.
df = spark.read.csv(
    'BigMart Sales.csv',
    schema=my_ddl_schema,
    header=True,
)

In [None]:
# Preview the rows read with the DDL schema applied.
df.show()

In [None]:
# Check that the column types now follow my_ddl_schema.
df.printSchema()

### StructType() Schema

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
# Programmatic schema: one nullable StringType field per BigMart column, in
# file order.
# NOTE(review): numeric-looking columns (Item_Weight, Item_MRP, ...) are also
# typed as strings here — confirm that is intended.
my_struct_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Item_Identifier',
        'Item_Weight',
        'Item_Fat_Content',
        'Item_Visibility',
        'Item_Type',
        'Item_MRP',
        'Outlet_Identifier',
        'Outlet_Establishment_Year',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
        'Item_Outlet_Sales',
    )
])

In [None]:
# Re-read the CSV with the StructType schema instead of inferring types.
# The relative path is resolved against the kernel's working directory.
df = spark.read.csv(
    'BigMart Sales.csv',
    schema=my_struct_schema,
    header=True,
)

In [None]:
# Preview the rows read with the StructType schema applied.
df.show()

In [None]:
# Check that the column types now follow my_struct_schema.
df.printSchema()

### Select

In [None]:
# Preview the DataFrame before selecting columns.
# show() is used instead of display(): display() is a Databricks-only helper
# and raises PySparkAttributeError in open-source PySpark.
df.show()

In [None]:
# Project three columns by passing their names as plain strings.
df.select('Item_Identifier','Item_Weight','Item_Fat_Content').show()

In [None]:
# Project the same three columns, this time as Column objects built with col().
df.select(col('Item_Identifier'),col('Item_Weight'),col('Item_Fat_Content')).show()

#### Alias


In [None]:
# alias() renames the selected column in the result: Item_Identifier -> Item_ID.
df.select(col('Item_Identifier').alias('Item_ID')).show()

### Filter

### Scenario 1

In [None]:
# Look at the unfiltered data first for comparison.
df.show()

In [None]:
# Keep only the rows whose Item_Fat_Content equals 'Regular'.
df.filter(col('Item_Fat_Content')=='Regular').show()

### Scenario 2

In [None]:
# Soft drinks with Item_Weight below 10 (both conditions must hold).
# collect() returns every matching Row to the driver as a Python list.
(
    df
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .collect()
)

In [None]:
# Same two-condition filter, printed as a text table with show().
(
    df
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .show()
)

In [None]:
# Same filter, printing up to 100 rows with long cell values truncated.
(
    df
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .show(100, truncate=True)
)

In [None]:
# Same filter, converted to a pandas DataFrame so the notebook renders it as a
# rich table. toPandas() brings all matching rows to the driver.
(
    df
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .toPandas()
)

In [None]:
# Same filter, capped at 10 rows before the conversion to pandas.
(
    df
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .limit(10)
    .toPandas()
)

In [None]:
# Rows located in Tier 1 or Tier 2 whose Outlet_Size is null, capped at 20 rows
# and shown as a pandas DataFrame.
(
    df
    .filter(
        col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
        & col('Outlet_Size').isNull()
    )
    .limit(20)
    .toPandas()
)