## Starting Spark session

In [5]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create a session
# for an application named 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

## Reading the data

In [11]:
# Read the Netflix titles CSV: the first line is treated as the header and
# column types are inferred. Keyword arguments are used (rather than mixing
# .option() with a kwarg) so this matches the orders.csv read in this notebook.
# NOTE(review): release_year is reported as string in the printed schema even
# with inferSchema=True; possibly some rows are mis-parsed (quoted or
# multi-line fields) - consider the quote/escape/multiLine options. TODO confirm.
data_path = "/content/drive/MyDrive/dataset/netflix_titles.csv"
data = spark.read.csv(data_path, header=True, inferSchema=True)

In [12]:
# Print the DataFrame as a text table; long cell values are truncated with '...'.
data.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                null|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [13]:
# Check what kind of object the CSV reader returned (a Spark DataFrame).
type(data)

pyspark.sql.dataframe.DataFrame

In [14]:
# View the first 3 rows. Unlike show(), head(n) returns the rows to Python
# as a list of Row objects instead of printing a table.
data.head(3)

[Row(show_id='s1', type='Movie', title='Dick Johnson Is Dead', director='Kirsten Johnson', cast=None, country='United States', date_added='September 25, 2021', release_year='2020', rating='PG-13', duration='90 min', listed_in='Documentaries', description='As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'),
 Row(show_id='s2', type='TV Show', title='Blood & Water', director=None, cast='Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng', country='South Africa', date_added='September 24, 2021', release_year='2021', rating='TV-MA', duration='2 Seasons', listed_in='International TV Shows, TV Dramas, TV Mysteries', description='After crossin

In [15]:
# Print each column's name, inferred type and nullability as a tree.
data.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [17]:
# Load the orders CSV: the first line is the header and column types are inferred.
# NOTE(review): Sales, Quantity and Discount come back as string in the printed
# schema, so some values in those columns are presumably not plain numbers -
# TODO confirm before doing arithmetic on them.
orders_datapath = '/content/drive/MyDrive/dataset/orders.csv'
orders_df = spark.read.csv(orders_datapath, header=True, inferSchema=True)
orders_df.show()

+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
|Row ID|       Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|   Customer Name|    Segment|         City|          State|      Country|Postal Code|Market|      Region|      Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|    Profit|Shipping Cost|Order Priority|
+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
| 32298| CA-2012-124891|31-07-2012|31-07-2012|      Same Day|

In [18]:
# Inspect the column types Spark inferred for the orders data.
orders_df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Postal Code: integer (nullable = true)
 |-- Market: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Profit: double (nullable = true)
 |-- Shipping Cost: double (nullable = true)
 |-- Order Priority: string (nullable = true)



## Selecting Columns

In [20]:
# .columns is a plain Python list of the column names, in DataFrame order.
cols = orders_df.columns
print(cols)

['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'City', 'State', 'Country', 'Postal Code', 'Market', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost', 'Order Priority']


In [23]:
# select() returns a new DataFrame holding only the requested column; on its
# own it just echoes the schema - call show() to see the rows.
orders_df.select('Ship Mode')

DataFrame[Ship Mode: string]

In [22]:
# Display the rows of the single selected column (show() prints the top 20).
orders_df.select('Ship Mode').show()

+--------------+
|     Ship Mode|
+--------------+
|      Same Day|
|  Second Class|
|   First Class|
|   First Class|
|      Same Day|
|  Second Class|
|   First Class|
|Standard Class|
|Standard Class|
|  Second Class|
|  Second Class|
|   First Class|
|  Second Class|
|      Same Day|
|  Second Class|
|  Second Class|
|  Second Class|
|Standard Class|
|   First Class|
|  Second Class|
+--------------+
only showing top 20 rows



In [24]:
# Select several columns at once; select() accepts the names either as
# separate arguments or as a single list. Returns a new two-column DataFrame.
orders_df.select('Ship Mode', 'City')

DataFrame[Ship Mode: string, City: string]

In [25]:
# Display the two selected columns (show() prints the top 20 rows).
orders_df.select('Ship Mode', 'City').show()

+--------------+-------------+
|     Ship Mode|         City|
+--------------+-------------+
|      Same Day|New York City|
|  Second Class|   Wollongong|
|   First Class|     Brisbane|
|   First Class|       Berlin|
|      Same Day|        Dakar|
|  Second Class|       Sydney|
|   First Class|      Porirua|
|Standard Class|     Hamilton|
|Standard Class|   Sacramento|
|  Second Class|      Concord|
|  Second Class|   Alexandria|
|   First Class|        Kabul|
|  Second Class|        Jizan|
|      Same Day|       Toledo|
|  Second Class|   Mudanjiang|
|  Second Class|        Paris|
|  Second Class|    Henderson|
|Standard Class|        Prato|
|   First Class|   Townsville|
|  Second Class|       Uvinza|
+--------------+-------------+
only showing top 20 rows



## Datatypes

In [27]:
# .dtypes gives (column name, type name) pairs as a plain Python list.
orders_df.dtypes

[('Row ID', 'int'),
 ('Order ID', 'string'),
 ('Order Date', 'string'),
 ('Ship Date', 'string'),
 ('Ship Mode', 'string'),
 ('Customer ID', 'string'),
 ('Customer Name', 'string'),
 ('Segment', 'string'),
 ('City', 'string'),
 ('State', 'string'),
 ('Country', 'string'),
 ('Postal Code', 'int'),
 ('Market', 'string'),
 ('Region', 'string'),
 ('Product ID', 'string'),
 ('Category', 'string'),
 ('Sub-Category', 'string'),
 ('Product Name', 'string'),
 ('Sales', 'string'),
 ('Quantity', 'string'),
 ('Discount', 'string'),
 ('Profit', 'double'),
 ('Shipping Cost', 'double'),
 ('Order Priority', 'string')]

## Describing dataframe

In [28]:
# describe() returns a new DataFrame of summary statistics (a 'summary' column
# plus one string column per input column); nothing is displayed until show().
orders_df.describe()

DataFrame[summary: string, Row ID: string, Order ID: string, Order Date: string, Ship Date: string, Ship Mode: string, Customer ID: string, Customer Name: string, Segment: string, City: string, State: string, Country: string, Postal Code: string, Market: string, Region: string, Product ID: string, Category: string, Sub-Category: string, Product Name: string, Sales: string, Quantity: string, Discount: string, Profit: string, Shipping Cost: string, Order Priority: string]

In [29]:
# Display the summary statistics table.
orders_df.describe().show()

+-------+------------------+------------+----------+----------+--------------+-----------+------------------+-----------+--------------------+------+-----------+------------------+------+------+----------------+----------+------------+--------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+
|summary|            Row ID|    Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|                City| State|    Country|       Postal Code|Market|Region|      Product ID|  Category|Sub-Category|        Product Name|             Sales|         Quantity|           Discount|            Profit|     Shipping Cost|    Order Priority|
+-------+------------------+------------+----------+----------+--------------+-----------+------------------+-----------+--------------------+------+-----------+------------------+------+------+----------------+----------+------------+--------------------+--

## Adding Columns and Dropping Columns

In [35]:
# withColumn() is not an in-place operation: it returns a new DataFrame with
# the extra column, and orders_df itself is unchanged (the result is only
# displayed here, not assigned).
# 'Sales' and 'Discount' were inferred as strings, so cast them to double
# explicitly instead of relying on Spark's implicit coercion in '+'.
# NOTE(review): 'toal_amt' looks like a typo for 'total_amt'; the name is kept
# so the column still matches the saved output.
orders_df.withColumn(
    'toal_amt',
    orders_df['Sales'].cast('double') + orders_df['Discount'].cast('double'),
).show()

+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+------------------+
|Row ID|       Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|   Customer Name|    Segment|         City|          State|      Country|Postal Code|Market|      Region|      Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|    Profit|Shipping Cost|Order Priority|          toal_amt|
+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+------------------+
| 32

In [38]:
# drop() returns a new DataFrame without the listed columns; orders_df is not
# reassigned, so it still contains 'Order Date' and 'Ship Date' afterwards.
orders_df.drop('Order Date','Ship Date').show()

+------+---------------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
|Row ID|       Order ID|     Ship Mode|Customer ID|   Customer Name|    Segment|         City|          State|      Country|Postal Code|Market|      Region|      Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|    Profit|Shipping Cost|Order Priority|
+------+---------------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
| 32298| CA-2012-124891|      Same Day|   RH-19495|     Rick Hansen|   Consumer|New York City|       New York|United States|   

## Renaming the columns

In [39]:
# withColumnRenamed(old, new) returns a new DataFrame with 'Order ID' renamed
# to 'Ord id'; the result is only displayed, orders_df keeps the original name.
orders_df.withColumnRenamed('Order ID','Ord id').show()

+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
|Row ID|         Ord id|Order Date| Ship Date|     Ship Mode|Customer ID|   Customer Name|    Segment|         City|          State|      Country|Postal Code|Market|      Region|      Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|    Profit|Shipping Cost|Order Priority|
+------+---------------+----------+----------+--------------+-----------+----------------+-----------+-------------+---------------+-------------+-----------+------+------------+----------------+---------------+------------+--------------------+--------+--------+--------+----------+-------------+--------------+
| 32298| CA-2012-124891|31-07-2012|31-07-2012|      Same Day|

## Handling missing values