In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=991e0db054c392e158fa9125032245d2bab0adb4d0b90f83d540294995cf4f46
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import requests

In [5]:
# Download the CSV file

def download_file(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    Errors raised by ``requests`` are reported with ``print`` rather than
    propagated, so the function returns ``None`` whether or not the download
    succeeded; check the printed message before relying on the file.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Local path the response body is written to (opened in
            binary write mode, so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; the default of 30 keeps a stalled
            server from hanging the notebook.
    """
    try:
        # requests applies no timeout unless one is given, so without this
        # argument a silent server would block the cell indefinitely.
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into HTTPError so an error page is never
        # written to disk as if it were the data.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded successfully and saved as {filename}")
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    # Timeout is handled before ConnectionError because ConnectTimeout inherits
    # from both; this order reports connect timeouts as timeouts.
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.RequestException as err:
        print(f"OOps: Something Else: {err}")

In [6]:
# Local target path and the Google Drive direct-download link for the CSV.
filename = 'data.csv'
url = 'https://drive.google.com/uc?id=1zO8ekHWx9U7mrbx_0Hoxxu6od7uxJqWw&export=download'

# Fetch the file; success or failure is reported by the printed message.
download_file(url=url, filename=filename)

File downloaded successfully and saved as data.csv


In [7]:
from pyspark.sql import SparkSession

In [9]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)
# Display the session object as the cell output.
spark

## Reading a dataset

In [12]:
# Use the first line of the CSV as the column names. No schema inference is
# requested here, so every column is read as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv')
)

## Checking the Schema


In [13]:
# Print each column's name, type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Index: string (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phone 1: string (nullable = true)
 |-- Phone 2: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Subscription Date: string (nullable = true)
 |-- Website: string (nullable = true)



## Reading Data with Headers and an Inferred Schema in One Call

In [14]:
# Re-read the file in a single call: take column names from the first line
# and let Spark infer each column's type instead of defaulting to string.
df_pyspark = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
)
# Print the leading rows as a text table.
df_pyspark.show()

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

## Get Columns

In [15]:
# Fetch the first 3 rows as a list of Row objects.
df_pyspark.head(3)

[Row(Index=1, Customer Id='DD37Cf93aecA6Dc', First Name='Sheryl', Last Name='Baxter', Company='Rasmussen Group', City='East Leonard', Country='Chile', Phone 1='229.077.5154', Phone 2='397.884.0519x718', Email='zunigavanessa@smith.info', Subscription Date=datetime.date(2020, 8, 24), Website='http://www.stephenson.com/'),
 Row(Index=2, Customer Id='1Ef7b82A4CAAD10', First Name='Preston', Last Name='Lozano', Company='Vega-Gentry', City='East Jimmychester', Country='Djibouti', Phone 1='5153435776', Phone 2='686-620-1820x944', Email='vmata@colon.com', Subscription Date=datetime.date(2021, 4, 23), Website='http://www.hobbs.com/'),
 Row(Index=3, Customer Id='6F94879bDAfE5a6', First Name='Roy', Last Name='Berry', Company='Murillo-Perry', City='Isabelborough', Country='Antigua and Barbuda', Phone 1='+1-539-402-0259', Phone 2='(496)978-3969x58947', Email='beckycarr@hogan.com', Subscription Date=datetime.date(2020, 3, 25), Website='http://www.lawrence.com/')]

In [17]:
# Select a single column. select() returns a new DataFrame; with no action
# such as show(), the cell output is only that DataFrame's schema.

df_pyspark.select('First Name')

DataFrame[First Name: string]

In [19]:
# Print the first 2 values of the selected column.
df_pyspark.select('First Name').show(2)

+----------+
|First Name|
+----------+
|    Sheryl|
|   Preston|
+----------+
only showing top 2 rows



In [20]:
# Select several columns by passing a list of names. As with a single column,
# the result is a new DataFrame and only its schema is displayed here.

df_pyspark.select(['First Name', 'Last Name'])

DataFrame[First Name: string, Last Name: string]

In [21]:
# Print the first 2 rows of the two selected columns.
df_pyspark.select(['First Name', 'Last Name']).show(2)

+----------+---------+
|First Name|Last Name|
+----------+---------+
|    Sheryl|   Baxter|
|   Preston|   Lozano|
+----------+---------+
only showing top 2 rows



## Reading DataType

In [22]:
# List (column name, type) pairs for the current DataFrame.
df_pyspark.dtypes

[('Index', 'int'),
 ('Customer Id', 'string'),
 ('First Name', 'string'),
 ('Last Name', 'string'),
 ('Company', 'string'),
 ('City', 'string'),
 ('Country', 'string'),
 ('Phone 1', 'string'),
 ('Phone 2', 'string'),
 ('Email', 'string'),
 ('Subscription Date', 'date'),
 ('Website', 'string')]

In [24]:
# describe() returns a new summary DataFrame; without .show() the cell displays
# only its schema, not the statistics. Note the summary has no
# 'Subscription Date' column.
df_pyspark.describe()

DataFrame[summary: string, Index: string, Customer Id: string, First Name: string, Last Name: string, Company: string, City: string, Country: string, Phone 1: string, Phone 2: string, Email: string, Website: string]

## Adding Columns

In [33]:
# Derive 'increased_index' as Index + 2 and preview it next to the original.
# The result is not assigned, so df_pyspark itself does not gain the column.
(
    df_pyspark
    .withColumn('increased_index', df_pyspark['Index'] + 2)
    .select(['Index', 'increased_index'])
    .show(2)
)

+-----+---------------+
|Index|increased_index|
+-----+---------------+
|    1|              3|
|    2|              4|
+-----+---------------+
only showing top 2 rows




## Dropping Columns

In [35]:
# drop() returns a new DataFrame, so the result is assigned back to
# df_pyspark; from this cell onward 'Phone 2' is no longer available.
df_pyspark = df_pyspark.drop('Phone 2')

# Display the summary frame's schema to confirm the column is gone.
df_pyspark.describe()

DataFrame[summary: string, Index: string, Customer Id: string, First Name: string, Last Name: string, Company: string, City: string, Country: string, Phone 1: string, Email: string, Website: string]

## Renaming Columns

In [36]:
# Preview the data with 'City' shown as 'Home Town'. The renamed frame is not
# assigned, so df_pyspark keeps its original column name.
(
    df_pyspark
    .withColumnRenamed('City', 'Home Town')
    .show(2)
)

+-----+---------------+----------+---------+---------------+-----------------+--------+------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|        Company|        Home Town| Country|     Phone 1|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+---------------+-----------------+--------+------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|Rasmussen Group|     East Leonard|   Chile|229.077.5154|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|    Vega-Gentry|East Jimmychester|Djibouti|  5153435776|     vmata@colon.com|       2021-04-23|http://www.hobbs....|
+-----+---------------+----------+---------+---------------+-----------------+--------+------------+--------------------+-----------------+--------------------+
only showing top 2 rows

