In [1]:
# Install PySpark into the kernel's own environment (%pip rather than !pip),
# pinned to the version this notebook was run with so reruns are reproducible.
%pip install -q pyspark==3.5.1

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=b567b687cd6a06c3aca6ba0a1b65b02fecff18261280758fcfb9c8b39a3c0f5b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import requests

In [3]:
# Download CSV file

def download_file(url, filename, timeout=30):
    """Download ``url`` and save the raw response body to ``filename``.

    This is a best-effort helper: failures raised by ``requests`` are reported
    with ``print`` instead of being re-raised. When the request fails, the
    code never reaches the ``open`` call, so ``filename`` is left untouched.
    Errors raised while opening or writing ``filename`` are not caught.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    filename : str
        Path of the file to (over)write, opened in binary mode.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Defaults to 30.

    Returns
    -------
    None
    """
    try:
        # Always pass a timeout: without one a stalled server would block this
        # call indefinitely and the Timeout handler below would never trigger.
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError so an error page is not saved.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded successfully and saved as {filename}")
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        # Catch-all for any other requests failure; must stay last because the
        # handlers above are checked before this more general one.
        print(f"OOps: Something Else: {err}")

In [4]:
# Source of the drivers table and the local file it is saved to.
url = 'https://raw.githubusercontent.com/MainakRepositor/Datasets/master/F1/drivers.csv'
filename = 'data.csv'

# Fetch the CSV; the helper prints the outcome itself.
download_file(url=url, filename=filename)

File downloaded successfully and saved as data.csv


In [5]:
from pyspark.sql import SparkSession

In [6]:
# Create the session named 'DataFrame', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)
# Display the session as the cell's result.
spark

In [7]:
# Read the downloaded CSV: the first row supplies the column names and the
# column types are inferred from the data rather than all read as strings.
df_pyspark = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data.csv')
)
# Preview the first five rows.
df_pyspark.show(n=5)

+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|driverId| driverRef|number|code|forename|   surname|       dob|nationality|                 url|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   Lewis|  Hamilton|07/01/1985|    British|http://en.wikiped...|
|       2|  heidfeld|  NULL| HEI|    Nick|  Heidfeld|10/05/1977|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|    Nico|   Rosberg|27/06/1985|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|Fernando|    Alonso|29/07/1981|    Spanish|http://en.wikiped...|
|       5|kovalainen|  NULL| KOV|  Heikki|Kovalainen|19/10/1981|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
only showing top 5 rows



## Filter Operations

In [9]:
# Drivers whose number is at most 10, using a SQL-style string condition.
# The comparison is not true for a NULL number, so those rows are left out.
low_number_drivers = df_pyspark.filter("number<=10")
low_number_drivers.show()

+--------+---------+------+----+---------+----------+----------+-----------+--------------------+
|driverId|driverRef|number|code| forename|   surname|       dob|nationality|                 url|
+--------+---------+------+----+---------+----------+----------+-----------+--------------------+
|       3|  rosberg|     6| ROS|     Nico|   Rosberg|27/06/1985|     German|http://en.wikiped...|
|       8|raikkonen|     7| RAI|     Kimi|R�_ikk̦nen|17/10/1979|    Finnish|http://en.wikiped...|
|      20|   vettel|     5| VET|Sebastian|    Vettel|03/07/1987|     German|http://en.wikiped...|
|     154| grosjean|     8| GRO|   Romain|  Grosjean|17/04/1986|     French|http://en.wikiped...|
|     155|kobayashi|    10| KOB|    Kamui| Kobayashi|13/09/1986|   Japanese|http://en.wikiped...|
|     842|    gasly|    10| GAS|   Pierre|     Gasly|07/02/1996|     French|http://en.wikiped...|
|     817|ricciardo|     3| RIC|   Daniel| Ricciardo|01/07/1989| Australian|http://en.wikiped...|
|     820|  chilton|

Display only particular columns (like `SELECT` in SQL)

In [12]:
# Apply the row filter, then keep only the two columns of interest.
(
    df_pyspark
    .filter("number<=10")
    .select("forename", "driverRef")
    .show()
)

+---------+---------+
| forename|driverRef|
+---------+---------+
|     Nico|  rosberg|
|     Kimi|raikkonen|
|Sebastian|   vettel|
|   Romain| grosjean|
|    Kamui|kobayashi|
|   Pierre|    gasly|
|   Daniel|ricciardo|
|      Max|  chilton|
|   Marcus| ericsson|
|  Stoffel|vandoorne|
+---------+---------+



Multiple conditions for filtering

In [19]:
# & : a row is kept only when BOTH conditions hold.
number_at_most_10 = df_pyspark["number"] <= 10
is_german = df_pyspark["nationality"] == "German"

(
    df_pyspark
    .filter(number_at_most_10 & is_german)
    .select("driverRef", "nationality")
    .show()
)

+---------+-----------+
|driverRef|nationality|
+---------+-----------+
|  rosberg|     German|
|   vettel|     German|
+---------+-----------+



In [20]:
# | : a row is kept when AT LEAST ONE of the conditions holds.
number_at_most_10 = df_pyspark["number"] <= 10
is_german = df_pyspark["nationality"] == "German"

(
    df_pyspark
    .filter(number_at_most_10 | is_german)
    .select("driverRef", "nationality")
    .show()
)

+------------------+-----------+
|         driverRef|nationality|
+------------------+-----------+
|          heidfeld|     German|
|           rosberg|     German|
|         raikkonen|    Finnish|
|             glock|     German|
|             sutil|     German|
|            vettel|     German|
|   ralf_schumacher|     German|
| markus_winkelhock|     German|
|michael_schumacher|     German|
|          frentzen|     German|
|           bartels|     German|
|         schneider|     German|
|          grosjean|     French|
|         kobayashi|   Japanese|
|            danner|     German|
|           weidler|     German|
|joachim_winkelhock|     German|
|manfred_winkelhock|     German|
|            bellof|     German|
|              mass|     German|
+------------------+-----------+
only showing top 20 rows



In [21]:
# NOT operation ~ : negate a column condition.
number_at_least_5 = df_pyspark['number'] >= 5
df_pyspark.filter(~number_at_least_5).show()

+--------+---------+------+----+--------+---------+----------+-----------+--------------------+
|driverId|driverRef|number|code|forename|  surname|       dob|nationality|                 url|
+--------+---------+------+----+--------+---------+----------+-----------+--------------------+
|     817|ricciardo|     3| RIC|  Daniel|Ricciardo|01/07/1989| Australian|http://en.wikiped...|
|     820|  chilton|     4| CHI|     Max|  Chilton|21/04/1991|    British|http://en.wikiped...|
|     838|vandoorne|     2| VAN| Stoffel|Vandoorne|26/03/1992|    Belgian|http://en.wikiped...|
+--------+---------+------+----+--------+---------+----------+-----------+--------------------+

