# DataFrame

In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('spark').getOrCreate()

In [12]:
df = spark.createDataFrame(
    [
        ("Marcelina", "Tetlak", 32),
        ("Anna", "Radomska", 42),
        
    ],
    ['first', 'last', 'age']
)

In [13]:
df.show()

+---------+--------+---+
|    first|    last|age|
+---------+--------+---+
|Marcelina|  Tetlak| 32|
|     Anna|Radomska| 42|
+---------+--------+---+



# Read csv

In [None]:
# data/best_selling_books.csv

In [16]:
csv1 = spark.read.format('csv').load('data/best_selling_books.csv')
csv1.show()

+--------------------+--------------------+-----------------+---------------+--------------------+--------------------+
|                 _c0|                 _c1|              _c2|            _c3|                 _c4|                 _c5|
+--------------------+--------------------+-----------------+---------------+--------------------+--------------------+
|                Book|           Author(s)|Original language|First published|Approximate sales...|               Genre|
|A Tale of Two Cities|     Charles Dickens|          English|           1859|                 200|  Historical fiction|
|The Little Prince...|Antoine de Saint-...|           French|           1943|                 200|             Novella|
|Harry Potter and ...|       J. K. Rowling|          English|           1997|                 120|             Fantasy|
|And Then There We...|     Agatha Christie|          English|           1939|                 100|             Mystery|
|Dream of the Red ...|          Cao Xueq

In [17]:
csv2 = spark.read.format('csv').load('data/country-codes.csv')
csv2.show()

+--------------------+
|                 _c0|
+--------------------+
|  Afghanistan;AF;AFG|
|Åland Islands;AX;ALA|
|      Albania;AL;ALB|
|      Algeria;DZ;DZA|
|American Samoa;AS...|
|      Andorra;AD;AND|
|       Angola;AO;AGO|
|     Anguilla;AI;AIA|
|   Antarctica;AQ;ATA|
|Antigua and Barbu...|
|    Argentina;AR;ARG|
|      Armenia;AM;ARM|
|        Aruba;AW;ABW|
|    Australia;AU;AUS|
|      Austria;AT;AUT|
|   Azerbaijan;AZ;AZE|
|      Bahamas;BS;BHS|
|      Bahrain;BH;BHR|
|   Bangladesh;BD;BGD|
|     Barbados;BB;BRB|
+--------------------+
only showing top 20 rows



In [20]:
csv3 = (
    spark.read
    .format('csv')
    .options(header=True, sep=",")
    .load('data/best_selling_books.csv')
)
csv3.show()

+--------------------+--------------------+-----------------+---------------+-----------------------------+--------------------+
|                Book|           Author(s)|Original language|First published|Approximate sales in millions|               Genre|
+--------------------+--------------------+-----------------+---------------+-----------------------------+--------------------+
|A Tale of Two Cities|     Charles Dickens|          English|           1859|                          200|  Historical fiction|
|The Little Prince...|Antoine de Saint-...|           French|           1943|                          200|             Novella|
|Harry Potter and ...|       J. K. Rowling|          English|           1997|                          120|             Fantasy|
|And Then There We...|     Agatha Christie|          English|           1939|                          100|             Mystery|
|Dream of the Red ...|          Cao Xueqin|          Chinese|           1791|                    

In [21]:
csv4 = (
    spark.read
    .format('csv')
    .options(header=False, sep=";")
    .load('data/country-codes.csv')
)
csv4.show()

+-------------------+---+---+
|                _c0|_c1|_c2|
+-------------------+---+---+
|        Afghanistan| AF|AFG|
|      Åland Islands| AX|ALA|
|            Albania| AL|ALB|
|            Algeria| DZ|DZA|
|     American Samoa| AS|ASM|
|            Andorra| AD|AND|
|             Angola| AO|AGO|
|           Anguilla| AI|AIA|
|         Antarctica| AQ|ATA|
|Antigua and Barbuda| AG|ATG|
|          Argentina| AR|ARG|
|            Armenia| AM|ARM|
|              Aruba| AW|ABW|
|          Australia| AU|AUS|
|            Austria| AT|AUT|
|         Azerbaijan| AZ|AZE|
|            Bahamas| BS|BHS|
|            Bahrain| BH|BHR|
|         Bangladesh| BD|BGD|
|           Barbados| BB|BRB|
+-------------------+---+---+
only showing top 20 rows



# Schema DataFrame

In [4]:
df = spark.read.csv("data/Games.csv", header=True, quote="\"")
df.show()

+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+
|                Name|Sales|              Series|Release|               Genre|           Developer|           Publisher|
+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+
|PlayerUnknown's B...|   42|                NULL| Dec-17|       Battle royale|        PUBG Studios|             Krafton|
|           Minecraft|   33|           Minecraft| Nov-11|   Sandbox, survival|      Mojang Studios|      Mojang Studios|
|          Diablo III|   20|              Diablo| May-12| Action role-playing|Blizzard Entertai...|Blizzard Entertai...|
|         Garry's Mod|   20|                NULL| Nov-06|             Sandbox|   Facepunch Studios|               Valve|
|            Terraria| 17.2|                NULL| May-11|    Action-adventure|            Re-Logic|            Re-Logic|
|   World of Warcraft|   14|    

In [5]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- Release: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Publisher: string (nullable = true)



# Tworzenie Schema

In [6]:
spark.read.csv("data/best_selling_books.csv", header=True).printSchema()

root
 |-- Book: string (nullable = true)
 |-- Author(s): string (nullable = true)
 |-- Original language: string (nullable = true)
 |-- First published: string (nullable = true)
 |-- Approximate sales in millions: string (nullable = true)
 |-- Genre: string (nullable = true)



In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [9]:
schema = StructType(
    [
        StructField("Book", StringType(), False),
        StructField("Authors", StringType(), False),
        StructField("Original Language", StringType(), False),
        StructField("First published", IntegerType(), False),
        StructField("Sales", DoubleType(), False),
        StructField("Genre", StringType(), False)
    ]
)

In [10]:
print(schema)

StructType([StructField('Book', StringType(), False), StructField('Authors', StringType(), False), StructField('Original Language', StringType(), False), StructField('First published', IntegerType(), False), StructField('Sales', DoubleType(), False), StructField('Genre', StringType(), False)])


In [11]:
df = spark.read.csv("data/best_selling_books.csv", header=True, schema=schema)
df.printSchema()

root
 |-- Book: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- Original Language: string (nullable = true)
 |-- First published: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Genre: string (nullable = true)



In [12]:
csv1 = spark.read.format("csv").schema(schema).load("data/best_selling_books.csv")
csv1.printSchema()

root
 |-- Book: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- Original Language: string (nullable = true)
 |-- First published: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Genre: string (nullable = true)



In [14]:
csv1 = spark.read.format("csv").schema(schema).load("data/best_selling_books.csv")
csv1.show()

+--------------------+--------------------+-----------------+---------------+-----+--------------------+
|                Book|             Authors|Original Language|First published|Sales|               Genre|
+--------------------+--------------------+-----------------+---------------+-----+--------------------+
|                Book|           Author(s)|Original language|           NULL| NULL|               Genre|
|A Tale of Two Cities|     Charles Dickens|          English|           1859|200.0|  Historical fiction|
|The Little Prince...|Antoine de Saint-...|           French|           1943|200.0|             Novella|
|Harry Potter and ...|       J. K. Rowling|          English|           1997|120.0|             Fantasy|
|And Then There We...|     Agatha Christie|          English|           1939|100.0|             Mystery|
|Dream of the Red ...|          Cao Xueqin|          Chinese|           1791|100.0|         Family saga|
|          The Hobbit|    J. R. R. Tolkien|          En

# Wyświetlanie wybranych kolumn

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import col

In [16]:
spark = SparkSession.builder.appName("spark").getOrCreate()

In [17]:
games_schema = StructType(
    [
        StructField("Name", StringType(), False),
        StructField("Sales", DoubleType(), False),
        StructField("Series", StringType(), True),
        StructField("Release", StringType(), False),
        StructField("Genre", StringType(), False),
        StructField("Developer", StringType(), False),
        StructField("Publisher", StringType(), False)
    ]
)

In [33]:
df = spark.read.csv("data/Games.csv", header=True, schema=games_schema)
df.show()

+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+
|                Name|Sales|              Series|Release|               Genre|           Developer|           Publisher|
+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+
|PlayerUnknown's B...| 42.0|                NULL| Dec-17|       Battle royale|        PUBG Studios|             Krafton|
|           Minecraft| 33.0|           Minecraft| Nov-11|   Sandbox, survival|      Mojang Studios|      Mojang Studios|
|          Diablo III| 20.0|              Diablo| May-12| Action role-playing|Blizzard Entertai...|Blizzard Entertai...|
|         Garry's Mod| 20.0|                NULL| Nov-06|             Sandbox|   Facepunch Studios|               Valve|
|            Terraria| 17.2|                NULL| May-11|    Action-adventure|            Re-Logic|            Re-Logic|
|   World of Warcraft| 14.0|    

In [19]:
df.select (col('name'), col('sales'), col('developer')).show()

+--------------------+-----+--------------------+
|                name|sales|           developer|
+--------------------+-----+--------------------+
|PlayerUnknown's B...| 42.0|        PUBG Studios|
|           Minecraft| 33.0|      Mojang Studios|
|          Diablo III| 20.0|Blizzard Entertai...|
|         Garry's Mod| 20.0|   Facepunch Studios|
|            Terraria| 17.2|            Re-Logic|
|   World of Warcraft| 14.0|Blizzard Entertai...|
|         Half-Life 2| 12.0|               Valve|
|The Witcher 3: Wi...| 12.0|      CD Projekt Red|
|           StarCraft| 11.0|Blizzard Entertai...|
|            The Sims| 11.0|               Maxis|
|           Fall Guys| 10.0|          Mediatonic|
|RollerCoaster Tyc...| 10.0|Frontier Developm...|
|           Half-Life|  9.0|               Valve|
|                Rust|  9.0|   Facepunch Studios|
|      Civilization V|  8.0|       Firaxis Games|
|          The Sims 3|  7.0|               Maxis|
|Euro Truck Simula...|  6.5|        SCS Software|


In [20]:
df.select('name', 'sales', 'developer').show()

+--------------------+-----+--------------------+
|                name|sales|           developer|
+--------------------+-----+--------------------+
|PlayerUnknown's B...| 42.0|        PUBG Studios|
|           Minecraft| 33.0|      Mojang Studios|
|          Diablo III| 20.0|Blizzard Entertai...|
|         Garry's Mod| 20.0|   Facepunch Studios|
|            Terraria| 17.2|            Re-Logic|
|   World of Warcraft| 14.0|Blizzard Entertai...|
|         Half-Life 2| 12.0|               Valve|
|The Witcher 3: Wi...| 12.0|      CD Projekt Red|
|           StarCraft| 11.0|Blizzard Entertai...|
|            The Sims| 11.0|               Maxis|
|           Fall Guys| 10.0|          Mediatonic|
|RollerCoaster Tyc...| 10.0|Frontier Developm...|
|           Half-Life|  9.0|               Valve|
|                Rust|  9.0|   Facepunch Studios|
|      Civilization V|  8.0|       Firaxis Games|
|          The Sims 3|  7.0|               Maxis|
|Euro Truck Simula...|  6.5|        SCS Software|


# Sortowanie danych

In [21]:
df.orderBy(
    col("developer").asc(),
    col("sales").desc()
          ).show()

+--------------------+-----+------------------+-------+--------------------+--------------------+--------------------+
|                Name|Sales|            Series|Release|               Genre|           Developer|           Publisher|
+--------------------+-----+------------------+-------+--------------------+--------------------+--------------------+
|       Duke Nukem 3D|  1.0|        Duke Nukem| Jan-96|First-person shooter|           3D Realms|GT Interactive So...|
|         Machinarium|  1.0|              NULL| Oct-09|Graphic adventure...|      Amanita Design|      Amanita Design|
|          Guild Wars|  6.0|        Guild Wars| Apr-05|              MMORPG|            ArenaNet|              NCsoft|
|        Guild Wars 2|  5.0|        Guild Wars| Aug-12|              MMORPG|            ArenaNet|              NCsoft|
|             Magicka|  2.0|              NULL| Jan-11|    Action-adventure|Arrowhead Game St...| Paradox Interactive|
|Patrician III: Ri...|  1.0|     The Patrician| 

In [22]:
df.show(3, truncate=False)

+-----------------------------+-----+---------+-------+-------------------+----------------------+----------------------+
|Name                         |Sales|Series   |Release|Genre              |Developer             |Publisher             |
+-----------------------------+-----+---------+-------+-------------------+----------------------+----------------------+
|PlayerUnknown's Battlegrounds|42.0 |NULL     |Dec-17 |Battle royale      |PUBG Studios          |Krafton               |
|Minecraft                    |33.0 |Minecraft|Nov-11 |Sandbox, survival  |Mojang Studios        |Mojang Studios        |
|Diablo III                   |20.0 |Diablo   |May-12 |Action role-playing|Blizzard Entertainment|Blizzard Entertainment|
+-----------------------------+-----+---------+-------+-------------------+----------------------+----------------------+
only showing top 3 rows



In [23]:
df.limit(5).show(10)

+--------------------+-----+---------+-------+-------------------+--------------------+--------------------+
|                Name|Sales|   Series|Release|              Genre|           Developer|           Publisher|
+--------------------+-----+---------+-------+-------------------+--------------------+--------------------+
|PlayerUnknown's B...| 42.0|     NULL| Dec-17|      Battle royale|        PUBG Studios|             Krafton|
|           Minecraft| 33.0|Minecraft| Nov-11|  Sandbox, survival|      Mojang Studios|      Mojang Studios|
|          Diablo III| 20.0|   Diablo| May-12|Action role-playing|Blizzard Entertai...|Blizzard Entertai...|
|         Garry's Mod| 20.0|     NULL| Nov-06|            Sandbox|   Facepunch Studios|               Valve|
|            Terraria| 17.2|     NULL| May-11|   Action-adventure|            Re-Logic|            Re-Logic|
+--------------------+-----+---------+-------+-------------------+--------------------+--------------------+



# Limit i collect

In [24]:
df.limit(1).collect()

[Row(Name="PlayerUnknown's Battlegrounds", Sales=42.0, Series=None, Release='Dec-17', Genre='Battle royale', Developer='PUBG Studios', Publisher='Krafton')]

In [27]:
df.limit(1).collect()[0]

Row(Name="PlayerUnknown's Battlegrounds", Sales=42.0, Series=None, Release='Dec-17', Genre='Battle royale', Developer='PUBG Studios', Publisher='Krafton')

In [28]:
df.limit(1).collect()[0][1]

42.0

# Dodawanie kolumny

In [34]:
df = df.withColumn('SalesX1000', col('sales') * 1000 )

In [35]:
df.show()

+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+----------+
|                Name|Sales|              Series|Release|               Genre|           Developer|           Publisher|SalesX1000|
+--------------------+-----+--------------------+-------+--------------------+--------------------+--------------------+----------+
|PlayerUnknown's B...| 42.0|                NULL| Dec-17|       Battle royale|        PUBG Studios|             Krafton|   42000.0|
|           Minecraft| 33.0|           Minecraft| Nov-11|   Sandbox, survival|      Mojang Studios|      Mojang Studios|   33000.0|
|          Diablo III| 20.0|              Diablo| May-12| Action role-playing|Blizzard Entertai...|Blizzard Entertai...|   20000.0|
|         Garry's Mod| 20.0|                NULL| Nov-06|             Sandbox|   Facepunch Studios|               Valve|   20000.0|
|            Terraria| 17.2|                NULL| May-11|    Action-adventur

# Lista i słownik w DataFrame

In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

In [37]:
spark = SparkSession.builder.appName("spark").getOrCreate()

In [38]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True)
    ]
)

In [39]:
emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active"),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired"),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active"),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired"),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New"),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active"),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New")
    ],
    schema
)

In [40]:
emp.printSchema()

root
 |-- id: integer (nullable = false)
 |-- first: string (nullable = false)
 |-- last: string (nullable = false)
 |-- skills: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- salary: integer (nullable = false)
 |-- role: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- status: string (nullable = true)



In [44]:
emp.limit(3).show(truncate=False)

+---+-------+--------+--------------------------+------+----------------------------------------+------+
|id |first  |last    |skills                    |salary|role                                    |status|
+---+-------+--------+--------------------------+------+----------------------------------------+------+
|1  |Adam   |Nowak   |[SQL, Java, GCP]          |3500  |{level -> 1, position -> Java Developer}|NULL  |
|2  |Jan    |Kowalski|[SQL, Java, Azure, Spring]|8000  |{level -> 3, position -> Java Developer}|Active|
|3  |Dominik|Bajt    |[Python, MongoDB, Redis]  |4000  |{level -> 1, position -> Data Developer}|NULL  |
+---+-------+--------+--------------------------+------+----------------------------------------+------+



In [46]:
emp.limit(3).show()

+---+-------+--------+--------------------+------+--------------------+------+
| id|  first|    last|              skills|salary|                role|status|
+---+-------+--------+--------------------+------+--------------------+------+
|  1|   Adam|   Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|
|  2|    Jan|Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|
|  3|Dominik|    Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|
+---+-------+--------+--------------------+------+--------------------+------+



# 5.2. getItem oraz size

In [47]:
emp.select(
    col("skills")[1],
    col("role")["level"]
).show()

+---------+-----------+
|skills[1]|role[level]|
+---------+-----------+
|     Java|          1|
|     Java|          3|
|  MongoDB|          1|
|   Python|          1|
|    CI/CD|          2|
|      GCP|          2|
|    CI/CD|          3|
|     Java|          0|
|     Java|          0|
|   Django|          1|
+---------+-----------+



In [49]:
emp.select(
    col("skills").getItem(1),
    col("role").getItem('position'),
).show()

+---------+----------------+
|skills[1]|  role[position]|
+---------+----------------+
|     Java|  Java Developer|
|     Java|  Java Developer|
|  MongoDB|  Data Developer|
|   Python|  Data Scientist|
|    CI/CD|          DevOps|
|      GCP| Cloud Architect|
|    CI/CD|          DevOps|
|     Java|          Intern|
|     Java|          Intern|
|   Django|Python Developer|
+---------+----------------+



In [50]:
emp.select(
    col("skills").getItem(1),
    col("role").getItem('position'),
    size(col('skills')),
    size(col('role'))
).show()

+---------+----------------+------------+----------+
|skills[1]|  role[position]|size(skills)|size(role)|
+---------+----------------+------------+----------+
|     Java|  Java Developer|           3|         2|
|     Java|  Java Developer|           4|         2|
|  MongoDB|  Data Developer|           3|         2|
|   Python|  Data Scientist|           3|         2|
|    CI/CD|          DevOps|           3|         2|
|      GCP| Cloud Architect|           4|         2|
|    CI/CD|          DevOps|           5|         2|
|     Java|          Intern|           3|         2|
|     Java|          Intern|           3|         2|
|   Django|Python Developer|           3|         2|
+---------+----------------+------------+----------+



# 5.3. lit i explode

In [53]:
emp.withColumn("company", lit('Dziurex')).show()

+---+---------+----------+--------------------+------+--------------------+------+-------+
| id|    first|      last|              skills|salary|                role|status|company|
+---+---------+----------+--------------------+------+--------------------+------+-------+
|  1|     Adam|     Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|Dziurex|
|  2|      Jan|  Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|Dziurex|
|  3|  Dominik|      Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|Dziurex|
|  4|      Ewa|    Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|Dziurex|
|  5|Krzysztof| Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|Dziurex|
|  6|      Ewa| Kierownik|[Azure, GCP, AWS,...| 12500|{level -> 2, posi...| Fired|Dziurex|
|  7|     Adam|  Kowalski|[Git, CI/CD, Dock...| 10500|{level -> 3, posi...|   New|Dziurex|
|  8| Dominika|Praktyczna| [SQL, Java, Python]|  3000|{level -> 0, posi...|  NULL|Dziurex|

In [54]:
emp.select(
    col('id'),
    explode(col('skills'))
).show()

+---+-------+
| id|    col|
+---+-------+
|  1|    SQL|
|  1|   Java|
|  1|    GCP|
|  2|    SQL|
|  2|   Java|
|  2|  Azure|
|  2| Spring|
|  3| Python|
|  3|MongoDB|
|  3|  Redis|
|  4|    SQL|
|  4| Python|
|  4| Pandas|
|  5|    Git|
|  5|  CI/CD|
|  5| Docker|
|  6|  Azure|
|  6|    GCP|
|  6|    AWS|
|  6|  Linux|
+---+-------+
only showing top 20 rows



In [55]:
emp.select(
    col('id'),
    explode(col('role'))
).show()

+---+--------+----------------+
| id|     key|           value|
+---+--------+----------------+
|  1|   level|               1|
|  1|position|  Java Developer|
|  2|   level|               3|
|  2|position|  Java Developer|
|  3|   level|               1|
|  3|position|  Data Developer|
|  4|   level|               1|
|  4|position|  Data Scientist|
|  5|   level|               2|
|  5|position|          DevOps|
|  6|   level|               2|
|  6|position| Cloud Architect|
|  7|   level|               3|
|  7|position|          DevOps|
|  8|   level|               0|
|  8|position|          Intern|
|  9|   level|               0|
|  9|position|          Intern|
| 10|   level|               1|
| 10|position|Python Developer|
+---+--------+----------------+



# 5.4. Konkatenacja

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

spark = SparkSession.builder.appName("spark").getOrCreate()

In [2]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True)
    ]
)

In [3]:
emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active"),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired"),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active"),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired"),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New"),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active"),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New")
    ],
    schema
)

In [6]:
emp.withColumn("employee", concat(col('first'), lit(' '), col('last'))).show()

+---+---------+----------+--------------------+------+--------------------+------+-------------------+
| id|    first|      last|              skills|salary|                role|status|           employee|
+---+---------+----------+--------------------+------+--------------------+------+-------------------+
|  1|     Adam|     Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|         Adam Nowak|
|  2|      Jan|  Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|       Jan Kowalski|
|  3|  Dominik|      Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|       Dominik Bajt|
|  4|      Ewa|    Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|         Ewa Piksel|
|  5|Krzysztof| Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|Krzysztof Zależność|
|  6|      Ewa| Kierownik|[Azure, GCP, AWS,...| 12500|{level -> 2, posi...| Fired|      Ewa Kierownik|
|  7|     Adam|  Kowalski|[Git, CI/CD, Dock...| 10500|{level -> 3, posi..

In [7]:
emp.select(
    concat_ws(',', col('id'), col('first'), col('last'))
).show()

+-----------------------------+
|concat_ws(,, id, first, last)|
+-----------------------------+
|                 1,Adam,Nowak|
|               2,Jan,Kowalski|
|               3,Dominik,Bajt|
|                 4,Ewa,Piksel|
|         5,Krzysztof,Zależ...|
|              6,Ewa,Kierownik|
|              7,Adam,Kowalski|
|         8,Dominika,Prakty...|
|             9,Jan,Praktyczny|
|          10,Mikołaj,Sobieski|
+-----------------------------+



# 5.5. substring

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

spark = SparkSession.builder.appName("spark").getOrCreate()

In [9]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True)
    ]
)

In [10]:
emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active"),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired"),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active"),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired"),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New"),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active"),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New")
    ],
    schema
)

In [14]:
emp.select(
    substring(col('first'), 0, 2),
    col('first')[0:2]
).show()

+----------------------+----------------------+
|substring(first, 0, 2)|substring(first, 0, 2)|
+----------------------+----------------------+
|                    Ad|                    Ad|
|                    Ja|                    Ja|
|                    Do|                    Do|
|                    Ew|                    Ew|
|                    Kr|                    Kr|
|                    Ew|                    Ew|
|                    Ad|                    Ad|
|                    Do|                    Do|
|                    Ja|                    Ja|
|                    Mi|                    Mi|
+----------------------+----------------------+



In [17]:
emp.select(
    substring(col('first'), 5, 2),
    col('first')[5:2]
).show()

+----------------------+----------------------+
|substring(first, 5, 2)|substring(first, 5, 2)|
+----------------------+----------------------+
|                      |                      |
|                      |                      |
|                    ni|                    ni|
|                      |                      |
|                    sz|                    sz|
|                      |                      |
|                      |                      |
|                    ni|                    ni|
|                      |                      |
|                    ła|                    ła|
+----------------------+----------------------+



## 5.6. DateType i TimestampType

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

from datetime import datetime

spark = SparkSession.builder.appName("spark").getOrCreate()

In [27]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True),
        StructField("hire_date", DateType(), True),
        StructField("hire_timestamp", TimestampType(), True)
    ]
)

emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None, 
         datetime(2023, 5, 1), datetime(2023, 5, 1, 12, 0, 0)),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active", 
         datetime(2023, 5, 10), datetime(2023, 5, 10, 16, 0, 0)),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None, 
         datetime(2023, 5, 15), datetime(2023, 5, 15, 8, 0, 0)),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired", 
         datetime(2023, 6, 10), datetime(2023, 6, 1, 11, 0, 0)),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active", 
         datetime(2023, 6, 15), datetime(2023, 6, 15, 11, 30, 0)),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired", 
         datetime(2023, 6, 20), datetime(2023, 6, 20, 12, 0)),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 9, 0)),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None, 
         datetime(2023, 1, 30), datetime(2023, 1, 30, 7, 0)),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active", 
         datetime(2023, 3, 20), datetime(2023, 3, 20, 11, 45)),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 8, 35))
    ],
    schema
)

In [28]:
emp.show()

+---+---------+----------+--------------------+------+--------------------+------+----------+-------------------+
| id|    first|      last|              skills|salary|                role|status| hire_date|     hire_timestamp|
+---+---------+----------+--------------------+------+--------------------+------+----------+-------------------+
|  1|     Adam|     Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|2023-05-01|2023-05-01 12:00:00|
|  2|      Jan|  Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|2023-05-10|2023-05-10 16:00:00|
|  3|  Dominik|      Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|2023-05-15|2023-05-15 08:00:00|
|  4|      Ewa|    Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|2023-06-10|2023-06-01 11:00:00|
|  5|Krzysztof| Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|2023-06-15|2023-06-15 11:30:00|
|  6|      Ewa| Kierownik|[Azure, GCP, AWS,...| 12500|{level -> 2, posi...| Fired|2023-0

## 5.7. datediff

In [31]:
emp.select(
    datediff(col('hire_date'), lit(datetime(2023, 9, 1))),
    datediff(lit(datetime(2023, 9, 1)), col('hire_date')) 
).show()

+----------------------------------------------------+----------------------------------------------------+
|datediff(hire_date, TIMESTAMP '2023-09-01 00:00:00')|datediff(TIMESTAMP '2023-09-01 00:00:00', hire_date)|
+----------------------------------------------------+----------------------------------------------------+
|                                                -123|                                                 123|
|                                                -114|                                                 114|
|                                                -109|                                                 109|
|                                                 -83|                                                  83|
|                                                 -78|                                                  78|
|                                                 -73|                                                  73|
|                           

## 5.8. date_add/date_sub

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

from datetime import datetime

spark = SparkSession.builder.appName("spark").getOrCreate()

In [33]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True),
        StructField("hire_date", DateType(), True),
        StructField("hire_timestamp", TimestampType(), True)
    ]
)

emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None, 
         datetime(2023, 5, 1), datetime(2023, 5, 1, 12, 0, 0)),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active", 
         datetime(2023, 5, 10), datetime(2023, 5, 10, 16, 0, 0)),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None, 
         datetime(2023, 5, 15), datetime(2023, 5, 15, 8, 0, 0)),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired", 
         datetime(2023, 6, 10), datetime(2023, 6, 1, 11, 0, 0)),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active", 
         datetime(2023, 6, 15), datetime(2023, 6, 15, 11, 30, 0)),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired", 
         datetime(2023, 6, 20), datetime(2023, 6, 20, 12, 0)),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 9, 0)),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None, 
         datetime(2023, 1, 30), datetime(2023, 1, 30, 7, 0)),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active", 
         datetime(2023, 3, 20), datetime(2023, 3, 20, 11, 45)),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 8, 35))
    ],
    schema
)

In [36]:
emp.select(
    col('hire_date'),
    date_add(col('hire_date'), 10),
    date_sub(col('hire_date'), 30)
).show()

+----------+-----------------------+-----------------------+
| hire_date|date_add(hire_date, 10)|date_sub(hire_date, 30)|
+----------+-----------------------+-----------------------+
|2023-05-01|             2023-05-11|             2023-04-01|
|2023-05-10|             2023-05-20|             2023-04-10|
|2023-05-15|             2023-05-25|             2023-04-15|
|2023-06-10|             2023-06-20|             2023-05-11|
|2023-06-15|             2023-06-25|             2023-05-16|
|2023-06-20|             2023-06-30|             2023-05-21|
|2023-01-20|             2023-01-30|             2022-12-21|
|2023-01-30|             2023-02-09|             2022-12-31|
|2023-03-20|             2023-03-30|             2023-02-18|
|2023-01-20|             2023-01-30|             2022-12-21|
+----------+-----------------------+-----------------------+



In [37]:
emp.select(
    col('hire_date'),
    date_add(col('hire_date'), -10),
    date_sub(col('hire_date'), -30)
).show()

+----------+------------------------+------------------------+
| hire_date|date_add(hire_date, -10)|date_sub(hire_date, -30)|
+----------+------------------------+------------------------+
|2023-05-01|              2023-04-21|              2023-05-31|
|2023-05-10|              2023-04-30|              2023-06-09|
|2023-05-15|              2023-05-05|              2023-06-14|
|2023-06-10|              2023-05-31|              2023-07-10|
|2023-06-15|              2023-06-05|              2023-07-15|
|2023-06-20|              2023-06-10|              2023-07-20|
|2023-01-20|              2023-01-10|              2023-02-19|
|2023-01-30|              2023-01-20|              2023-03-01|
|2023-03-20|              2023-03-10|              2023-04-19|
|2023-01-20|              2023-01-10|              2023-02-19|
+----------+------------------------+------------------------+



## 5.9. Ekstrakcja danej jednostki czasu

In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

from datetime import datetime

spark = SparkSession.builder.appName("spark").getOrCreate()

In [45]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True),
        StructField("hire_date", DateType(), True),
        StructField("hire_timestamp", TimestampType(), True)
    ]
)

emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None, 
         datetime(2023, 5, 1), datetime(2023, 5, 1, 12, 0, 0)),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active", 
         datetime(2023, 5, 10), datetime(2023, 5, 10, 16, 0, 0)),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None, 
         datetime(2023, 5, 15), datetime(2023, 5, 15, 8, 0, 0)),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired", 
         datetime(2023, 6, 10), datetime(2023, 6, 1, 11, 0, 0)),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active", 
         datetime(2023, 6, 15), datetime(2023, 6, 15, 11, 30, 0)),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired", 
         datetime(2023, 6, 20), datetime(2023, 6, 20, 12, 0)),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 9, 0)),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None, 
         datetime(2023, 1, 30), datetime(2023, 1, 30, 7, 0)),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active", 
         datetime(2023, 3, 20), datetime(2023, 3, 20, 11, 45)),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 8, 35))
    ],
    schema
)

In [50]:
(
    emp
    .withColumn( 'year', year(col('hire_timestamp')) )
    .withColumn( 'day_of_week', dayofweek(col('hire_timestamp')) )
    .withColumn( 'week_of_year', weekofyear(col('hire_timestamp')) )
    .withColumn( 'hour', hour(col('hire_timestamp')) )
    .limit(3)
    .show()
)

+---+-------+--------+--------------------+------+--------------------+------+----------+-------------------+----+-----------+------------+----+
| id|  first|    last|              skills|salary|                role|status| hire_date|     hire_timestamp|year|day_of_week|week_of_year|hour|
+---+-------+--------+--------------------+------+--------------------+------+----------+-------------------+----+-----------+------------+----+
|  1|   Adam|   Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|2023-05-01|2023-05-01 12:00:00|2023|          2|          18|  12|
|  2|    Jan|Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|2023-05-10|2023-05-10 16:00:00|2023|          4|          19|  16|
|  3|Dominik|    Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|2023-05-15|2023-05-15 08:00:00|2023|          2|          20|   8|
+---+-------+--------+--------------------+------+--------------------+------+----------+-------------------+----+-----------+----

# Filtrowanie danych

## 6.1. Unikatowe wiersze

In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

from datetime import datetime

spark = SparkSession.builder.appName("spark").getOrCreate()

In [52]:
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("first", StringType(), False),
        StructField("last", StringType(), False),
        StructField("skills", ArrayType(StringType()), False),
        StructField("salary", IntegerType(), False),
        StructField("role", MapType(StringType(), StringType()), False),
        StructField("status", StringType(), True),
        StructField("hire_date", DateType(), True),
        StructField("hire_timestamp", TimestampType(), True)
    ]
)

emp = spark.createDataFrame(
    [
        (1, "Adam", "Nowak", ["SQL", "Java", "GCP"], 3500, {"position": "Java Developer", "level": "1"}, None, 
         datetime(2023, 5, 1), datetime(2023, 5, 1, 12, 0, 0)),
        (2, "Jan", "Kowalski", ["SQL", "Java", "Azure", "Spring"], 8000, {"position": "Java Developer", "level": "3"}, "Active", 
         datetime(2023, 5, 10), datetime(2023, 5, 10, 16, 0, 0)),
        (3, "Dominik", "Bajt", ["Python", "MongoDB", "Redis"], 4000, {"position": "Data Developer", "level": "1"}, None, 
         datetime(2023, 5, 15), datetime(2023, 5, 15, 8, 0, 0)),
        (4, "Ewa", "Piksel", ["SQL", "Python", "Pandas", ], 4100, {"position": "Data Scientist", "level": "1"}, "Fired", 
         datetime(2023, 6, 10), datetime(2023, 6, 1, 11, 0, 0)),
        (5, "Krzysztof", "Zależność", ["Git", "CI/CD", "Docker"], 8000, {"position": "DevOps", "level": "2"}, "Active", 
         datetime(2023, 6, 15), datetime(2023, 6, 15, 11, 30, 0)),
        (6, "Ewa", "Kierownik", ["Azure", "GCP", "AWS", "Linux"], 12500, {"position": "Cloud Architect", "level": "2"}, "Fired", 
         datetime(2023, 6, 20), datetime(2023, 6, 20, 12, 0)),
        (7, "Adam", "Kowalski", ["Git", "CI/CD", "Docker", "Linux", "Kubernetes"], 10500, {"position": "DevOps", "level": "3"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 9, 0)),
        (8, "Dominika", "Praktyczna", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, None, 
         datetime(2023, 1, 30), datetime(2023, 1, 30, 7, 0)),
        (9, "Jan", "Praktyczny", ["SQL", "Java", "Python"], 3000, {"position": "Intern", "level": "0"}, "Active", 
         datetime(2023, 3, 20), datetime(2023, 3, 20, 11, 45)),
        (10, "Mikołaj", "Sobieski", ["Python", "Django", "Flask"], 7500, {"position": "Python Developer", "level": "1"}, "New", 
         datetime(2023, 1, 20), datetime(2023, 1, 20, 8, 35))
    ],
    schema
)

In [55]:
emp.printSchema()

root
 |-- id: integer (nullable = false)
 |-- first: string (nullable = false)
 |-- last: string (nullable = false)
 |-- skills: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- salary: integer (nullable = false)
 |-- role: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- status: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- hire_timestamp: timestamp (nullable = true)



In [56]:
games = spark.read.csv('data/Games.csv', header=True).select(col('Series'), col('Developer'), col('Publisher')).orderBy(col('Developer'))

In [60]:
games.limit(10).show(truncate=False)

+------------------+----------------------+-----------------------+
|Series            |Developer             |Publisher              |
+------------------+----------------------+-----------------------+
|Duke Nukem        |3D Realms             |GT Interactive Software|
|NULL              |Amanita Design        |Amanita Design         |
|Guild Wars        |ArenaNet              |NCsoft                 |
|Guild Wars        |ArenaNet              |NCsoft                 |
|NULL              |Arrowhead Game Studios|Paradox Interactive    |
|The Patrician     |Ascaron               |Encore                 |
|Sacred            |Ascaron               |Encore                 |
|Baldur's Gate     |BioWare               |Interplay Entertainment|
|Baldur's Gate     |BioWare               |Interplay Entertainment|
|Neverwinter Nights|BioWare               |Infogrames / Atari     |
+------------------+----------------------+-----------------------+



In [63]:
games.distinct().orderBy(col('Developer')).limit(10).show(truncate=False)

+------------------+----------------------+-----------------------+
|Series            |Developer             |Publisher              |
+------------------+----------------------+-----------------------+
|Duke Nukem        |3D Realms             |GT Interactive Software|
|NULL              |Amanita Design        |Amanita Design         |
|Guild Wars        |ArenaNet              |NCsoft                 |
|NULL              |Arrowhead Game Studios|Paradox Interactive    |
|Sacred            |Ascaron               |Encore                 |
|The Patrician     |Ascaron               |Encore                 |
|Neverwinter Nights|BioWare               |Infogrames / Atari     |
|Baldur's Gate     |BioWare               |Interplay Entertainment|
|Diablo            |Blizzard Entertainment|Blizzard Entertainment |
|Warcraft          |Blizzard Entertainment|Blizzard Entertainment |
+------------------+----------------------+-----------------------+



In [64]:
games.dropDuplicates().orderBy(col('Developer')).limit(10).show(truncate=False)

+------------------+----------------------+-----------------------+
|Series            |Developer             |Publisher              |
+------------------+----------------------+-----------------------+
|Duke Nukem        |3D Realms             |GT Interactive Software|
|NULL              |Amanita Design        |Amanita Design         |
|Guild Wars        |ArenaNet              |NCsoft                 |
|NULL              |Arrowhead Game Studios|Paradox Interactive    |
|Sacred            |Ascaron               |Encore                 |
|The Patrician     |Ascaron               |Encore                 |
|Neverwinter Nights|BioWare               |Infogrames / Atari     |
|Baldur's Gate     |BioWare               |Interplay Entertainment|
|Diablo            |Blizzard Entertainment|Blizzard Entertainment |
|Warcraft          |Blizzard Entertainment|Blizzard Entertainment |
+------------------+----------------------+-----------------------+



In [65]:
games.dropDuplicates(['Series']).orderBy(col('Developer')).limit(10).show(truncate=False)

+------------------+----------------------+-----------------------+
|Series            |Developer             |Publisher              |
+------------------+----------------------+-----------------------+
|Duke Nukem        |3D Realms             |GT Interactive Software|
|NULL              |Amanita Design        |Amanita Design         |
|Guild Wars        |ArenaNet              |NCsoft                 |
|The Patrician     |Ascaron               |Encore                 |
|Sacred            |Ascaron               |Encore                 |
|Baldur's Gate     |BioWare               |Interplay Entertainment|
|Neverwinter Nights|BioWare               |Infogrames / Atari     |
|StarCraft         |Blizzard Entertainment|Blizzard Entertainment |
|Warcraft          |Blizzard Entertainment|Blizzard Entertainment |
|Diablo            |Blizzard Entertainment|Blizzard Entertainment |
+------------------+----------------------+-----------------------+



## 6.2. Filtrowanie danych cz. 1

In [None]:
# data/best_selling_books.csv

In [66]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DateType, TimestampType
from pyspark.sql.functions import (
    col, size, lit, explode, 
    concat, concat_ws, substring, 
    datediff, date_add, date_sub,
    year, month, dayofmonth, dayofweek, dayofyear, weekofyear, 
    hour, minute, second
)

from datetime import datetime

spark = SparkSession.builder.appName("spark").getOrCreate()

In [67]:
books = spark.read.csv('data/best_selling_books.csv', header=True)

In [68]:
books.limit(5).show()

+--------------------+--------------------+-----------------+---------------+-----------------------------+------------------+
|                Book|           Author(s)|Original language|First published|Approximate sales in millions|             Genre|
+--------------------+--------------------+-----------------+---------------+-----------------------------+------------------+
|A Tale of Two Cities|     Charles Dickens|          English|           1859|                          200|Historical fiction|
|The Little Prince...|Antoine de Saint-...|           French|           1943|                          200|           Novella|
|Harry Potter and ...|       J. K. Rowling|          English|           1997|                          120|           Fantasy|
|And Then There We...|     Agatha Christie|          English|           1939|                          100|           Mystery|
|Dream of the Red ...|          Cao Xueqin|          Chinese|           1791|                          100|    

In [69]:
books.filter(
    col('Original language') == 'Portuguese'
    
).show()

+--------------------+------------+-----------------+---------------+-----------------------------+-------+
|                Book|   Author(s)|Original language|First published|Approximate sales in millions|  Genre|
+--------------------+------------+-----------------+---------------+-----------------------------+-------+
|The Alchemist (O ...|Paulo Coelho|       Portuguese|           1988|                           65|Fantasy|
+--------------------+------------+-----------------+---------------+-----------------------------+-------+



In [72]:
books.filter(
    col('First published') > 2010
    
).show(truncate=False)

+-----------------------------------+--------------+-----------------+---------------+-----------------------------+-----------------------------+
|Book                               |Author(s)     |Original language|First published|Approximate sales in millions|Genre                        |
+-----------------------------------+--------------+-----------------+---------------+-----------------------------+-----------------------------+
|The Fault in Our Stars             |John Green    |English          |2012           |23                           |Young adult romantic novel   |
|The Girl on the Train              |Paula Hawkins |English          |2015           |23                           |Thriller                     |
|Gone Girl                          |Gillian Flynn |English          |2012           |20                           |Crime thriller novel         |
|Where the Crawdads Sing            |Delia Owens   |English          |2018           |18                           |Co

In [87]:
emp.limit(5).show()

+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
| id|    first|     last|              skills|salary|                role|status| hire_date|     hire_timestamp|
+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
|  1|     Adam|    Nowak|    [SQL, Java, GCP]|  3500|{level -> 1, posi...|  NULL|2023-05-01|2023-05-01 12:00:00|
|  2|      Jan| Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|2023-05-10|2023-05-10 16:00:00|
|  3|  Dominik|     Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|2023-05-15|2023-05-15 08:00:00|
|  4|      Ewa|   Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|2023-06-10|2023-06-01 11:00:00|
|  5|Krzysztof|Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|2023-06-15|2023-06-15 11:30:00|
+---+---------+---------+--------------------+------+--------------------+------+----------+----

Zapis funkcyjny

In [76]:
emp.filter(
    size(col('skills')) > 3
).show(truncate=False)

+---+-----+---------+---------------------------------------+------+-----------------------------------------+------+----------+-------------------+
|id |first|last     |skills                                 |salary|role                                     |status|hire_date |hire_timestamp     |
+---+-----+---------+---------------------------------------+------+-----------------------------------------+------+----------+-------------------+
|2  |Jan  |Kowalski |[SQL, Java, Azure, Spring]             |8000  |{level -> 3, position -> Java Developer} |Active|2023-05-10|2023-05-10 16:00:00|
|6  |Ewa  |Kierownik|[Azure, GCP, AWS, Linux]               |12500 |{level -> 2, position -> Cloud Architect}|Fired |2023-06-20|2023-06-20 12:00:00|
|7  |Adam |Kowalski |[Git, CI/CD, Docker, Linux, Kubernetes]|10500 |{level -> 3, position -> DevOps}         |New   |2023-01-20|2023-01-20 09:00:00|
+---+-----+---------+---------------------------------------+------+--------------------------------------

Zapis SQL

In [86]:
emp.filter(
    "last == 'Kowalski' "
).show()

+---+-----+--------+--------------------+------+--------------------+------+----------+-------------------+
| id|first|    last|              skills|salary|                role|status| hire_date|     hire_timestamp|
+---+-----+--------+--------------------+------+--------------------+------+----------+-------------------+
|  2|  Jan|Kowalski|[SQL, Java, Azure...|  8000|{level -> 3, posi...|Active|2023-05-10|2023-05-10 16:00:00|
|  7| Adam|Kowalski|[Git, CI/CD, Dock...| 10500|{level -> 3, posi...|   New|2023-01-20|2023-01-20 09:00:00|
+---+-----+--------+--------------------+------+--------------------+------+----------+-------------------+



In [91]:
emp.filter(
    "hire_date > '2023-05-10' "
).show()

+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
| id|    first|     last|              skills|salary|                role|status| hire_date|     hire_timestamp|
+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
|  3|  Dominik|     Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|2023-05-15|2023-05-15 08:00:00|
|  4|      Ewa|   Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|2023-06-10|2023-06-01 11:00:00|
|  5|Krzysztof|Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|2023-06-15|2023-06-15 11:30:00|
|  6|      Ewa|Kierownik|[Azure, GCP, AWS,...| 12500|{level -> 2, posi...| Fired|2023-06-20|2023-06-20 12:00:00|
+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+



In [92]:
emp.filter(
    col('hire_date') > datetime(2023, 5, 10)
).show()

+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
| id|    first|     last|              skills|salary|                role|status| hire_date|     hire_timestamp|
+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+
|  3|  Dominik|     Bajt|[Python, MongoDB,...|  4000|{level -> 1, posi...|  NULL|2023-05-15|2023-05-15 08:00:00|
|  4|      Ewa|   Piksel|[SQL, Python, Pan...|  4100|{level -> 1, posi...| Fired|2023-06-10|2023-06-01 11:00:00|
|  5|Krzysztof|Zależność|[Git, CI/CD, Docker]|  8000|{level -> 2, posi...|Active|2023-06-15|2023-06-15 11:30:00|
|  6|      Ewa|Kierownik|[Azure, GCP, AWS,...| 12500|{level -> 2, posi...| Fired|2023-06-20|2023-06-20 12:00:00|
+---+---------+---------+--------------------+------+--------------------+------+----------+-------------------+



## 6.3. Filtrowanie danych cz. 2