In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz
!tar xf spark-3.1.3-bin-hadoop3.2.tgz
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.3-bin-hadoop3.2"
!pip install -q findspark
import findspark

findspark.init()

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset is read from there later.
drive.mount("/content/drive")

Mounted at /content/drive


- Прочитайте главы 4-6 из книги "Spark: The Definitive Guide".
- Загрузите датасет из предыдущей лабораторной работы:
  - https://www.kaggle.com/datasets/sveta151/tiktok-popular-songs-2022
- Выполните задания.


In [3]:
# Create the SparkSession (local mode, a single worker thread).
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkLab2")
    .getOrCreate()
)

# Load the dataset: the first CSV line is the header, column types are inferred.
tiktokData0 = spark.read.csv(
    "/content/drive/MyDrive/TikTok_songs_2022.csv",
    header=True,
    inferSchema=True,
)

№1: В столбце "loudness" переведите значения из дБ в проценты громкости. Выведите топ-10 самых громких песен.

| track_name           | album                | artist_name         | loudness |
| -------------------- | -------------------- | ------------------- | -------- |
| Astronomia           | Astronomia           | Vicetone            | 55       |
| Sweater Weather      | I Love You.          | The Neighbourhood   | 52       |
| Dandelions           | Safe Haven           | Ruth B.             | 51       |
| 1, 2, 3 (feat. Ja... | 1, 2, 3 (feat. Ja... | Sofía Reyes         | 49       |
| Beauty And A Beat    | Beauty And A Beat    | Other               | 49       |
| Being Good Is Boring | Being Good Is Boring | Other               | 48       |
| Wellerman - Sea S... | Wellerman (Sea Sh... | Nathan Evans        | 47       |
| Thot Shit            | Thot Shit            | Megan Thee Stallion | 45       |
| Friday (feat. Muf... | Friday (feat. Muf... | Riton               | 45       |
| Hawái                | PAPI JUANCHO         | Maluma              | 45       |


In [4]:
from pyspark.sql.functions import desc
from pyspark.sql.functions import col
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType

# Replace loudness (dB) with a percentage: 10^(dB / 10) * 100, rounded and
# stored as an integer. `round` here is pyspark.sql.functions.round.
tiktokData1 = tiktokData0.withColumn(
    "loudness",
    round(10 ** (col("loudness") / 10) * 100).cast(IntegerType()),
)

# Show the ten loudest tracks.
(
    tiktokData1.orderBy(col("loudness").desc())
    .select("track_name", "album", "artist_name", "loudness")
    .show(10)
)

+--------------------+--------------------+-------------------+--------+
|          track_name|               album|        artist_name|loudness|
+--------------------+--------------------+-------------------+--------+
|          Astronomia|          Astronomia|           Vicetone|      55|
|     Sweater Weather|         I Love You.|  The Neighbourhood|      52|
|          Dandelions|          Safe Haven|            Ruth B.|      51|
|1, 2, 3 (feat. Ja...|1, 2, 3 (feat. Ja...|        Sofía Reyes|      49|
|   Beauty And A Beat|   Beauty And A Beat|             Glamii|      49|
|Being Good Is Boring|Being Good Is Boring|          Jena Rose|      48|
|Wellerman - Sea S...|Wellerman (Sea Sh...|       Nathan Evans|      47|
|           Thot Shit|           Thot Shit|Megan Thee Stallion|      45|
|Friday (feat. Muf...|Friday (feat. Muf...|              Riton|      45|
|               Hawái|        PAPI JUANCHO|             Maluma|      45|
+--------------------+--------------------+-------------------+--------+

№2: Получите имена соисполнителей из названий песен. Вынесите их в отдельный столбец. Удалите информацию о соисполнителях из названий треков.

| track_name                                      | artist_name       | feat                        |
| ----------------------------------------------- | ----------------- | --------------------------- |
| INDUSTRY BABY                                   | Lil Nas X         | Jack Harlow                 |
| Left and Right (Charlie Puth) - Sped Up Version | sped up nightcore | Jung Kook of BTS            |
| Bam Bam                                         | Camila Cabello    | Ed Sheeran                  |
| Down Under                                      | Luude             | Colin Hay                   |
| 1, 2, 3                                         | Sofía Reyes       | Jason Derulo & De La Ghetto |
| Chaa Chaa Chaa                                  | Girll Codee       | HoodCelebrityy              |

<div style="text-align: center"> only showing top 6 rows </div>


In [5]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import regexp_replace

# Pull featured artists out of the track title into a "feat" column and strip
# the "(feat. ...)" / "[feat. ...]" annotation from the title itself.
#
# Pattern notes:
# - "feat\." escapes the dot, so only a literal "feat." matches (an unescaped
#   "." is a wildcard and would also accept e.g. "feat, " or "featX").
# - "[^)\]]+" captures everything up to the closing bracket instead of
#   "[\w\s&]+": in Java regex \w is ASCII-only by default, so names containing
#   accents, hyphens, commas, etc. were cut short and left debris in the title.
# - The annotation is removed only when it is wrapped in () or [].
tiktokData2 = tiktokData1.withColumn(
    "feat", regexp_extract(tiktokData1.track_name, r"feat\. ([^)\]]+)", 1)
).withColumn(
    "track_name",
    regexp_replace(tiktokData1.track_name, r" [(\[]feat\. [^)\]]+[)\]]", ""),
)

# regexp_extract yields "" when there is no match, so keep only rows that
# actually have a featured artist.
tiktokData2.select("track_name", "artist_name", "feat").where(
    tiktokData2.feat != ""
).show(6, False)

+-----------------------------------------------+-----------------+---------------------------+
|track_name                                     |artist_name      |feat                       |
+-----------------------------------------------+-----------------+---------------------------+
|INDUSTRY BABY                                  |Lil Nas X        |Jack Harlow                |
|Left and Right (Charlie Puth) - Sped Up Version|sped up nightcore|Jung Kook of BTS           |
|Bam Bam                                        |Camila Cabello   |Ed Sheeran                 |
|Down Under                                     |Luude            |Colin Hay                  |
|1, 2, 3                                        |Sofía Reyes      |Jason Derulo & De La Ghetto|
|Chaa Chaa Chaa                                 |Girll Codee      |HoodCelebrityy             |
+-----------------------------------------------+-----------------+---------------------------+
only showing top 6 rows



№3: Разделите строки с соисполнителями по символу '&'. Для каждой песни соберите всех исполнителей в один массив. Отсортируйте песни по количеству исполнителей и названиям песен.

| track_name                        | artist_names                              |
| --------------------------------- | ----------------------------------------- |
| 1, 2, 3                           | [Sofía Reyes, Jason Derulo, De La Ghetto] |
| For The Night                     | [Pop Smoke, Lil Baby, DaBaby]             |
| Friday - Dopamine Re-Edit         | [Riton, Mufasa, Hypeman]                  |
| Peaches                           | [Justin Bieber, Daniel Caesar, Giveon]    |
| Bam Bam                           | [Camila Cabello, Ed Sheeran]              |
| Banana - DJ FLe - Minisiren Remix | [Conkarah, Shaggy]                        |

<div style="text-align: center"> only showing top 6 rows </div>


In [6]:
from pyspark.sql.functions import split
from pyspark.sql.functions import array_union
from pyspark.sql.functions import array
from pyspark.sql.functions import size
from pyspark.sql.functions import when
from pyspark.sql.functions import filter


# Collect every performer of a track into one array column:
# split the main artist string and the featured string on " & ", merge them
# without duplicates (array_union), then drop the empty string that splitting
# an empty "feat" value produces. `filter` is pyspark.sql.functions.filter.
tiktokData3 = tiktokData2.withColumn(
    "artist_names",
    filter(
        array_union(
            split(col("artist_name"), " & "),
            split(col("feat"), " & "),
        ),
        lambda name: name != "",
    ),
)

# Most performers first; ties are ordered by track name.
(
    tiktokData3.select("track_name", "artist_names")
    .orderBy(desc(size(col("artist_names"))), col("track_name"))
    .show(6, False)
)

+-------------------------+-----------------------------------------+
|track_name               |artist_names                             |
+-------------------------+-----------------------------------------+
|1, 2, 3                  |[Sofía Reyes, Jason Derulo, De La Ghetto]|
|Don't Rush               |[Young T, Bugsey, Headie One]            |
|For The Night            |[Pop Smoke, Lil Baby, DaBaby]            |
|Friday - Dopamine Re-Edit|[Riton, Mufasa, Hypeman]                 |
|Peaches                  |[Justin Bieber, Daniel Caesar, Giveon]   |
|Bam Bam                  |[Camila Cabello, Ed Sheeran]             |
+-------------------------+-----------------------------------------+
only showing top 6 rows



№4: Выведите список песен Doja Cat.

| track_name   | album                | artist_names         |
| ------------ | -------------------- | -------------------- |
| Woman        | Planet Her           | [Doja Cat]           |
| Kiss Me More | Kiss Me More (fea... | [Doja Cat, SZA]      |
| Need to Know | Planet Her           | [Doja Cat]           |
| Ain't Shit   | Planet Her           | [Doja Cat]           |
| You Right    | Planet Her           | [Doja Cat]           |
| Best Friend  | Best Friend (feat... | [Saweetie, Doja Cat] |
| Freak        | Freak                | [Doja Cat]           |
| Boss Bitch   | Boss Bitch           | [Doja Cat]           |
| Say So       | Hot Pink             | [Doja Cat]           |


In [7]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import array_contains

# Tracks on which Doja Cat appears, either as main or as featured artist.
(
    tiktokData3.select("track_name", "album", "artist_names")
    .filter(array_contains(col("artist_names"), "Doja Cat"))
    .show(truncate=False)
)

# One row per (track, performer): explode the performer array while keeping
# the track-level popularity columns.
tiktokData4 = tiktokData3.select(
    col("track_name"),
    explode(col("artist_names")).alias("artist_name"),
    col("track_pop"),
    col("artist_pop"),
)

+------------+---------------------------------------+--------------------+
|track_name  |album                                  |artist_names        |
+------------+---------------------------------------+--------------------+
|Woman       |Planet Her                             |[Doja Cat]          |
|Kiss Me More|Kiss Me More (feat. SZA)               |[Doja Cat, SZA]     |
|Need to Know|Planet Her                             |[Doja Cat]          |
|Ain't Shit  |Planet Her                             |[Doja Cat]          |
|You Right   |Planet Her                             |[Doja Cat]          |
|Best Friend |Best Friend (feat. Doja Cat) [Remix EP]|[Saweetie, Doja Cat]|
|Freak       |Freak                                  |[Doja Cat]          |
|Boss Bitch  |Boss Bitch                             |[Doja Cat]          |
|Say So      |Hot Pink                               |[Doja Cat]          |
+------------+---------------------------------------+--------------------+



№5: Выведите таблицу с исполнителями и количеством их треков в порядке уменьшения.

| artist_name    | count |
| -------------- | ----- |
| Doja Cat       | 9     |
| Coopex         | 6     |
| Dame Dame      | 5     |
| Alex Alexander | 4     |
| YES YES        | 4     |
| Lil Nas X      | 4     |

<div style="text-align: center"> only showing top 6 rows </div>


In [8]:
# Number of tracks per performer, most prolific first; ties broken by name.
(
    tiktokData4.groupBy("artist_name")
    .count()
    .sort(col("count").desc(), col("artist_name").asc())
    .show(n=6, truncate=False)
)

+--------------+-----+
|artist_name   |count|
+--------------+-----+
|Doja Cat      |9    |
|Coopex        |6    |
|Dame Dame     |5    |
|Alex Alexander|4    |
|DaBaby        |4    |
|Drake         |4    |
+--------------+-----+
only showing top 6 rows



№6: Замените имена всех исполнителей, популярность которых неизвестна или меньше 50, на "Other". Отсортируйте треки по количеству исполнителей и популярности.

| track_name                | artist_names                    | track_pop |
| ------------------------- | ------------------------------- | --------- |
| Peaches                   | [Justin Bieber, Other, Other]   | 86        |
| Friday - Dopamine Re-Edit | [Riton, Other, Other]           | 83        |
| 1, 2, 3                   | [Sofía Reyes, Other, Other]     | 82        |
| For The Night             | [Pop Smoke, Other, DaBaby]      | 80        |
| Jimmy Cooks               | [Drake, Other]                  | 92        |
| INDUSTRY BABY             | [Lil Nas X, Jack Harlow]        | 86        |
| Levitating                | [Dua Lipa, DaBaby]              | 85        |
| Bam Bam                   | [Camila Cabello, Ed Sheeran]    | 83        |
| Beautiful Mistakes        | [Maroon 5, Megan Thee Stallion] | 82        |
| Kiss Me More              | [Doja Cat, SZA]                 | 82        |

<div style="text-align: center"> only showing top 10 rows </div>


In [9]:
from pyspark.sql.functions import collect_list

# Names of artists whose popularity is known and at least 50.
# The task replaces artists with popularity *below* 50, so the boundary value
# 50 must be kept (the previous "> 50" filter wrongly replaced it).
# Rows with a null artist_pop fail the comparison and are therefore excluded.
# Filtering happens before the projection, and distinct() keeps the collected
# list (and the resulting IN-list) free of duplicates.
popular_artists = [
    row.artist_name
    for row in tiktokData0.where("artist_pop >= 50")
    .select("artist_name")
    .distinct()
    .collect()
]

# Keep popular artists, replace everyone else with "Other".
# NOTE(review): popularity is looked up by the main-artist string of the raw
# dataset, so performers that never appear there as artist_name (e.g. only as
# a featured artist) count as "unknown" and become "Other".
tiktokData5 = tiktokData4.withColumn(
    "artist_name",
    when(
        tiktokData4.artist_name.isin(popular_artists),
        tiktokData4.artist_name,
    ).otherwise("Other"),
)

# Re-assemble the performer list per track and sort by number of performers,
# then by track popularity (both descending).
# NOTE(review): collect_list gives no ordering guarantee after the shuffle, so
# the order of names inside each array may vary between runs.
tiktokData5.groupBy("track_name", "track_pop").agg(
    collect_list("artist_name").alias("artist_names")
).select("track_name", "artist_names", "track_pop").orderBy(
    desc(size("artist_names")), desc("track_pop")
).show(
    10, False
)

+-------------------------+-----------------------------------------+---------+
|track_name               |artist_names                             |track_pop|
+-------------------------+-----------------------------------------+---------+
|Peaches                  |[Justin Bieber, Daniel Caesar, Giveon]   |86       |
|Friday - Dopamine Re-Edit|[Riton, Mufasa, Hypeman]                 |83       |
|1, 2, 3                  |[Sofía Reyes, Jason Derulo, De La Ghetto]|82       |
|For The Night            |[Pop Smoke, Lil Baby, DaBaby]            |80       |
|Don't Rush               |[Young T, Bugsey, Headie One]            |63       |
|Jimmy Cooks              |[Drake, 21 Savage]                       |92       |
|INDUSTRY BABY            |[Lil Nas X, Jack Harlow]                 |86       |
|Levitating               |[Dua Lipa, DaBaby]                       |85       |
|Bam Bam                  |[Camila Cabello, Ed Sheeran]             |83       |
|Kiss Me More             |[Doja Cat, SZ