In [231]:
from tokenize import Number
import pyspark
import pyspark.sql.functions
from pyspark.sql.functions import desc
from pyspark.sql.functions import col
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import regexp_replace
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import array_union
from pyspark.sql.functions import array
from pyspark.sql.functions import size
from pyspark.sql.functions import when


spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkLab2') \
                    .getOrCreate()

tiktokData2022 = spark\
  .read\
  .option("inferSchema", "true")\
  .option("header", "true")\
  .csv("/home/student/Desktop/TikTok_songs_2022.csv")

tiktokData2022.createOrReplaceTempView("TikTok_songs_2022")

In [279]:
dataFrameWay = tiktokData2022\
  .select("track_name","album","artist_name",round(10 ** (col("loudness") * 0.1) * 100).alias("loudness"))\
  .orderBy(desc("loudness"))\

dataFrameWay.show(10)

+--------------------+--------------------+-------------------+--------+
|          track_name|               album|        artist_name|loudness|
+--------------------+--------------------+-------------------+--------+
|          Astronomia|          Astronomia|           Vicetone|    55.0|
|     Sweater Weather|         I Love You.|  The Neighbourhood|    52.0|
|          Dandelions|          Safe Haven|            Ruth B.|    51.0|
|1, 2, 3 (feat. Ja...|1, 2, 3 (feat. Ja...|        Sofía Reyes|    49.0|
|   Beauty And A Beat|   Beauty And A Beat|             Glamii|    49.0|
|Being Good Is Boring|Being Good Is Boring|          Jena Rose|    48.0|
|Wellerman - Sea S...|Wellerman (Sea Sh...|       Nathan Evans|    47.0|
|           Thot Shit|           Thot Shit|Megan Thee Stallion|    45.0|
|Friday (feat. Muf...|Friday (feat. Muf...|              Riton|    45.0|
|               Hawái|        PAPI JUANCHO|             Maluma|    45.0|
+--------------------+--------------------+--------

In [281]:
dataFrameWay1 = tiktokData2022\
  .select(regexp_replace('track_name', '[(\[]feat. [^)]*[\)]', "").alias('track_names'), 'artist_name' ,regexp_replace('track_name', '(([^\$]*)feat. )', "").alias('feat'))\
  .select('track_names', 'artist_name',regexp_replace('feat', '([\]\)])+([^\$]*)', "").alias('feat'))\
  .where(col("track_name").like("%feat.%"))

dataFrameWay1.show(1000)

+--------------------+-----------------+--------------------+
|         track_names|      artist_name|                feat|
+--------------------+-----------------+--------------------+
|      INDUSTRY BABY |        Lil Nas X|         Jack Harlow|
|Left and Right (C...|sped up nightcore|    Jung Kook of BTS|
|            Bam Bam |   Camila Cabello|          Ed Sheeran|
|         Down Under |            Luude|           Colin Hay|
|            1, 2, 3 |      Sofía Reyes|Jason Derulo & De...|
|     Chaa Chaa Chaa |      Girll Codee|      HoodCelebrityy|
|     Running To You |            VINAI|               Caden|
|SAD GIRLZ LUV MON...|          Amaarae|Kali Uchis and Moliy|
|            edamame |            bbno$|          Rich Brian|
|       Kiss Me More |         Doja Cat|                 SZA|
| Beautiful Mistakes |         Maroon 5| Megan Thee Stallion|
|            Peaches |    Justin Bieber|Daniel Caesar & G...|
|         Levitating |         Dua Lipa|              DaBaby|
|       

In [276]:
dataFrameWay2 = tiktokData2022\

dataFrameWay2 = dataFrameWay1.select('track_names', array_union(split('artist_name', '&'), split('feat', '&')).alias('artists_name')) \
  .orderBy(desc(size('artists_name')), 'track_names')\
  
dataFrameWay2.show(1000)

+--------------------+--------------------+
|         track_names|        artists_name|
+--------------------+--------------------+
|            1, 2, 3 |[Sofía Reyes, Jas...|
|         Don't Rush |[Young T ,  Bugse...|
|      For The Night |[Pop Smoke, Lil B...|
|Friday  - Dopamin...|[Riton, Mufasa , ...|
|            Peaches |[Justin Bieber, D...|
|            Bam Bam |[Camila Cabello, ...|
|Banana  - DJ FLe ...|  [Conkarah, Shaggy]|
| Beautiful Mistakes |[Maroon 5, Megan ...|
|        Best Friend |[Saweetie, Doja Cat]|
|            Bundles |[Kayla Nicole, Ta...|
|     Chaa Chaa Chaa |[Girll Codee, Hoo...|
|         Down Under |  [Luude, Colin Hay]|
|    FLIP THE SWITCH |      [Quavo, Drake]|
|Girls Like You  -...| [Maroon 5, Cardi B]|
|            Gravity |[Brent Faiyaz, Ty...|
|      INDUSTRY BABY |[Lil Nas X, Jack ...|
|         Intentions |[Justin Bieber, Q...|
|        Jimmy Cooks |  [Drake, 21 Savage]|
|       Kiss Me More |     [Doja Cat, SZA]|
|Left and Right (C...|[sped up n

In [575]:
from pyspark.sql.functions import explode

dataFrameWay3 = tiktokData2022\
.select('album', 'track_pop', 'artist_pop',regexp_replace( 'track_name', '[(\[]feat. [^)]*[\)]', "").alias('track_names'), 'artist_name' ,regexp_replace('track_name', '(([^\$]*)feat. )', "").alias('feat'))\
.select('track_names','track_pop','artist_pop','album','artist_name',regexp_replace('feat', '([\]\)])+([^\$]*)', "").alias('feat'))\
.select('track_names','track_pop','artist_pop','album','artist_name',array_union(split('artist_name', '&'), split('feat', '&')).alias('artists_name'))

dataFrameWay3

dataFrameWay4 = tiktokData2022\

dataFrameWay4 = dataFrameWay3.select('track_names',"album",'artists_name',explode('artists_name').alias('artists'))\
.where(col('artists').like('%Doja Cat%'))

dataFrameWay4.show(6)

+-------------+--------------------+--------------------+--------+
|  track_names|               album|        artists_name| artists|
+-------------+--------------------+--------------------+--------+
|        Woman|          Planet Her|   [Doja Cat, Woman]|Doja Cat|
|Kiss Me More |Kiss Me More (fea...|     [Doja Cat, SZA]|Doja Cat|
| Need to Know|          Planet Her|[Doja Cat, Need t...|Doja Cat|
|   Ain't Shit|          Planet Her|[Doja Cat, Ain't ...|Doja Cat|
|    You Right|          Planet Her|[Doja Cat, You Ri...|Doja Cat|
| Best Friend |Best Friend (feat...|[Saweetie, Doja Cat]|Doja Cat|
+-------------+--------------------+--------------------+--------+
only showing top 6 rows



In [353]:
from pyspark.sql.functions import explode

dataFrameWay5 = tiktokData2022\

dataFrameWay5 = dataFrameWay3.select(explode('artists_name').alias('artists'))\
.groupBy("artists")\
.agg(count("artists").alias("count"))\
.sort(desc("count"))

dataFrameWay5.show(6)

+-------------------+-----+
|            artists|count|
+-------------------+-----+
|           Doja Cat|    9|
|             Coopex|    6|
|          Dame Dame|    5|
|Megan Thee Stallion|    4|
|            YES YES|    4|
|          Lil Nas X|    4|
+-------------------+-----+
only showing top 6 rows



In [600]:

dataFrameWay6 = dataFrameWay3\

dataFrameWay6 = dataFrameWay3.select('track_names','artists_name','track_pop')\
.where('track_pop >= 50')\
.withColumn('track_pops', col('track_pop'))

dataFrameWay7 = dataFrameWay3.select('track_names','artists_name','track_pop')\
.where('track_pop < 50')\
.withColumn('track_pops', when(col('track_pop') < 50 , 'other'))

df67 = dataFrameWay6.unionByName(dataFrameWay7)

df67.sort(desc(size('artists_name')), desc('track_pop')).select('track_names','artists_name','track_pops').show(10)


+--------------------+--------------------+----------+
|         track_names|        artists_name|track_pops|
+--------------------+--------------------+----------+
|            Peaches |[Justin Bieber, D...|        86|
|Friday  - Dopamin...|[Riton, Mufasa , ...|        83|
|            1, 2, 3 |[Sofía Reyes, Jas...|        82|
|       Jiggle Jiggle|[Duke ,  Jones, J...|        81|
|      For The Night |[Pop Smoke, Lil B...|        80|
|         Don't Rush |[Young T ,  Bugse...|        63|
|         Wild & Free|[Nander, Wild ,  ...|     other|
|             We Know|[JAM ,  Philly, W...|     other|
|       Fever - Remix|[XY, O, Fever - R...|     other|
|    Looking for Love|[Asketa ,  Natan ...|     other|
+--------------------+--------------------+----------+
only showing top 10 rows

