In [1]:
import pyspark as ps
import pyspark.sql.functions as f
from pyspark import SQLContext
from pyspark.sql.types import IntegerType, DateType, TimestampType
from datetime import datetime

# Build (or reuse) a SparkSession that runs locally on four threads.
spark = ps.sql.SparkSession.builder.master("local[4]").appName("sparkSQL exercise").getOrCreate()

# Keep a handle on the session's SparkContext for the cells below.
sc = spark.sparkContext

In [2]:
# Show the SparkContext as the cell output to confirm the session started.
sc

In [3]:
# Legacy SQL entry point wrapping the SparkContext; used below to read the CSV.
sqlContext = SQLContext(sparkContext=sc)

In [4]:
# Load the chart CSV, treating its first line as column names.
# No schema inference is requested, so every column arrives as a string.
df = spark.read.option("header", True).csv("uk100.csv")

In [5]:
# Print the column names and types Spark assigned to the freshly loaded CSV.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- last_week_rank: string (nullable = true)
 |-- hmm: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- label: string (nullable = true)
 |-- peak_rank: string (nullable = true)
 |-- weeks_on_chart: string (nullable = true)
 |-- week_of: string (nullable = true)



In [6]:
# Register the raw DataFrame under the name 'test' so it can be queried with spark.sql.
df.createOrReplaceTempView('test')

In [7]:
# Peek at the raw rows through the SQL view.
# The query asks for 30 rows; show() is called without a row count argument.
result = spark.sql('''SELECT *
            FROM test
            LIMIT 30
            ''')
result.show()

+---+----+--------------+--------+--------------------+--------------------+--------------------+---------+--------------+----------------+
|_c0|rank|last_week_rank|     hmm|               title|              artist|               label|peak_rank|weeks_on_chart|         week_of|
+---+----+--------------+--------+--------------------+--------------------+--------------------+---------+--------------+----------------+
|  0|   1|            11|        |               RIVER|      ELLIE GOULDING|             POLYDOR|        1|             5|December 27 2019|
|  1|   2|             8|        |ALL I WANT FOR CH...|        MARIAH CAREY|            COLUMBIA|        2|            99|December 27 2019|
|  2|   3|             5|        |      LAST CHRISTMAS|                WHAM|                 RCA|        2|            64|December 27 2019|
|  3|   4|            14|        |FAIRYTALE OF NEW ...|POGUES FT KIRSTY ...|         WARNER BROS|        2|            99|December 27 2019|
|  4|   5|          

In [8]:
# Discard the two columns not used by the analysis: '_c0' (looks like a row
# index carried over from the CSV) and 'hmm' (blank in the preview above).
df = df.drop('_c0', 'hmm')

In [9]:
# Lower-case every column so later string comparisons (LIKE / =) are
# case-insensitive with respect to the source data.
# All columns are rewritten in a single select() rather than one withColumn()
# per column: each withColumn() call adds another projection to the plan, and
# the comprehension also avoids leaving a stray loop variable in the notebook
# namespace. Column order and names are preserved.
df = df.select([f.lower(f.col(name)).alias(name) for name in df.columns])

In [10]:
# Convert the numeric chart columns from string to integer.
# 'last_week_rank' is left as a string, and 'week_of' is not cast here
# (a direct DateType cast was tried and abandoned; it is parsed in a later cell).
df = (
    df.withColumn("rank", f.col("rank").cast(IntegerType()))
    .withColumn("peak_rank", f.col("peak_rank").cast(IntegerType()))
    .withColumn("weeks_on_chart", f.col("weeks_on_chart").cast(IntegerType()))
)

In [11]:
def to_date(x):
    """Parse a chart-week string such as ``'December 27 2019'``.

    Args:
        x: Text in ``'%B %d %Y'`` form (full month name, day of month,
            four-digit year), or ``None``.

    Returns:
        The parsed ``datetime`` (time set to midnight), or ``None`` when
        ``x`` is ``None`` or contains only whitespace.

    Raises:
        ValueError: If ``x`` is non-blank but does not match the format.
    """
    # A null or blank cell would otherwise raise inside the UDF and abort the
    # whole Spark job; map it to a null date instead.
    if x is None or not x.strip():
        return None
    return datetime.strptime(x, '%B %d %Y')
# Wrap to_date as a Spark UDF whose declared result type is DateType.
hmm = f.udf(to_date, returnType=DateType())


In [12]:
# Replace the textual 'week_of' column with its parsed date value.
df = df.withColumn("week_of", hmm(f.col("week_of")))

In [13]:
# Confirm the column types after dropping, casting and date parsing.
df.printSchema()

root
 |-- rank: integer (nullable = true)
 |-- last_week_rank: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- label: string (nullable = true)
 |-- peak_rank: integer (nullable = true)
 |-- weeks_on_chart: integer (nullable = true)
 |-- week_of: date (nullable = true)



In [14]:
# Re-register 'test' so that SQL queries see the cleaned DataFrame rather than
# the raw one registered earlier.
df.createOrReplaceTempView('test')

In [15]:
# Total number of chart rows in the cleaned view.
result = spark.sql('''SELECT COUNT(*)
            FROM test
            ''')
result.show()

+--------+
|count(1)|
+--------+
|   26000|
+--------+



In [16]:
# Peek at the cleaned rows (lower-cased text, integer ranks, parsed dates).
result = spark.sql('''SELECT *
            FROM test
            LIMIT 30
            ''')
result.show()

+----+--------------+--------------------+--------------------+--------------------+---------+--------------+----------+
|rank|last_week_rank|               title|              artist|               label|peak_rank|weeks_on_chart|   week_of|
+----+--------------+--------------------+--------------------+--------------------+---------+--------------+----------+
|   1|            11|               river|      ellie goulding|             polydor|        1|             5|2019-12-27|
|   2|             8|all i want for ch...|        mariah carey|            columbia|        2|            99|2019-12-27|
|   3|             5|      last christmas|                wham|                 rca|        2|            64|2019-12-27|
|   4|            14|fairytale of new ...|pogues ft kirsty ...|         warner bros|        2|            99|2019-12-27|
|   5|             2|              own it|stormzy/ed sheera...|      atlantic/merky|        2|             5|2019-12-27|
|   6|            16|merry chris

In [17]:
# Average chart position per title for songs whose title contains a festive
# keyword, best (lowest) average first.
# GROUP BY already yields one row per title, so the redundant DISTINCT is
# dropped; string literals use single quotes, the standard SQL form.
result = spark.sql("""
            SELECT title, ROUND(AVG(rank), 2)
            FROM test
            WHERE title LIKE '%christmas%'
                OR title LIKE '%merry%'
                OR title LIKE '%xmas%'
                OR title LIKE '%santa%'
                OR title LIKE '%rudolph%'
                OR title LIKE '%reindeer%'
            GROUP BY 1
            ORDER BY 2
            """)
# Show up to 50 rows without truncating long titles.
result.show(50, False)

+----------------------------------------+-----------------------------------+
|title                                   |round(avg(CAST(rank AS BIGINT)), 2)|
+----------------------------------------+-----------------------------------+
|last christmas                          |25.3                               |
|all i want for christmas is you         |26.82                              |
|happy christmas (war is over)           |28.0                               |
|merry christmas everyone                |34.46                              |
|do they know it's christmas             |35.7                               |
|i wish it could be christmas everyday   |36.0                               |
|rockin' around the christmas tree       |37.05                              |
|step into christmas                     |38.84                              |
|driving home for christmas              |39.37                              |
|santa tell me                           |41.0      

In [18]:
# How many distinct titles appear at chart position 1.
# NOTE(review): counts by title only, so different songs sharing a title
# would be counted once — confirm this is intended.
result = spark.sql('''
            SELECT COUNT(DISTINCT title)
            FROM test
            WHERE rank == 1''')
result.show()

+---------------------+
|count(DISTINCT title)|
+---------------------+
|                   77|
+---------------------+



In [19]:
# For rows whose peak_rank is 1: the largest weeks_on_chart value and the
# earliest week_of per title, longest-running first.
# GROUP BY already yields one row per title, so the redundant DISTINCT is
# dropped, and the comparison uses the standard '=' operator.
# NOTE(review): grouping on title alone merges different songs that share a
# title (a later cell shows "perfect" listed for two artists) — consider
# grouping by artist as well.
result = spark.sql("""
            SELECT title, MAX(weeks_on_chart) AS max_weeks, MIN(week_of)
            FROM test
            WHERE peak_rank = 1
            GROUP BY 1
            ORDER BY 2 DESC
            """)
result.show()

+--------------------+---------+------------+
|               title|max_weeks|min(week_of)|
+--------------------+---------+------------+
|   thinking out loud|      118|  2015-01-11|
|             perfect|      110|  2017-12-08|
|merry xmas everybody|       98|  2015-12-11|
|        shape of you|       97|  2017-01-13|
|               happy|       92|  2015-01-11|
|           rather be|       90|  2015-01-11|
|             shotgun|       87|  2018-06-29|
|            titanium|       87|  2015-08-14|
|do they know it's...|       80|  2015-12-11|
|    someone like you|       78|  2015-10-30|
|      counting stars|       77|  2015-01-11|
|         uptown funk|       76|  2015-01-11|
|   despacito (remix)|       74|  2017-05-12|
|        stay with me|       74|  2015-01-11|
|          wake me up|       72|  2015-01-25|
|merry christmas e...|       71|  2015-12-04|
|               waves|       70|  2015-01-11|
|               sorry|       68|  2015-11-20|
|        viva la vida|       68|  

In [20]:
# Mean, across titles with a peak_rank of 1, of each title's largest
# weeks_on_chart value.
# The inner query keeps only what the outer AVG needs: the sort, the unused
# MIN(week_of) column and the redundant DISTINCT were removed because none of
# them can affect an average over the grouped rows.
result = spark.sql("""
            SELECT AVG(max_weeks)
            FROM (SELECT title, MAX(weeks_on_chart) AS max_weeks
                    FROM test
                    WHERE peak_rank = 1
                    GROUP BY title) AS per_title
            """)
result.show()

+-----------------+
|   avg(max_weeks)|
+-----------------+
|41.92125984251968|
+-----------------+



In [21]:
# Average chart position and largest weeks_on_chart value for the rows titled
# "shape of you" (titles were lower-cased earlier, hence the lower-case literal).
result = spark.sql('''
            SELECT AVG(rank), MAX(weeks_on_chart)
            FROM test
            WHERE title = "shape of you"
            ''')
result.show()

+-----------------+-------------------+
|        avg(rank)|max(weeks_on_chart)|
+-----------------+-------------------+
|42.51546391752577|                 97|
+-----------------+-------------------+



In [22]:
# Number of chart rows per (artist, title) pair, most frequent first.
result = spark.sql('''
            SELECT artist, title, COUNT(*)
            FROM test
            GROUP BY artist, title
            ORDER BY 3 DESC
            ''')
result.show()

+--------------------+--------------------+--------+
|              artist|               title|count(1)|
+--------------------+--------------------+--------+
|          ed sheeran|             perfect|     110|
|             killers|       mr brightside|     105|
|          ed sheeran|        shape of you|      97|
|          ed sheeran|   thinking out loud|      90|
|         george ezra|             shotgun|      87|
|                 sia|          chandelier|      87|
|major lazer ft mo...|             lean on|      79|
|           james bay|           let it go|      77|
|settle/greatest s...|          this is me|      75|
|          ed sheeran|          photograph|      73|
|       walk the moon|     shut up & dance|      72|
|mark ronson ft br...|         uptown funk|      72|
|   justin timberlake|can't stop the fe...|      71|
|luis fonsi/daddy ...|   despacito (remix)|      71|
|           james bay| hold back the river|      68|
|       justin bieber|               sorry|   

In [23]:
# Break the title "perfect" down by artist to check whether one title covers
# more than one song.
result = spark.sql('''
            SELECT artist, title, count(*)
            FROM test
            WHERE title = "perfect"
            GROUP BY artist, title
            ''')
result.show()

+-------------+-------+--------+
|       artist|  title|count(1)|
+-------------+-------+--------+
|one direction|perfect|      21|
|   ed sheeran|perfect|     110|
+-------------+-------+--------+



In [29]:
# Number of chart rows per (artist, title, label) combination, most frequent first.
result = spark.sql('''
            SELECT artist, title, label, count(*) as count
            FROM test
            GROUP BY artist, title, label
            ORDER BY count DESC
            ''')
result.show()

+--------------------+--------------------+--------------------+-----+
|              artist|               title|               label|count|
+--------------------+--------------------+--------------------+-----+
|          ed sheeran|             perfect|              asylum|  110|
|             killers|       mr brightside|             mercury|  105|
|          ed sheeran|        shape of you|              asylum|   97|
|          ed sheeran|   thinking out loud|              asylum|   90|
|                 sia|          chandelier|   monkey puzzle/rca|   87|
|         george ezra|             shotgun|            columbia|   87|
|major lazer ft mo...|             lean on|       because music|   79|
|           james bay|           let it go|              virgin|   77|
|settle/greatest s...|          this is me|            atlantic|   75|
|          ed sheeran|          photograph|              asylum|   73|
|mark ronson ft br...|         uptown funk|            columbia|   72|
|     

In [31]:
# Number of chart rows per record label, most frequent first.
result = spark.sql('''
            SELECT label, COUNT(*)
            FROM test
            GROUP BY 1
            ORDER BY 2 DESC
            ''')
result.show()

+--------------------+--------+
|               label|count(1)|
+--------------------+--------+
|            atlantic|    2055|
|            columbia|    1613|
|          interscope|    1493|
|             polydor|    1220|
|                 rca|    1099|
|              virgin|     924|
|              asylum|     911|
|                 emi|     830|
|          syco music|     821|
|    republic records|     817|
|          parlophone|     791|
|         warner bros|     653|
|              island|     650|
|cash money/republ...|     650|
|             capitol|     612|
|             def jam|     519|
|                epic|     471|
|   ministry of sound|     466|
|            positiva|     413|
|          relentless|     360|
+--------------------+--------+
only showing top 20 rows



In [32]:
# How many different record labels appear in the data.
result = spark.sql('''
            SELECT COUNT(DISTINCT label)
            FROM test
            ''')
result.show()

+---------------------+
|count(DISTINCT label)|
+---------------------+
|                  408|
+---------------------+



In [33]:
# Number of chart rows per record label, most frequent first.
# NOTE(review): this query is identical to the label-count query run a few
# cells earlier; consider removing one of the two.
result = spark.sql('''
            SELECT label, COUNT(*)
            FROM test
            GROUP BY 1
            ORDER BY 2 DESC
            ''')
result.show()

+--------------------+--------+
|               label|count(1)|
+--------------------+--------+
|            atlantic|    2055|
|            columbia|    1613|
|          interscope|    1493|
|             polydor|    1220|
|                 rca|    1099|
|              virgin|     924|
|              asylum|     911|
|                 emi|     830|
|          syco music|     821|
|    republic records|     817|
|          parlophone|     791|
|         warner bros|     653|
|              island|     650|
|cash money/republ...|     650|
|             capitol|     612|
|             def jam|     519|
|                epic|     471|
|   ministry of sound|     466|
|            positiva|     413|
|          relentless|     360|
+--------------------+--------+
only showing top 20 rows

