# CSE514 - Project
## Data Night at the MET



### Setup

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=e8b9681cbedbc8c1cbd1dde7ad19f29886e3d48630fcab6ed29144b4d61a7f63
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fon

Now we authenticate a Google Drive client to download the file we will be processing in our Spark job.

**Make sure to follow the interactive instructions.**

In [3]:
import data_cleaning
from importlib import reload

reload(data_cleaning)

<module 'data_cleaning' from '/content/data_cleaning.py'>

In [4]:
artwork, artist, medium = data_cleaning.load_data()

  0%|          | 0/477804 [00:00<?, ?it/s]

  0%|          | 0/477804 [00:00<?, ?it/s]

In [5]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, ArrayType

from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Read Online CSV") \
    .getOrCreate()

In [6]:
artwork_schema = StructType([
    StructField("Object ID", IntegerType(), nullable=False),
    StructField("Object Name", StringType(), nullable=True),
    StructField("Is Highlight", BooleanType(), nullable=True),
    StructField("Country", StringType(), nullable=True),
    StructField("Period", StringType(), nullable=True),
    StructField("Culture", StringType(), nullable=True),
    StructField("Gallery Number", IntegerType(), nullable=True),
    StructField("Department", StringType(), nullable=True),
    StructField("Medium IDs", ArrayType(IntegerType()), nullable=True),
    StructField("Artist IDs", ArrayType(IntegerType()), nullable=True)
])

artist_schema = StructType([
    StructField("ID", IntegerType(), nullable=False),
    StructField("Name", StringType(), nullable=True),
    StructField("Gender", StringType(), nullable=True)
])
medium_schema = StructType([
    StructField("ID", IntegerType(), nullable=False),
    StructField("Material", StringType(), nullable=True)
])

In [7]:
artwork = artwork[['Object ID', 'Object Name', 'Is Highlight', 'Country', 'Period', 'Culture', 'Gallery Number', 'Department', 'Medium IDs', 'Artist IDs']]
medium = medium[['ID', 'Material']]
artist = artist[['ID', 'Name', 'Gender']]

# deal with missing values so pyspark can read them
n_artwork = artwork.copy()
n_artwork['Gallery Number'] = artwork['Gallery Number'].fillna(np.nan).replace([np.nan], [None])
n_artwork['Artist IDs'] = artwork['Artist IDs'].fillna(np.nan).replace([np.nan],[None])
n_artwork['Medium IDs'] = artwork['Medium IDs'].fillna(np.nan).replace([np.nan],[None])


In [8]:
# Create PySpark Tables
df_artwork = spark.createDataFrame(data=n_artwork, schema=artwork_schema)
df_artist = spark.createDataFrame(data=artist, schema=artist_schema)
df_medium = spark.createDataFrame(data=medium, schema=medium_schema)

df_artwork.createOrReplaceTempView("artwork")
df_medium.createOrReplaceTempView("medium")
df_artist.createOrReplaceTempView('artist')

# Query 6
Of the ten countries from which art is most frequently acquired, which artists are most common?


In [None]:
# Query 6 needs Query 1 to run so putting it here 

query1 = """
SELECT Country, COUNT(*) AS ArtworkCount
FROM artwork
WHERE Country IS NOT NULL
GROUP BY Country
ORDER BY ArtworkCount DESC
"""

result1 = spark.sql(query1)
result1.show()

In [108]:
from pyspark.sql.functions import explode
df_artwork_exploded = df_artwork.withColumn("Artist IDs", explode(df_artwork['Artist IDs']))

result1.createOrReplaceTempView("result1")
df_artwork_exploded.createOrReplaceTempView("artwork_exploded")

query6 = """
SELECT artist.Name as ArtistName, TopArtists.ArtworkCount FROM 
(SELECT `Artist IDs` as `Artist ID`, COUNT(*) as ArtworkCount FROM artwork_exploded
WHERE Country IN (SELECT Country FROM result1 ORDER BY ArtworkCount DESC LIMIT 5)
GROUP BY `Artist IDs`
ORDER BY ArtworkCount DESC
LIMIT 10) as TopArtists, artist
WHERE TopArtists.`Artist ID` = artist.ID
ORDER BY TopArtists.ArtworkCount DESC
"""

result6 = spark.sql(query6)
result6.show()

+--------------------+------------+
|          ArtistName|ArtworkCount|
+--------------------+------------+
|Louis Comfort Tif...|         621|
|     Tiffany Studios|         537|
|Tiffany Glass and...|         364|
|Tiffany Glass Com...|         199|
|Union Porcelain W...|         184|
|Abu'l Qasim Firdausi|         162|
|Nina de Garis Davies|         155|
|Boston & Sandwich...|         147|
|      Walter Tyndale|         134|
|       Tiffany & Co.|         131|
+--------------------+------------+



# Query 7
Are artworks from specific time periods more likely to be highlighted (displayed) than others?

In [137]:
query7 = """
SELECT Period, 
  (SUM(CASE WHEN `Is Highlight` = TRUE THEN 1 ELSE 0 END) / COUNT(*)) * 100  AS PercentHighlighted
FROM artwork
WHERE Period IS NOT NULL
GROUP BY Period
ORDER BY PercentHighlighted DESC
"""

result7 = spark.sql(query7)
result7.show()

+--------------------+------------------+
|              Period|PercentHighlighted|
+--------------------+------------------+
|      Solanki period|             100.0|
|Early Tokugawa pe...|             100.0|
|      Timurid period|             100.0|
|Late Early Cyclad...|             100.0|
|  Parthian or Kushan|             100.0|
|     Final Neolithic|             100.0|
|      Pandyan period|             100.0|
|Shang dynasty–Wes...|             100.0|
|            Tokugawa|             100.0|
|Third Intermediat...|             100.0|
|Late Period or Ea...|             100.0|
|early Ptolemaic P...|             100.0|
|late Anuradhapura...|              50.0|
|  Hellenistic period|              50.0|
|early Eastern Jav...|              50.0|
|  Late Helladic IIIC|              50.0|
|        Ming Dynasty|              40.0|
|Early Cycladic II...| 33.33333333333333|
|             Severan| 28.57142857142857|
|            Augustan|              25.0|
+--------------------+------------