In [1]:
from os.path import abspath

from pyspark.sql import SparkSession

# warehouse_location is the default directory for managed databases and tables.
warehouse_location = abspath('spark-warehouse')

# Build (or reuse) a local, Hive-enabled session that runs on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ISM6562 PySpark Tutorials")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the Spark API; it is created together
# with the SparkSession, so we simply take a handle to it here.
sc = spark.sparkContext

# Note: if several Spark sessions are running (for example from a notebook you
# ran earlier), this session's web UI will not be on the default port (4040).
# The line below extracts the port from the UI URL so you can identify it.
# 4040 means this is the only running session; a higher number means other
# Spark sessions are still running.
spark_session_port = sc.uiWebUrl.rsplit(":", 1)[-1]
print(f"Spark Session WebUI Port: {spark_session_port}")

23/10/26 15:16:03 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:16:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:16:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [2]:
# Display the SparkSession object as this cell's output.
spark

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# see here for more info on the schema: https://spark.apache.org/docs/latest/sql-programming-guide.html#inferring-the-schema-using-reflection
# and here https://sparkbyexamples.com/pyspark/pyspark-sql-types-datatype-with-examples/

# Explicit schema for the leading five pipe-delimited fields of u.item.
# All columns are nullable; the date is kept as a raw string (e.g. 01-Jan-1995).
# NOTE(review): the column name "unkown" is kept unchanged so the DataFrame
# schema stays the same; in the MovieLens 100K layout this 4th field appears to
# be the video release date -- confirm against the dataset README.
schema = StructType([
    StructField("movieid", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("date", StringType(), True),
    StructField("unkown", StringType(), True),
    StructField("url", StringType(), True),
    ])

# The MovieLens 100K u.item file is ISO-8859-1 (Latin-1) encoded. Spark's CSV
# reader assumes UTF-8 by default, which silently replaces accented characters
# in movie titles with U+FFFD, so the encoding is stated explicitly.
movies = spark.read.csv(
    'data/u.item',
    header=False,
    schema=schema,
    sep='|',
    encoding='ISO-8859-1',
)

# display the first 5 rows of the dataframe
movies.show(5)

+-------+-----------------+-----------+------+--------------------+
|movieid|            title|       date|unkown|                 url|
+-------+-----------------+-----------+------+--------------------+
|      1| Toy Story (1995)|01-Jan-1995|  NULL|http://us.imdb.co...|
|      2| GoldenEye (1995)|01-Jan-1995|  NULL|http://us.imdb.co...|
|      3|Four Rooms (1995)|01-Jan-1995|  NULL|http://us.imdb.co...|
|      4|Get Shorty (1995)|01-Jan-1995|  NULL|http://us.imdb.co...|
|      5|   Copycat (1995)|01-Jan-1995|  NULL|http://us.imdb.co...|
+-------+-----------------+-----------+------+--------------------+
only showing top 5 rows



In [4]:
# Register the movies DataFrame as the temporary view "movies_tmp" so it can be
# queried by name with Spark SQL (replaces any existing view of that name).
movies.createOrReplaceTempView("movies_tmp")

In [5]:
# Load the sparksql_magic IPython extension, which provides the %%sparksql cell magic.
%load_ext sparksql_magic

In [6]:
# demonstration, note that when using sparksql, we can save the results in a temporary view
# but this (and other sparksql switches) will not work with VSCode. It will work in Jupyter Notebook.
# %%sparksql --view tempdf
# select * from movies_tmp limit 10

In [7]:
# We can use sparksql to show current tables, but this will only work in Jupyter Notebook. It will 
# not work in VSCode.
#%%sparksql 
#SHOW TABLES

In [8]:
# Project just the id and title columns out of the movies_tmp temporary view.
movie_titles_query = """SELECT
  movieid,
  title
FROM movies_tmp"""

spark_df = spark.sql(movie_titles_query)

# Preview the result (show() prints the first 20 rows by default).
spark_df.show()

+-------+--------------------+
|movieid|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|    GoldenEye (1995)|
|      3|   Four Rooms (1995)|
|      4|   Get Shorty (1995)|
|      5|      Copycat (1995)|
|      6|Shanghai Triad (Y...|
|      7|Twelve Monkeys (1...|
|      8|         Babe (1995)|
|      9|Dead Man Walking ...|
|     10|  Richard III (1995)|
|     11|Seven (Se7en) (1995)|
|     12|Usual Suspects, T...|
|     13|Mighty Aphrodite ...|
|     14|  Postino, Il (1994)|
|     15|Mr. Holland's Opu...|
|     16|French Twist (Gaz...|
|     17|From Dusk Till Da...|
|     18|White Balloon, Th...|
|     19|Antonia's Line (1...|
|     20|Angels and Insect...|
+-------+--------------------+
only showing top 20 rows



In [9]:
# Persist the id/title projection as the table "movies", replacing any existing copy.
spark_df.write.mode('overwrite').saveAsTable("movies")

23/10/26 15:16:11 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/10/26 15:16:11 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/10/26 15:16:13 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/10/26 15:16:13 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore student@127.0.0.1
23/10/26 15:16:14 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/10/26 15:16:15 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/10/26 15:16:16 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/10/26 15:16:16 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/10/26 15:16:16 WA

In [10]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DateType,
)
# see here for more info on the schema: https://spark.apache.org/docs/latest/sql-programming-guide.html#inferring-the-schema-using-reflection
# and here https://sparkbyexamples.com/pyspark/pyspark-sql-types-datatype-with-examples/

# One nullable field per tab-separated column of u.data; the timestamp is read
# as a plain string rather than being parsed.
rating_fields = [
    StructField("userid", IntegerType(), True),
    StructField("movieid", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", StringType(), True),
]
schema = StructType(rating_fields)

# The file has no header row, so column names come from the schema above.
movierating = spark.read.csv('data/u.data', schema=schema, sep='\t', header=False)

# display the first 5 rows of the dataframe
movierating.show(5)

+------+-------+------+---------+
|userid|movieid|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows



In [11]:
# Persist the ratings as the table "movieratings", replacing any existing copy.
movierating.write.mode('overwrite').saveAsTable("movieratings")

In [12]:
%%sparksql
-- Preview up to 10 rows of the saved movieratings table.
select * from movieratings limit 10

0,1,2,3
userid,movieid,rating,timestamp
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596
298,474,4,884182806
115,265,2,881171488
253,465,5,891628467
305,451,3,886324817


In [13]:
# Read both saved tables back as DataFrames, ratings first and then movies.
dfRating, dfMovies = (
    spark.table(table_name) for table_name in ('movieratings', 'movies')
)

In [14]:
# For more on collaborative filtering, see https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html
from pyspark.ml.recommendation import ALS

# Fixed seed so that the train/test split and the ALS factor initialisation are
# repeatable when the notebook is re-run from a fresh kernel.
SEED = 42

# Split the ratings 80/20 into training and test sets.
(dftraining, dftest) = dfRating.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" removes rows whose prediction would be NaN, i.e.
# users or movies in the test set that were never seen during training.
als = ALS(maxIter=5, regParam=0.01, userCol="userid",
    itemCol="movieid", ratingCol="rating",
    coldStartStrategy="drop", seed=SEED)
model = als.fit(dftraining)

# Score the held-out ratings; transform() appends a "prediction" column.
predictions = model.transform(dftest)
predictions.show()
 

                                                                                

+------+-------+------+---------+-------------+
|userid|movieid|rating|timestamp|   prediction|
+------+-------+------+---------+-------------+
|   148|      1|     4|877019411|     5.108429|
|   148|      8|     4|877020297|     5.034416|
|   148|     50|     5|877016805|    4.3456655|
|   148|     69|     5|877019101|     2.838674|
|   148|     89|     5|877398587|    5.0523868|
|   148|    151|     4|877400124|     4.194229|
|   148|    181|     5|877399135|     4.172007|
|   148|    191|     1|877020715|     3.891063|
|   148|    194|     5|877015066|    4.3056793|
|   148|    222|     4|877398901|    3.3137639|
|   148|    228|     4|877016514|     6.162162|
|   148|    432|     5|877019698|    3.6224227|
|   148|    495|     4|877016735|    1.8733618|
|   148|    549|     3|877398385|     4.830123|
|   148|    596|     5|877020297|     5.160871|
|   148|    663|     5|877399018|    4.3450947|
|   148|   1012|     4|877400154|-0.0029671788|
|   463|      1|     1|890453075|    2.9

In [15]:
# Stop the SparkSession (and its underlying SparkContext) now that the notebook is done.
spark.stop()