In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession;

# Create (or reuse) a Spark session that runs locally on 4 threads with Hive support enabled.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ISM6562 Spark App01")
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the Spark API; it is created together with the session.
sc = spark.sparkContext

# Note: if other Spark sessions are still running (for example from a notebook you ran earlier),
# this session's web UI will be on a different port than the default (4040). The port is the
# last ":"-separated piece of the UI URL: 4040 means this is the only running session, a higher
# number means other Spark sessions are still alive.
spark_session_port = sc.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

23/04/10 07:02:42 WARN Utils: Your hostname, MBP-SMITH515.local resolves to a loopback address: 127.0.0.1; using 192.168.4.81 instead (on interface en0)
23/04/10 07:02:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/10 07:02:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark Session WebUI Port: 4040


In [2]:
# Set the log level to ERROR so the INFO and WARN messages that are printed by default are hidden.
# To see them again, set this to "INFO" or "WARN".
sc.setLogLevel("ERROR")

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# see here for more info on the schema: https://spark.apache.org/docs/latest/sql-programming-guide.html#inferring-the-schema-using-reflection
# and here https://sparkbyexamples.com/pyspark/pyspark-sql-types-datatype-with-examples/

# Explicit schema for the pipe-delimited movie file, so Spark does not have to infer the types.
# Every column is declared nullable.
# NOTE(review): the column name "unkown" (sic) is kept unchanged so anything that refers to it by
# name keeps working. data/u.item looks like the MovieLens 100K item file, where the fourth field
# is the video release date -- confirm before renaming.
schema = StructType([
    StructField("movieid", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("date", StringType(), True),   # kept as text, e.g. "01-Jan-1995"
    StructField("unkown", StringType(), True),
    StructField("url", StringType(), True),
])

# NOTE(review): the MovieLens 100K files are distributed as ISO-8859-1 (Latin-1), not UTF-8 --
# confirm for this copy of the data. Reading with Spark's default UTF-8 decoding would turn
# accented characters in movie titles into replacement characters, so the encoding is set
# explicitly. Plain-ASCII rows are unaffected.
movies = spark.read.csv(
    'data/u.item',
    header=False,
    schema=schema,
    sep='|',
    encoding='ISO-8859-1',
)

# display the first 5 rows of the dataframe
movies.show(5)

+-------+-----------------+-----------+------+--------------------+
|movieid|            title|       date|unkown|                 url|
+-------+-----------------+-----------+------+--------------------+
|      1| Toy Story (1995)|01-Jan-1995|  null|http://us.imdb.co...|
|      2| GoldenEye (1995)|01-Jan-1995|  null|http://us.imdb.co...|
|      3|Four Rooms (1995)|01-Jan-1995|  null|http://us.imdb.co...|
|      4|Get Shorty (1995)|01-Jan-1995|  null|http://us.imdb.co...|
|      5|   Copycat (1995)|01-Jan-1995|  null|http://us.imdb.co...|
+-------+-----------------+-----------+------+--------------------+
only showing top 5 rows



In [5]:
# Register the movies DataFrame as the temporary view "movies_tmp" so it can be queried with SQL.
movies.createOrReplaceTempView("movies_tmp")

In [6]:
# Load the sparksql_magic extension, which provides the %%sparksql cell magic used in this notebook.
%load_ext sparksql_magic

In [7]:
# Demonstration: with the sparksql magic, the query result can be saved as a temporary view
# (here `tempdf`, via the --view switch). This switch, like the other sparksql switches, does
# not work in VSCode; it does work in Jupyter Notebook.
# %%sparksql --view tempdf
# select * from movies_tmp limit 10

In [8]:
# We can use sparksql to show current tables, but this will only work in Jupyter Notebook. It will 
# not work in VSCode.
#%%sparksql 
#SHOW TABLES

In [9]:
# Project only the id and title columns out of the "movies_tmp" temporary view using Spark SQL.
spark_df = spark.sql(
    """SELECT
  movieid,
  title
FROM movies_tmp"""
)

# Print the first 20 rows (the default row count for show()).
spark_df.show(n=20)

+-------+--------------------+
|movieid|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|    GoldenEye (1995)|
|      3|   Four Rooms (1995)|
|      4|   Get Shorty (1995)|
|      5|      Copycat (1995)|
|      6|Shanghai Triad (Y...|
|      7|Twelve Monkeys (1...|
|      8|         Babe (1995)|
|      9|Dead Man Walking ...|
|     10|  Richard III (1995)|
|     11|Seven (Se7en) (1995)|
|     12|Usual Suspects, T...|
|     13|Mighty Aphrodite ...|
|     14|  Postino, Il (1994)|
|     15|Mr. Holland's Opu...|
|     16|French Twist (Gaz...|
|     17|From Dusk Till Da...|
|     18|White Balloon, Th...|
|     19|Antonia's Line (1...|
|     20|Angels and Insect...|
+-------+--------------------+
only showing top 20 rows



In [10]:
# Save the (movieid, title) projection as the table "movies", overwriting any existing table of that name.
spark_df.write.mode("overwrite").saveAsTable("movies")

In [11]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# see here for more info on the schema: https://spark.apache.org/docs/latest/sql-programming-guide.html#inferring-the-schema-using-reflection
# and here https://sparkbyexamples.com/pyspark/pyspark-sql-types-datatype-with-examples/

# Schema for the tab-separated ratings file; every column is declared nullable.
# The timestamp is read as a plain string (values such as 881250949 look like Unix epoch
# seconds -- TODO confirm before converting).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("userid", IntegerType()),
        ("movieid", IntegerType()),
        ("rating", IntegerType()),
        ("timestamp", StringType()),
    )
])

# Read the headerless, tab-delimited ratings file using the schema above.
movierating = (
    spark.read
    .schema(schema)
    .option("header", False)
    .option("sep", '\t')
    .csv('data/u.data')
)

# Preview the first 5 rows of the dataframe.
movierating.show(n=5)

+------+-------+------+---------+
|userid|movieid|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows



In [12]:
# Save the ratings as the table "movieratings", overwriting any existing table of that name.
movierating.write.mode("overwrite").saveAsTable("movieratings")

In [13]:
%%sparksql
-- Preview up to ten rows of the saved movieratings table.
SELECT *
FROM movieratings
LIMIT 10

0,1,2,3
userid,movieid,rating,timestamp
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596
298,474,4,884182806
115,265,2,881171488
253,465,5,891628467
305,451,3,886324817


In [14]:
# Load the two saved tables back as DataFrames for the modelling steps.
dfMovies = spark.read.table('movies')
dfRating = spark.read.table('movieratings')

In [15]:
# For more on collaborative filtering with ALS, see
# https://spark.apache.org/docs/latest/ml-collaborative-filtering.html
from pyspark.ml.recommendation import ALS

# One fixed seed shared by the split and the model, so re-running this cell no longer draws a
# fresh random train/test partition and a fresh ALS initialisation every time.
SEED = 42

# Split the ratings 80/20 into training and test sets.
(dftraining, dftest) = dfRating.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards rows whose prediction would be NaN, which happens for users
# or movies that appear in the test split but not in the training split.
als = ALS(maxIter=5, regParam=0.01, userCol="userid",
    itemCol="movieid", ratingCol="rating",
    coldStartStrategy="drop", seed=SEED)
model = als.fit(dftraining)

# Display the predicted rating next to the actual rating for the held-out test rows.
predictions = model.transform(dftest)
predictions.show()
 

+------+-------+------+---------+----------+
|userid|movieid|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|   580|    471|     3|884125018| 3.0697474|
|   588|    471|     5|890024289| 3.9763558|
|   804|    496|     5|879441973|  4.184652|
|   472|    496|     4|875980823| 3.8299975|
|   633|    148|     1|875326138|  3.783264|
|   300|    833|     4|875650329| 5.5967994|
|   406|    496|     4|879445378| 3.6402574|
|    26|    148|     3|891377540| 2.4295914|
|    27|    148|     3|891543129|   2.75539|
|   577|    496|     5|880474455|  4.420952|
|    44|    148|     4|878346946| 3.8073215|
|   159|    471|     4|880485861|  3.867787|
|   806|    496|     5|882387798|  3.966246|
|   103|    471|     4|880416921| 3.0895875|
|   236|    148|     4|890117028|  2.495822|
|   409|    496|     5|881107817| 3.7729216|
|   601|    496|     4|876349302| 3.4476748|
|   128|    471|     4|879967804| 3.3168216|
|   330|    148|     4|876544781| 4.1150646|
|   727|  