In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession;

# Build (or reuse) a local SparkSession that runs with 4 worker threads and has
# Hive support enabled.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ISM6562 Spark App01")
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the lower-level Spark API. It is created
# together with the SparkSession.
sc = spark.sparkContext

# Note: if other Spark sessions are still running (for example from a previously run
# notebook), this session's web UI is served on a different port than the default (4040).
# The line below extracts the port from the UI URL: 4040 means this is the only running
# session; a higher number means other Spark sessions are still running.
# uiWebUrl is None when the web UI is disabled, so guard before splitting.
spark_session_port = sc.uiWebUrl.split(":")[-1] if sc.uiWebUrl else "unavailable"
print("Spark Session WebUI Port: " + spark_session_port)

# If the port displayed is not 4040, it is best to shut down all other Spark sessions
# and run this code again. Otherwise you may have trouble accessing the data in the
# spark-warehouse directory.

23/04/10 07:34:17 WARN Utils: Your hostname, MBP-SMITH515.local resolves to a loopback address: 127.0.0.1; using 192.168.4.81 instead (on interface en0)
23/04/10 07:34:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/10 07:34:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark Session WebUI Port: 4040


In [2]:
# Raise the log threshold to ERROR so that the INFO and WARN messages Spark prints by
# default are hidden. Set this to "INFO" or "WARN" if you want to see them again.
sc.setLogLevel("ERROR")

In [3]:
# Bare expression: the notebook displays the SparkSession object as this cell's output.
spark

In [4]:
# List the tables visible to this Spark session.
spark.sql("show tables").show()

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|  default|      boston|      false|
|  default|fake_friends|      false|
|  default|movieratings|      false|
|  default|      movies|      false|
|  default|      u_item|      false|
+---------+------------+-----------+



In [5]:
# Count how many distinct movie ids appear in the movieratings table.
df = spark.sql("select count(DISTINCT movieid) from movieratings")
df.show()

+-----------------------+
|count(DISTINCT movieid)|
+-----------------------+
|                   1682|
+-----------------------+



In [6]:
# Count the total number of rows in the movieratings table.
# Note: this rebinds `df`, replacing the result of the previous query.
df = spark.sql("select count(*) from movieratings")
df.show()

+--------+
|count(1)|
+--------+
|  100000|
+--------+



In [7]:
# One row per movie: how many ratings it received (`count`) and its average rating
# rounded to 3 decimals (`rating`), most-rated movies first.
# A triple-quoted string is used instead of a backslash continuation inside the
# literal, which silently breaks if any whitespace follows the backslash.
dfRatingCount = spark.sql(
    """
    select count(movieid) as count,
           round(avg(rating), 3) as rating
    from movieratings
    group by movieid
    order by count desc, rating desc
    """
)
dfRatingCount.show()

+-----+------+
|count|rating|
+-----+------+
|  583| 4.358|
|  509| 3.804|
|  508| 4.156|
|  507| 4.008|
|  485| 3.157|
|  481| 3.657|
|  478| 3.441|
|  452| 3.878|
|  431| 3.631|
|  429| 3.438|
|  420| 4.252|
|  413| 4.283|
|  394| 4.061|
|  392| 3.798|
|  390|  4.29|
|  384| 3.711|
|  378| 3.693|
|  367| 4.204|
|  365|  3.66|
|  350| 4.246|
+-----+------+
only showing top 20 rows



In [8]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Pack the single predictor column ("count") into the vector column ("features")
# that is later passed to LinearRegression as featuresCol.
# For more on VectorAssembler, see https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
dfAssemblerFeature = VectorAssembler(
    inputCols=["count"],
    outputCol="features"
)

# VectorAssembler raises if its output column already exists, so drop any "features"
# column left over from a previous run of this cell. drop() is a no-op when the
# column is absent, which keeps the first run unchanged.
dfRatingCount = dfAssemblerFeature.transform(dfRatingCount.drop("features"))
dfRatingCount.show()

+-----+------+--------+
|count|rating|features|
+-----+------+--------+
|  583| 4.358| [583.0]|
|  509| 3.804| [509.0]|
|  508| 4.156| [508.0]|
|  507| 4.008| [507.0]|
|  485| 3.157| [485.0]|
|  481| 3.657| [481.0]|
|  478| 3.441| [478.0]|
|  452| 3.878| [452.0]|
|  431| 3.631| [431.0]|
|  429| 3.438| [429.0]|
|  420| 4.252| [420.0]|
|  413| 4.283| [413.0]|
|  394| 4.061| [394.0]|
|  392| 3.798| [392.0]|
|  390|  4.29| [390.0]|
|  384| 3.711| [384.0]|
|  378| 3.693| [378.0]|
|  367| 4.204| [367.0]|
|  365|  3.66| [365.0]|
|  350| 4.246| [350.0]|
+-----+------+--------+
only showing top 20 rows



In [9]:
# Keep only the assembled feature vector and the label column; the raw "count"
# column is dropped from dfRatingCount from this point on.
dfRatingCount = dfRatingCount.select("features", "rating")
dfRatingCount.show()

+--------+------+
|features|rating|
+--------+------+
| [583.0]| 4.358|
| [509.0]| 3.804|
| [508.0]| 4.156|
| [507.0]| 4.008|
| [485.0]| 3.157|
| [481.0]| 3.657|
| [478.0]| 3.441|
| [452.0]| 3.878|
| [431.0]| 3.631|
| [429.0]| 3.438|
| [420.0]| 4.252|
| [413.0]| 4.283|
| [394.0]| 4.061|
| [392.0]| 3.798|
| [390.0]|  4.29|
| [384.0]| 3.711|
| [378.0]| 3.693|
| [367.0]| 4.204|
| [365.0]|  3.66|
| [350.0]| 4.246|
+--------+------+
only showing top 20 rows



In [10]:
from pyspark.ml.stat import ChiSquareTest

# Run Pearson's chi-squared independence test of the "features" vector against the
# "rating" label and take the single result row.
# NOTE(review): ChiSquareTest treats every distinct feature/label value as a category.
# Both columns here are numeric (rating count, rounded average rating), so the result
# should be interpreted with care - confirm this is the intended test.
r = ChiSquareTest.test(dfRatingCount, "features", "rating").head()

print(f"pValues: {r.pValues}")
print(f"degreesOfFreedom: {r.degreesOfFreedom}")
print(f"statistics: {r.statistics}")

pValues: [0.0]
degreesOfFreedom: [213248]
statistics: [339830.8603909695]


In [11]:
# Configure a linear regression that predicts "rating" from the "features" vector.
# For more information on LinearRegression, see https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
lr = LinearRegression(
    featuresCol="features",
    labelCol="rating",
    predictionCol="prediction",
    maxIter=10,
)

# Train on the full dfRatingCount frame (no train/test split is made here).
lrModel = lr.fit(dfRatingCount)

# Report the fitted line. Only the first coefficient is printed because the
# feature vector was assembled from a single input column.
print(f"Coefficients: {lrModel.coefficients[0]:.5f}")
print(f"Intercept: {lrModel.intercept:.5f}")

Coefficients: 0.00418
Intercept: 2.82766


In [12]:
# Append the model's "prediction" column next to the actual rating.
# The model's transform fails if the prediction column already exists, so drop any
# "prediction" column left over from a previous run of this cell. drop() is a no-op
# when the column is absent, which keeps the first run unchanged.
dfRatingCount = lrModel.transform(dfRatingCount.drop("prediction"))
dfRatingCount.show(100)

+--------+------+------------------+
|features|rating|        prediction|
+--------+------+------------------+
| [583.0]| 4.358| 5.263360163655976|
| [509.0]| 3.804| 4.954197665501374|
| [508.0]| 4.156| 4.950019793904691|
| [507.0]| 4.008| 4.945841922308006|
| [485.0]| 3.157| 4.853928747180962|
| [481.0]| 3.657| 4.837217260794227|
| [478.0]| 3.441| 4.824683646004176|
| [452.0]| 3.878| 4.716058984490396|
| [431.0]| 3.631| 4.628323680960037|
| [429.0]| 3.438| 4.619967937766669|
| [420.0]| 4.252| 4.582367093396514|
| [413.0]| 4.283| 4.553121992219728|
| [394.0]| 4.061|4.4737424318827355|
| [392.0]| 3.798| 4.465386688689367|
| [390.0]|  4.29|    4.457030945496|
| [384.0]| 3.711| 4.431963715915897|
| [378.0]| 3.693|4.4068964863357944|
| [367.0]| 4.204| 4.360939898772272|
| [365.0]|  3.66| 4.352584155578905|
| [350.0]| 4.246| 4.289916081628647|
| [350.0]| 3.834| 4.289916081628647|
| [344.0]| 3.314| 4.264848852048544|
| [336.0]| 4.045| 4.231425879275074|
| [331.0]| 3.931| 4.210536521291655|
|

In [13]:
# Pull the training-set summary from the fitted model and report its error metrics:
# root mean squared error and the coefficient of determination (r2).
trainingSummary = lrModel.summary
print(
    f"RMSE: {trainingSummary.rootMeanSquaredError:.4f}",
    f"r2: {trainingSummary.r2:.4f}",
    sep="\n",
)

RMSE: 0.7056
r2: 0.1846
