In [1]:
from os.path import abspath

from pyspark.sql import SparkSession

# Directory Spark uses as the default location for managed databases and tables.
warehouse_location = abspath('spark-warehouse')

# Build (or reuse, via getOrCreate) a local session that uses every available core
# and has Hive support enabled so tables registered in the metastore can be queried.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ISM6562 PySpark Tutorials")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the core Spark API; it is created
# together with the SparkSession, so we only need to grab a reference to it.
sc = spark.sparkContext

# Note: if other Spark sessions are still running (for example from a notebook you
# ran earlier), this session's web UI is served on a different port than the
# default (4040). The port is the last ":"-separated piece of the UI URL: 4040 means
# this is the only session, anything higher means other sessions are still alive.
spark_session_port = sc.uiWebUrl.rsplit(":", 1)[-1]
print(f"Spark Session WebUI Port: {spark_session_port}")

23/10/26 15:16:47 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:16:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:16:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [2]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [3]:
# List the tables visible to this session and print the listing.
tables_df = spark.sql("show tables")
tables_df.show()

23/10/26 15:16:52 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/10/26 15:16:52 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/10/26 15:16:54 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/10/26 15:16:54 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore student@127.0.0.1
23/10/26 15:16:54 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|  default|fake_friends|      false|
|  default|   incidents|      false|
|  default|movieratings|      false|
|  default|      movies|      false|
+---------+------------+-----------+



In [4]:
# How many different movies appear in the ratings table?
distinct_movies_sql = "select count(DISTINCT movieid) from movieratings"
df = spark.sql(distinct_movies_sql)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----------------------+
|count(DISTINCT movieid)|
+-----------------------+
|                   1682|
+-----------------------+



                                                                                

In [5]:
# Total number of rows (individual ratings) in the ratings table.
total_ratings_sql = "select count(*) from movieratings"
df = spark.sql(total_ratings_sql)
df.show()

+--------+
|count(1)|
+--------+
|  100000|
+--------+



In [6]:
# One row per movie: how many ratings it received (`count`) and its mean rating
# rounded to 3 decimals (`rating`). Rows are ordered by most-rated first, with ties
# broken by the higher mean rating. The movieid itself is not selected.
rating_count_sql = "select count(movieid) as count, round(avg(rating),3) as rating  \
                    from movieratings group by movieid order by count desc, rating desc"
dfRatingCount = spark.sql(rating_count_sql)
dfRatingCount.show()

+-----+------+
|count|rating|
+-----+------+
|  583| 4.358|
|  509| 3.804|
|  508| 4.156|
|  507| 4.008|
|  485| 3.157|
|  481| 3.657|
|  478| 3.441|
|  452| 3.878|
|  431| 3.631|
|  429| 3.438|
|  420| 4.252|
|  413| 4.283|
|  394| 4.061|
|  392| 3.798|
|  390|  4.29|
|  384| 3.711|
|  378| 3.693|
|  367| 4.204|
|  365|  3.66|
|  350| 4.246|
+-----+------+
only showing top 20 rows



In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# VectorAssembler packs the listed input columns into a single vector column; here
# the lone predictor `count` becomes the one-element vector column `features`.
# Docs: https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
dfAssemblerFeature = VectorAssembler(inputCols=["count"], outputCol="features")

# NOTE: dfRatingCount is rebound to the assembled frame, so this cell is meant to be
# run once after the aggregation query; running it again on its own output fails
# because the `features` output column already exists.
dfRatingCount = dfAssemblerFeature.transform(dfRatingCount)
dfRatingCount.show()

+-----+------+--------+
|count|rating|features|
+-----+------+--------+
|  583| 4.358| [583.0]|
|  509| 3.804| [509.0]|
|  508| 4.156| [508.0]|
|  507| 4.008| [507.0]|
|  485| 3.157| [485.0]|
|  481| 3.657| [481.0]|
|  478| 3.441| [478.0]|
|  452| 3.878| [452.0]|
|  431| 3.631| [431.0]|
|  429| 3.438| [429.0]|
|  420| 4.252| [420.0]|
|  413| 4.283| [413.0]|
|  394| 4.061| [394.0]|
|  392| 3.798| [392.0]|
|  390|  4.29| [390.0]|
|  384| 3.711| [384.0]|
|  378| 3.693| [378.0]|
|  367| 4.204| [367.0]|
|  365|  3.66| [365.0]|
|  350| 4.246| [350.0]|
+-----+------+--------+
only showing top 20 rows



In [8]:
# Keep only what the model needs: the feature vector and the label (mean rating).
model_columns = ["features", "rating"]
dfRatingCount = dfRatingCount.select(*model_columns)
dfRatingCount.show()

+--------+------+
|features|rating|
+--------+------+
| [583.0]| 4.358|
| [509.0]| 3.804|
| [508.0]| 4.156|
| [507.0]| 4.008|
| [485.0]| 3.157|
| [481.0]| 3.657|
| [478.0]| 3.441|
| [452.0]| 3.878|
| [431.0]| 3.631|
| [429.0]| 3.438|
| [420.0]| 4.252|
| [413.0]| 4.283|
| [394.0]| 4.061|
| [392.0]| 3.798|
| [390.0]|  4.29|
| [384.0]| 3.711|
| [378.0]| 3.693|
| [367.0]| 4.204|
| [365.0]|  3.66|
| [350.0]| 4.246|
+--------+------+
only showing top 20 rows



In [9]:
from pyspark.ml.stat import ChiSquareTest

# Run a chi-square independence test of the feature vector against the label and
# pull the single result row back to the driver with head().
# NOTE(review): ChiSquareTest treats feature and label values as categorical, while
# `count` and `rating` here are numeric aggregates -- confirm this test is the
# intended one before interpreting the p-value.
r = ChiSquareTest.test(dfRatingCount, "features", "rating").head()

print(f"pValues: {r.pValues}")
print(f"degreesOfFreedom: {r.degreesOfFreedom}")
print(f"statistics: {r.statistics}")

pValues: [0.0]
degreesOfFreedom: [213248]
statistics: [339830.8603909695]


In [10]:
# Linear regression of the mean rating (label) on the number of ratings (feature).
# Docs: https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
lr = LinearRegression(
    maxIter=10,
    featuresCol="features",
    labelCol="rating",
    predictionCol="prediction",
)

# Fit on the full aggregated frame (no train/test split is made in this notebook).
lrModel = lr.fit(dfRatingCount)

# There is a single feature, so coefficients[0] is the slope for `count`.
print(
    f"Coefficients: {lrModel.coefficients[0]:.5f}",
    f"Intercept: {lrModel.intercept:.5f}",
    sep="\n",
)

23/10/26 15:17:04 WARN Instrumentation: [13db7844] regParam is zero, which might cause numerical instability and overfitting.


Coefficients: 0.00418
Intercept: 2.82766


In [11]:
# Score the training data with the fitted model and show the first 100 rows.
# dfRatingCount is rebound to the scored frame, so on a re-run of this cell it already
# carries a "prediction" column and transform() would fail with "Column prediction
# already exists". Dropping that column first keeps the cell re-runnable; drop() is a
# no-op when the column is absent, so the first run behaves exactly as before.
dfRatingCount = lrModel.transform(dfRatingCount.drop("prediction"))
dfRatingCount.show(100)

+--------+------+------------------+
|features|rating|        prediction|
+--------+------+------------------+
| [583.0]| 4.358| 5.263360163655971|
| [509.0]| 3.804|  4.95419766550137|
| [508.0]| 4.156| 4.950019793904686|
| [507.0]| 4.008| 4.945841922308002|
| [485.0]| 3.157| 4.853928747180959|
| [481.0]| 3.657| 4.837217260794223|
| [478.0]| 3.441| 4.824683646004172|
| [452.0]| 3.878|4.7160589844903935|
| [431.0]| 3.631| 4.628323680960033|
| [429.0]| 3.438| 4.619967937766665|
| [420.0]| 4.252|4.5823670933965115|
| [413.0]| 4.283| 4.553121992219724|
| [394.0]| 4.061| 4.473742431882732|
| [392.0]| 3.798| 4.465386688689365|
| [390.0]|  4.29|4.4570309454959975|
| [384.0]| 3.711| 4.431963715915894|
| [378.0]| 3.693| 4.406896486335792|
| [367.0]| 4.204| 4.360939898772269|
| [365.0]|  3.66| 4.352584155578902|
| [350.0]| 4.246| 4.289916081628645|
| [350.0]| 3.834| 4.289916081628645|
| [344.0]| 3.314| 4.264848852048543|
| [336.0]| 4.045| 4.231425879275072|
| [331.0]| 3.931| 4.210536521291653|
|

In [12]:
# Pull the fit statistics computed on the training data and report the root mean
# squared error and the coefficient of determination, each to four decimals.
trainingSummary = lrModel.summary
print(
    "\n".join(
        [
            f"RMSE: {trainingSummary.rootMeanSquaredError:.4f}",
            f"r2: {trainingSummary.r2:.4f}",
        ]
    )
)

RMSE: 0.7056
r2: 0.1846


In [13]:
# Stop the session (and its underlying SparkContext) now that the notebook is finished.
spark.stop()