In [2]:
import findspark
# Called in its own cell ahead of the pyspark imports that follow in the next
# cell. Presumably this locates the local Spark installation and makes pyspark
# importable -- confirm against the findspark documentation.
findspark.init()

In [32]:
############################TASK 1####################################################

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# Session handle used for every read in this notebook.
spark = SparkSession.builder.getOrCreate()

# Schemas: two integer key columns followed by one float value column.
# NOTE(review): every field is declared nullable=False, yet rows with nulls are
# dropped further down -- confirm how the CSV reader treats this flag.
ratingSchema = (
    StructType()
    .add("userID", IntegerType(), nullable=False)
    .add("FilmID", IntegerType(), nullable=False)
    .add("rating_value", FloatType(), nullable=False)
)

trustSchema = (
    StructType()
    .add("Trustor", IntegerType(), nullable=False)
    .add("Trustee", IntegerType(), nullable=False)
    .add("Trust_Value", FloatType(), nullable=False)
)


def _read_space_delimited(path, schema):
    """Load a headerless, space-separated text file using the given schema."""
    return spark.read.csv(path, sep=' ', header=False, schema=schema)


# Load each data set, preview it, and record its raw row count.
ratingsDF = _read_space_delimited("ratings.txt", ratingSchema)
ratingsDF.show()
ratingCount = ratingsDF.count()

trustDF = _read_space_delimited("trust.txt", trustSchema)
trustDF.show()
trustCount = trustDF.count()

print(ratingCount, trustCount)

# Stage 1: discard every row that contains a null, then recount.
ratingsDF = ratingsDF.na.drop()
ratingAfterNullRemoved = ratingsDF.count()
trustDF = trustDF.na.drop()
trustAfterNullRemoved = trustDF.count()
print("Original ratings count: ", ratingCount)
print("New ratings count after removing nulls: ", ratingAfterNullRemoved)
print("Original trust count: ", trustCount)
print("New trust count after removing nulls: ", trustAfterNullRemoved)

# Stage 2: discard fully identical rows, then report how many were removed.
ratingsDF = ratingsDF.dropDuplicates()
ratingAfterDuplicate = ratingsDF.count()
ratingDiff = ratingAfterNullRemoved - ratingAfterDuplicate
trustDF = trustDF.dropDuplicates()
trustAfterDuplicate = trustDF.count()
trustDiff = trustAfterNullRemoved - trustAfterDuplicate
print("Ratings count before removing duplicates: ", ratingAfterNullRemoved)
print(f"Ratings count after removing duplicates: {ratingAfterDuplicate} ({ratingDiff})")
print("Trust count before removing duplicates: ", trustAfterNullRemoved)
print(f"Trust count after removing duplicates: {trustAfterDuplicate} ({trustDiff})")






+------+------+------------+
|userID|FilmID|rating_value|
+------+------+------------+
|     1|     1|         2.0|
|     1|     2|         4.0|
|     1|     3|         3.5|
|     1|     4|         3.0|
|     1|     5|         4.0|
|     1|     6|         3.5|
|     1|     7|         3.5|
|     1|     8|         3.0|
|     1|     9|         2.5|
|     1|    10|         4.0|
|     1|    11|         4.0|
|     1|    12|         4.0|
|     2|    13|         2.0|
|     3|    14|         0.5|
|     3|    15|         1.0|
|     3|    16|         4.0|
|     3|    17|         3.0|
|     3|    18|         4.0|
|     3|    19|         4.0|
|     3|    20|         2.5|
+------+------+------------+
only showing top 20 rows

+-------+-------+-----------+
|Trustor|Trustee|Trust_Value|
+-------+-------+-----------+
|      2|    966|        1.0|
|      2|    104|        1.0|
|      5|   1509|        1.0|
|      6|   1192|        1.0|
|      7|   1510|        1.0|
|     12|    234|        1.0|
|     15

In [50]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the Spark DataFrames into pandas DataFrames. Everything below
# (seaborn plots, value_counts, describe) must operate on these pandas copies:
# indexing a Spark DataFrame yields a Column expression, not data.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'distutils'". Presumably one of the
# installed libraries still imports distutils, which Python 3.12 removed --
# confirm which package raises and upgrade it (or install setuptools).
ratingspDF = ratingsDF.toPandas()
trustpDF = trustDF.toPandas()


def _plot_distribution(values, bins, title, xlabel):
    """Draw a histogram with a KDE overlay for one pandas Series."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(values, bins=bins, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    plt.show()


def _plot_top_counts(counts, title, xlabel, ylabel):
    """Draw a bar chart for a value_counts() Series, highest count first."""
    fig, ax = plt.subplots(figsize=(10, 6))
    # The IDs are integers; cast them to str so seaborn treats them as plain
    # categories and keeps the descending-count order instead of re-sorting
    # the bars by numeric ID.
    sns.barplot(x=counts.index.astype(str), y=counts.values, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()


#distribution of ratings
_plot_distribution(ratingspDF['rating_value'], 20, "Distribution of Ratings", "Rating")

#top 10 users by number of ratings
topUsers = ratingspDF['userID'].value_counts().head(10)
_plot_top_counts(topUsers, "Top 10 users by number of ratings", "User ID", "Number of ratings")

#top 10 movies by number of ratings
topMovies = ratingspDF['FilmID'].value_counts().head(10)
_plot_top_counts(topMovies, "Top 10 movies by number of ratings", "Movie ID", "Number of Ratings")

#summary statistics for trust data
# describe() on the pandas frame returns an evaluated table, so print shows
# the actual statistics rather than a lazy Spark DataFrame repr.
trustSummary = trustpDF.describe()
print("\nTrust Summary:")
print(trustSummary)

#distribution of trust values
_plot_distribution(trustpDF['Trust_Value'], 10, "Distribution of trust values", "Trust values")

ModuleNotFoundError: No module named 'distutils'