## Exercise 2 : Final Inspection

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [4]:
spark = SparkSession.builder.appName("kcore-stage").getOrCreate()

ratings_clean = spark.read.parquet("export_core/pdf_core_ratings.parquet")
books_clean = spark.read.parquet("export_core/pdf_core_books.parquet")

ratings_clean.printSchema()
books_clean.printSchema()

print("rows:", ratings_clean.count())
print("rows:", books_clean.count())

root
 |-- user_id: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- rating: float (nullable = true)

root
 |-- isbn: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- book_author: string (nullable = true)
 |-- year_of_publication: integer (nullable = true)
 |-- publisher: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Language: string (nullable = true)
 |-- Category: string (nullable = true)

rows: 64419
rows: 1148


[x] 1. All `ratings` are within the range [0,10]

In [5]:

ratings_clean.select("rating").summary().show()

invalid_ratings = ratings_clean.filter(
    (F.col("rating") < 0.0) | (F.col("rating") > 10.0) | F.col("rating").isNull()
)

print("Number of invalid ratings in ratings_clean:", invalid_ratings.count())


+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|             64419|
|   mean|2.7266955401356743|
| stddev|3.9078184945141072|
|    min|               0.0|
|    25%|               0.0|
|    50%|               0.0|
|    75%|               7.0|
|    max|              10.0|
+-------+------------------+

Number of invalid ratings in ratings_clean: 0


[x] 2. All `users` with >= 5 `ratings`

In [6]:
user_counts = ratings_clean.groupBy("user_id") \
    .agg(F.count("*").alias("num_ratings"))

user_counts.summary("min", "max", "count").show()

user_counts.orderBy(F.col("num_ratings").asc()).show(10)

users_less_5 = user_counts.filter(F.col("num_ratings") < 5)
print("Users with <5 ratings:", users_less_5.count())

+-------+-------+-----------+
|summary|user_id|num_ratings|
+-------+-------+-----------+
|    min| 100009|          5|
|    max|  99955|        582|
|  count|   3776|       3776|
+-------+-------+-----------+

+-------+-----------+
|user_id|num_ratings|
+-------+-----------+
| 212853|          5|
| 250368|          5|
| 250709|          5|
| 131182|          5|
| 204179|          5|
| 217173|          5|
| 185288|          5|
|  11687|          5|
| 182403|          5|
| 205554|          5|
+-------+-----------+
only showing top 10 rows

Users with <5 ratings: 0


[x] 3. All `books` with >= 5 `ratings`

In [7]:
book_counts = ratings_clean.groupBy("isbn") \
    .agg(F.count("*").alias("num_ratings"))

book_counts.summary("min", "max", "count").show()

book_counts.orderBy(F.col("num_ratings").asc()).show(10)

books_less_5 = book_counts.filter(F.col("num_ratings") < 5)
print("Books with <5 ratings:", books_less_5.count())


+-------+---------+-----------+
|summary|     isbn|num_ratings|
+-------+---------+-----------+
|    min|000649840|          5|
|    max| 99771519|        849|
|  count|     1148|       1148|
+-------+---------+-----------+

+----------+-----------+
|      isbn|num_ratings|
+----------+-----------+
| 679603352|          5|
| 340767936|          5|
| 684844729|          5|
| 871137380|          5|
| 156047624|          5|
| 307132668|          5|
| 911104542|          5|
| 451628276|          5|
|2253049417|          5|
|1563410443|          5|
+----------+-----------+
only showing top 10 rows

Books with <5 ratings: 0


[x] 4. `ISBN` are non-empty strings and in the correct format

In [8]:
# ================= df_core_ratings =================
empty_isbn_ratings = ratings_clean.filter(
    F.col("isbn").isNull() | (F.col("isbn") == "") | (F.length("isbn") == 0)
)
print("df_core_ratings - empty ISBN rows:", empty_isbn_ratings.count())

non_numeric_isbn_ratings = ratings_clean.filter(
    F.col("isbn").rlike("[^0-9]")
)
print("df_core_ratings - ISBN containing non-numeric characters:", non_numeric_isbn_ratings.count())

# ================= df_core_books =================
empty_isbn_books = books_clean.filter(
    F.col("isbn").isNull() | (F.col("isbn") == "") | (F.length("isbn") == 0)
)
print("df_core_books - empty ISBN rows:", empty_isbn_books.count())

non_numeric_isbn_books = books_clean.filter(
    F.col("isbn").rlike("[^0-9]")
)
print("df_core_books - ISBN containing non-numeric characters:", non_numeric_isbn_books.count())


df_core_ratings - empty ISBN rows: 0
df_core_ratings - ISBN containing non-numeric characters: 0
df_core_books - empty ISBN rows: 0
df_core_books - ISBN containing non-numeric characters: 0


[x] 5. No duplicated `User-ID`, `ISBN` rows

In [9]:
total_rows = ratings_clean.count()
distinct_pairs = ratings_clean.select("user_id", "isbn").distinct().count()

print("Total rows in ratings_clean:", total_rows)
print("Distinct (user_id, isbn) pairs:", distinct_pairs)

print("Number of duplicate (user_id, isbn) rows:",
      total_rows - distinct_pairs)


Total rows in ratings_clean: 64419
Distinct (user_id, isbn) pairs: 64419
Number of duplicate (user_id, isbn) rows: 0


[x] 6. Clean `title`, `author`

In [10]:
# Check titles
bad_titles = books_clean.filter(
    (F.col("book_title").isNull()) | (F.col("book_title") == "")
)
print("Books with missing titles (should be 0):", bad_titles.count())

# Check authors
bad_authors = books_clean.filter(
    (F.col("book_author").isNull()) | (F.col("book_author") == "")
)
print("Books with missing authors (should be 0):", bad_authors.count())

# check the number of 'unknown'
books_clean.groupBy("book_title").count().filter(
    F.col("book_title") == "Unknown Title"
).show()

books_clean.groupBy("book_author").count().filter(
    F.col("book_author") == "Unknown Author"
).show()


Books with missing titles (should be 0): 0
Books with missing authors (should be 0): 0
+----------+-----+
|book_title|count|
+----------+-----+
+----------+-----+

+-----------+-----+
|book_author|count|
+-----------+-----+
+-----------+-----+



[x] 7. Consistency check

In [11]:
num_isbn_ratings = ratings_clean.select("isbn").distinct().count()
num_isbn_books = books_clean.select("isbn").distinct().count()

print("Distinct ISBN in ratings_clean:", num_isbn_ratings)
print("Distinct ISBN in books_clean:", num_isbn_books)
print("Consistency:", num_isbn_ratings == num_isbn_books)


Distinct ISBN in ratings_clean: 1148
Distinct ISBN in books_clean: 1148
Consistency: True


## Summary

In [12]:
final_rating_rows = ratings_clean.count()
final_users = ratings_clean.select("user_id").distinct().count()
final_books = ratings_clean.select("isbn").distinct().count()

print("Final ratings_clean rows:", final_rating_rows)
print("Final distinct users:", final_users)
print("Final distinct books:", final_books)

file_path = "data/Books.csv"  
df_raw = spark.read.csv(file_path, header=True, inferSchema=True)
removed_rows = df_raw.count() - final_rating_rows
removed_pct = removed_rows / df_raw.count() * 100

print(f"Removed rows: {removed_rows} ({removed_pct:.2f} % of original)")




Final ratings_clean rows: 64419
Final distinct users: 3776
Final distinct books: 1148
Removed rows: 450920 (87.50 % of original)


### Export `pdf_users_active`

In [None]:
users_active = ratings_clean.groupBy("user_id") \
    .agg(F.count("*").alias("num_ratings"))

pdf_users_active = users_active.toPandas()

import os
import sys

if getattr(sys, 'frozen', False):  
    base_dir = os.path.dirname(sys.executable)
else:  
    base_dir = os.path.dirname(os.path.abspath(__file__))

out_dir = os.path.join(base_dir, "export_core")
os.makedirs(out_dir, exist_ok=True)

output_users_active = os.path.join(out_dir, "pdf_users_active.parquet")

print("Saving ratings to:", output_users_active)

pdf_users_active.to_parquet(output_users_active, index=False)


Saving ratings to: D:\projet_esilv\Mining\export_core\pdf_users_active.parquet


## Final Deliverables
### The cleaned datasets : 
- `ratings_clean`
- `books_clean`
- `users_active`.

In [None]:
print("============================== ratings_clean ==============================")
ratings_clean.show(5, truncate=False)
ratings_clean.printSchema()
print("ratings_clean rows:", ratings_clean.count())

print()
print("============================== books_clean ==============================")
books_clean.show(5, truncate=False)
books_clean.printSchema()
print("books_clean rows:", books_clean.count())

print()
print("============================== users_active ==============================")
users_active.show(5, truncate=False)
users_active.printSchema()
print("users_active count:", users_active.count())

+-------+---------+------+
|user_id|isbn     |rating|
+-------+---------+------+
|100009 |385504209|8.0   |
|100009 |60502258 |6.0   |
|100115 |345465083|0.0   |
|100115 |786868716|10.0  |
|100223 |316789089|9.0   |
+-------+---------+------+
only showing top 5 rows

root
 |-- user_id: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- rating: float (nullable = true)

ratings_clean rows: 64419

+---------+-------------------------------------------------------------+----------------+-------------------+------------------------+-------+--------+--------+
|isbn     |book_title                                                   |book_author     |year_of_publication|publisher               |Summary|Language|Category|
+---------+-------------------------------------------------------------+----------------+-------------------+------------------------+-------+--------+--------+
|000649840|Angelas Ashes                                                |Frank Mccourt   |1994       