# Advanced Database Project
### Group 22
* Enzo Chatalov - fc54414
* Agnieszka Radomska - fc64357
* Duarte Gonçalves - fc64465
* Tommaso Tragno - fc64699

In [None]:
import pandas as pd
import json
from pymongo import MongoClient
import mysql.connector
import time
from sqlalchemy import create_engine,text
from sqlalchemy.exc import PendingRollbackError

## Configuration file

Create a `config.json` file in the notebook's working directory with the following structure:

```
{
    "mongo": {
        "username": "your_mongo_username",
        "password": "your_mongo_password",
        "host": "your_mongo_host",
        "port": "your_mongo_port"
    },
    "mysql": {
        "username": "your_mysql_username",
        "password": "your_mysql_password",
        "host": "your_mysql_host",
        "port": "your_mysql_port"
    }
}
```

In [5]:
# Read the MongoDB and MySQL connection settings from config.json
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Each backend section exposes the same four settings; unpack them per backend
_CONNECTION_KEYS = ("username", "password", "host", "port")
mongo_username, mongo_password, mongo_host, mongo_port = (
    config["mongo"][key] for key in _CONNECTION_KEYS
)
mysql_username, mysql_password, mysql_host, mysql_port = (
    config["mysql"][key] for key in _CONNECTION_KEYS
)

In [None]:
# Directory that holds Books.csv, Ratings.csv and Users.csv, which are read below.
# NOTE(review): looks like a local kagglehub download of the dataset — confirm
# the location before running.
path = './kagglehub/datasets/arashnic/book-recommendation-dataset/versions/3'

## Data validation
1. Load the `.csv` files from the specified path;
2. Drop the rows that do not contain a primary key;
3. Fill the `NA` cells with a predefined value;
4. Drop any duplicated rows;
5. Convert the string data into the proper data type.

In [None]:
# Load the three CSV files of the dataset into pandas data frames
df_books = pd.read_csv(f'{path}/Books.csv')
df_ratings = pd.read_csv(f'{path}/Ratings.csv')
df_users = pd.read_csv(f'{path}/Users.csv')


def _print_na_report(title):
    """Print *title*, then whether each data frame still has a missing cell."""
    print(title)
    for label, frame in (('Books', df_books), ('Ratings', df_ratings), ('Users', df_users)):
        print(f'{label} data frame: {frame.isna().any().any()}')


def _drop_rows_without_key(frame, key_columns, label):
    """Return *frame* without rows missing any of *key_columns*; print how many were dropped."""
    kept = frame.dropna(subset=key_columns)
    print(f'Dropped {len(frame) - len(kept)} rows from {label}')
    return kept


def _drop_duplicated_rows(frame, label):
    """Return *frame* without fully identical rows; print how many were dropped."""
    kept = frame.drop_duplicates()
    print(f'Dropped {len(frame) - len(kept)} duplicated rows from {label}')
    return kept


def _as_int(column):
    """Parse *column* as integers; values that cannot be parsed become 0."""
    return pd.to_numeric(column, errors='coerce').fillna(0).astype(int)


_print_na_report('Check NA values presence before data validation:')

# Rows without a primary key cannot be stored, so they are dropped; every other
# missing cell gets a placeholder. Numeric columns are filled with the number 0
# (not the string '0') so the fill itself does not introduce mixed-type values.
print('\nCheck missing values for primary key columns:')
df_users = _drop_rows_without_key(df_users, ['User-ID'], 'Users')
df_users = df_users.fillna({'Location': 'not available', 'Age': 0})

df_books = _drop_rows_without_key(df_books, ['ISBN'], 'Books')
df_books = df_books.fillna({
    'Book-Title': 'not available',
    'Book-Author': 'not available',
    'Year-Of-Publication': 0,
    'Publisher': 'not available',
    'Image-URL-S': 'not available',
    'Image-URL-M': 'not available',
    'Image-URL-L': 'not available',
})

df_ratings = _drop_rows_without_key(df_ratings, ['User-ID', 'ISBN'], 'Ratings')
df_ratings = df_ratings.fillna({'Book-Rating': 0})

# Only rows that are identical in every column count as duplicates here
print('\nCheck duplicated rows:')
df_users = _drop_duplicated_rows(df_users, 'Users')
df_books = _drop_duplicated_rows(df_books, 'Books')
df_ratings = _drop_duplicated_rows(df_ratings, 'Ratings')

# Data type conversion: numeric columns become plain integers
df_users['User-ID'] = _as_int(df_users['User-ID'])
df_users['Age'] = _as_int(df_users['Age'])

df_ratings['User-ID'] = _as_int(df_ratings['User-ID'])
df_ratings['Book-Rating'] = _as_int(df_ratings['Book-Rating'])

df_books['Year-Of-Publication'] = _as_int(df_books['Year-Of-Publication'])

_print_na_report('\nCheck NA values presence after data validation:')

Check NA values presence before data validation:
Books data frame: True
Ratings data frame: False
Users data frame: True

Check missing values for primary key columns:
Dropped 0 rows form Users
Dropped 0 rows from Books
Dropped 0 rows from Ratings

Check duplicated rows:
Dropped 0 duplicated rows form Users
Dropped 0 duplicated rows form Books
Dropped 0 duplicated rows form Ratings

Check NA values presence after data validation:
Books data frame: False
Ratings data frame: False
Users data frame: False


# MongoDB
## Connect to and populate the NoSQL database

In [6]:
# Open a MongoDB client for the host and port given in the configuration
mongo_uri = f'mongodb://{mongo_host}:{mongo_port}'
client = MongoClient(mongo_uri, username=mongo_username, password=mongo_password)

In [None]:
# Remove the "project" database so the cells below repopulate it from scratch
client.drop_database("project")

In [None]:
# Handles for the database and its three collections
db = client["project"]
books, ratings, users = db["books"], db["ratings"], db["users"]

# Store every data frame row as one document in the matching collection
for collection, frame in ((books, df_books), (ratings, df_ratings), (users, df_users)):
    documents = frame.to_dict(orient="records")
    collection.insert_many(documents, ordered=False)

# Give every book document a Global_Rating field, starting at 0.00
books.update_many({}, {"$set": {"Global_Rating": 0.00}})

print("Data inserted into MongoDB collections successfully.")

## Queries
### Simple 
#### 1- All books published in the year 2000

In [None]:
year = 2000
# One filter document serves both the count and the listing
published_in_year = {"Year-Of-Publication": year}
books_in_year = books.find(published_in_year)
total_in_year = books.count_documents(published_in_year)
print(f"Total Number of Books Published in the year {year}: {total_in_year}")
for document in books_in_year:
    print(f"ISBN: {document.get('ISBN')}, Book Title: {document.get('Book-Title')}")

#### 2- All users that are older than 30 years old

In [None]:
age = 30
# One filter document serves both the count and the listing
older_than_age = {"Age": {"$gt": age}}
users_older_than_30 = users.find(older_than_age)
# Count outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
total_older = users.count_documents(older_than_age)
print(f"Total Number of Users older than {age}: {total_older}")
for user in users_older_than_30:
    userID = user.get("User-ID")
    print(f"User ID: {userID}")

### Complex
#### 1- Update all ratings from UserID "276890" to 8

In [None]:
userID = 276890
user_filter = {"User-ID": userID}


def _print_user_ratings(heading):
    """Print *heading* followed by every rating document of the selected user."""
    print(heading)
    for document in ratings.find(user_filter):
        print(document)


_print_user_ratings("Ratings before the update:")

# Overwrite the rating value on every document of this user
ratings.update_many(user_filter, {"$set": {"Book-Rating": 8}})

_print_user_ratings("Ratings after the update:")

#### 2- Add a new field to the books collection with the mean rating of every book

In [None]:
from pymongo import UpdateOne

# Average the ratings per ISBN and round the result to two decimal places
pipeline = [
    {
        "$group": {
            "_id": "$ISBN",  # Group by ISBN
            "average_rating": {"$avg": "$Book-Rating"}  # Calculate the average rating
        }
    },
    {
        "$project": {
            "_id": 1,
            "average_rating": {"$round": ["$average_rating", 2]}  # Round to 2 decimal places
        }
    }
]
average_ratings = list(ratings.aggregate(pipeline))

# Send all updates in one bulk request instead of one round trip per ISBN.
# Each operation updates the first book document with the matching ISBN.
rating_updates = [
    UpdateOne(
        {"ISBN": record["_id"]},
        {"$set": {"Global_Rating": record["average_rating"]}},
    )
    for record in average_ratings
]

# bulk_write rejects an empty operation list, so skip it when there are no ratings
if rating_updates:
    books.bulk_write(rating_updates, ordered=False)

print("Global ratings added to the books collection successfully.")


[{'_id': '088419342X', 'average_rating': 0.0}, {'_id': '8427003617', 'average_rating': 7.0}, {'_id': '081603494X', 'average_rating': 10.0}, {'_id': '0373833520', 'average_rating': 0.0}, {'_id': '0609810200', 'average_rating': 5.0}, {'_id': '221360875X', 'average_rating': 5.0}, {'_id': '0449214710', 'average_rating': 7.0}, {'_id': '1561708453', 'average_rating': 7.0}, {'_id': '0393850013', 'average_rating': 0.0}, {'_id': '3379015180', 'average_rating': 4.38}, {'_id': '0747238855', 'average_rating': 0.0}, {'_id': '0385486685', 'average_rating': 0.0}, {'_id': '0749932120', 'average_rating': 4.5}, {'_id': '0439516838', 'average_rating': 0.0}, {'_id': '0517029626', 'average_rating': 10.0}]


# MySQL
## Connect to the MySQL database, create the schema and populate the tables

In [16]:
# Connect to mySQL locally
mydb = mysql.connector.connect(
    host=mysql_host,
    port=mysql_port,
    user=mysql_username,
    password=mysql_password
)

cursor = mydb.cursor()

In [None]:
# Remove the "project" database, if it exists, so the schema below is rebuilt from scratch
cursor.execute("DROP DATABASE IF EXISTS project")

In [None]:
cursor.execute("CREATE DATABASE IF NOT EXISTS project")
cursor.execute("USE project")

# users and books are created first because ratings references both of them
cursor.execute("""
    CREATE TABLE IF NOT EXISTS users (
        user_id INT PRIMARY KEY,
        location VARCHAR(255),
        age INT
    )
""")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS books (
        ISBN VARCHAR(20) PRIMARY KEY,
        Book_Title VARCHAR(255),
        Book_Author VARCHAR(255),
        Year_Of_Publication INT,
        Publisher VARCHAR(255),
        Image_URL_S VARCHAR(255),
        Image_URL_M VARCHAR(255),
        Image_URL_L VARCHAR(255),
        Global_Rating DECIMAL(4,2)
    )
""")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS ratings (
        User_ID INT,
        ISBN VARCHAR(20),
        Book_Rating INT,
        FOREIGN KEY (User_ID) REFERENCES users(user_id),
        FOREIGN KEY (ISBN) REFERENCES books(ISBN),
        PRIMARY KEY (User_ID, ISBN)
    )
""")


def _insert_rows(statement, frame, columns):
    """Run *statement* once per row of *frame*, binding the values of *columns* in order.

    A row that MySQL rejects is printed together with the error and skipped, so
    one problematic row does not abort the whole load. Only database errors are
    caught; anything else (for example a keyboard interrupt) still propagates.
    """
    for _, row in frame.iterrows():
        try:
            cursor.execute(statement, tuple(row[column] for column in columns))
        except mysql.connector.Error as err:
            print(f"Failed to insert row ({err}):")
            print(row)


# Parent tables are filled before ratings so its foreign keys can resolve
_insert_rows(
    "INSERT IGNORE INTO books (ISBN, Book_Title, Book_Author, Year_Of_Publication, Publisher, Image_URL_S, Image_URL_M, Image_URL_L) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
    df_books,
    ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
)

_insert_rows(
    "INSERT IGNORE INTO users (user_id, location, age) VALUES (%s, %s, %s)",
    df_users,
    ['User-ID', 'Location', 'Age'],
)

_insert_rows(
    "INSERT IGNORE INTO ratings (User_ID, ISBN, Book_Rating) VALUES (%s, %s, %s)",
    df_ratings,
    ['User-ID', 'ISBN', 'Book-Rating'],
)

mydb.commit()

print("Data inserted successfully.")



## Queries
### Simple
#### 1- All books published in the year 2000

In [None]:
year = 2000
# The year is bound as a query parameter instead of being written into the SQL text
cursor.execute("SELECT ISBN, Book_Title FROM books WHERE Year_Of_Publication = %s", (year,))
sql_books_in_year = cursor.fetchall()
# The previous message described the "users older than 30" query by mistake
print(f'There are {len(sql_books_in_year)} books published in the year {year}:')
for bookISBN, bookTitle in sql_books_in_year:
    print(f"ISBN: {bookISBN}, Book Title: {bookTitle}")

#### 2- All users that are older than 30 years old

In [None]:
# Fetch the ids of all users whose age is above 30
cursor.execute("SELECT user_id FROM users WHERE age > 30")
sql_users_above_30 = cursor.fetchall()
matches = len(sql_users_above_30)
print(f'There are {matches} users older than 30 years:')
# Every fetched row is a one-element tuple holding the user id
for (user_id,) in sql_users_above_30:
    print(f"UserID: {user_id}")

### Complex
#### 1- Update all ratings from UserID "276890" to 8

In [None]:
userID = 276890
new_rating = 8


def _print_user_ratings(heading):
    """Print *heading* followed by every ratings row of the selected user."""
    print(heading)
    cursor.execute("SELECT * FROM ratings WHERE User_ID = %s", (userID,))
    for row in cursor.fetchall():
        print(row)


_print_user_ratings("Ratings before the update:")

# Overwrite the rating on every row of this user and persist the change
cursor.execute("UPDATE ratings SET Book_Rating = %s WHERE User_ID = %s", (new_rating, userID))
mydb.commit()

_print_user_ratings("Ratings after the update:")


#### 2- Add a new column to the Books table with the mean rating of every book

In [None]:
# Average the ratings per ISBN, rounded to two decimal places
cursor.execute("""
    SELECT 
        ISBN, 
        ROUND(AVG(`Book_Rating`), 2) AS average_rating
    FROM ratings
    GROUP BY ISBN
""")

# Fetch all results before reusing the cursor for the updates
average_ratings = cursor.fetchall()

# Update the books table with the Global_Rating
update_query = """
    UPDATE books 
    SET Global_Rating = %s 
    WHERE ISBN = %s
"""

for isbn, avg_rating in average_ratings:
    cursor.execute(update_query, (avg_rating, isbn))

# Commit on the MySQL connection (mydb); `db` is the MongoDB database handle
# and cannot commit these updates.
mydb.commit()

# Time Comparison

In [None]:
# Time the same lookup (all ratings of user 1) on both databases.
# perf_counter is used because it is meant for measuring short intervals.
start_time = time.perf_counter()
cursor.execute("SELECT * FROM ratings WHERE User_ID = 1")
cursor.fetchall()
end_time = time.perf_counter()
print("MySQL query time:", end_time - start_time)

start_time = time.perf_counter()
# The MongoDB documents keep the CSV column name "User-ID" (with a hyphen);
# filtering on "User_ID" matches nothing, which makes the timing meaningless.
list(ratings.find({"User-ID": 1}))
end_time = time.perf_counter()
print("MongoDB query time:", end_time - start_time)