# Advanced Database Project
### Group 22
* Tommaso Tragno - fc64699
* Enzo Chatalov - fc54414
* Duarte Alexandre Pedro Gonçalves - fc64465
* Agnieszka Radomska - fc64357

In [24]:
import pandas as pd
import json
from pymongo import MongoClient
import mysql.connector
import time
from sqlalchemy import create_engine,text
from sqlalchemy.exc import PendingRollbackError
import kagglehub

## Configuration file

create a `config.json` file with the following structure:

```
{
    "mongo": {
        "username": "your_mongo_username",
        "password": "your_mongo_password",
        "host": "your_mongo_host",
        "port": "your_mongo_port"
    },
    "mysql": {
        "username": "your_mysql_username",
        "password": "your_mysql_password",
        "host": "your_mysql_host",
        "port": "your_mysql_port"
    }
}
```

In [25]:
# load configuration file with password for mongoDB and mySQL
with open('config.json', 'r') as f:
    config = json.load(f)

# Extract username and password for MongoDB and MySQL
mongo_username = config["mongo"]["username"]
mongo_password = config["mongo"]["password"]
mongo_host = config["mongo"]["host"]
mongo_port = config["mongo"]["port"]
mysql_username = config["mysql"]["username"]
mysql_password = config["mysql"]["password"]
mysql_host = config["mysql"]["host"]
mysql_port = config["mysql"]["port"]

In [26]:
# Download latest dataset version for books recommendation
# path = kagglehub.dataset_download("arashnic/book-recommendation-dataset")

# Manual path specification
path = './kagglehub/datasets/arashnic/book-recommendation-dataset/versions/3'

## Data validation
1. Load the `.csv` file from the path specified;
2. Drop the rows that do not contains a primary key
3. Fill the `na` cells with a predefined value
4. Drop eventualy doplicates
5. Convert the string data into the proper data type

In [27]:
# Load dataset into pandas dataframe
df_books = pd.read_csv(f'{path}/Books.csv')
df_ratings = pd.read_csv(f'{path}/Ratings.csv')
df_users = pd.read_csv(f'{path}/Users.csv')

print('Check NA values presence before data validation')
print(f'Books data frame: {df_books.isna().any().any()}')
print(f'Ratings data frame: {df_ratings.isna().any().any()}')
print(f'Users data frame: {df_users.isna().any().any()}')


df_users = df_users.dropna(subset=['User-ID'])
df_users = df_users.fillna({'Location': 'not available', 'Age': '0'})
df_books = df_books.dropna(subset=['ISBN'])
df_books = df_books.fillna({
    'Book-Title': 'not available', 
    'Book-Author': 'not available', 
    'Year-Of-Publication': '0',
    'Publisher': 'not available', 
    'Image-URL-S': 'not available', 
    'Image-URL-M': 'not available', 
    'Image-URL-L': 'not available'
})
df_ratings = df_ratings.dropna(subset=['User-ID', 'ISBN'])
df_ratings = df_ratings.fillna({'Book-Rating': '0'})

# Tu wlatuje FIX3
df_users = df_users.drop_duplicates()
df_books = df_books.drop_duplicates()
df_ratings = df_ratings.drop_duplicates()

# data type conversion
df_users['User-ID'] = pd.to_numeric(df_users['User-ID'], errors='coerce').fillna(0).astype(int)
df_users['Age'] = pd.to_numeric(df_users['Age'], errors='coerce').fillna(0).astype(int)

df_ratings['User-ID'] = pd.to_numeric(df_ratings['User-ID'], errors='coerce').fillna(0).astype(int)
df_ratings['Book-Rating'] = pd.to_numeric(df_ratings['Book-Rating'], errors='coerce').fillna(0).astype(int)

df_books['Year-Of-Publication'] = pd.to_numeric(df_books['Year-Of-Publication'], errors='coerce').fillna(0).astype(int)


print('Check NA values presence after data validation')
print(f'Books data frame: {df_books.isna().any().any()}')
print(f'Ratings data frame: {df_ratings.isna().any().any()}')
print(f'Users data frame: {df_users.isna().any().any()}')

Check NA values presence before data validation
Books data frame: True
Ratings data frame: False
Users data frame: True
Check NA values presence after data validation
Books data frame: False
Ratings data frame: False
Users data frame: False


# MongoDB
## Connects and populate the No-SQL database

In [28]:
# Connect to MongoDB locally
client = MongoClient(f'mongodb://{mongo_host}:{mongo_port}',
                             username = mongo_username,
                             password = mongo_password)

In [29]:
client.drop_database("project")

In [30]:
db = client["project"]

books = db["books"]
ratings = db["ratings"]
users = db["users"]

books.insert_many(df_books.to_dict(orient="records"), ordered=False)
ratings.insert_many(df_ratings.to_dict(orient="records"), ordered=False)
users.insert_many(df_users.to_dict(orient="records"), ordered=False)

print("Data inserted into MongoDB collections successfully.")

Data inserted into MongoDB collections successfully.


### Queries

In [44]:
year = 2000
books_in_year = books.find({"Year-Of-Publication": year})
print(f'Books published in the year {str(year)}:')
for book in books_in_year:
    print(book)

Books published in the year 2000:
{'_id': ObjectId('673cac4b73459dfbc018537f'), 'ISBN': '0425176428', 'Book-Title': "What If?: The World's Foremost Military Historians Imagine What Might Have Been", 'Book-Author': 'Robert Cowley', 'Year-Of-Publication': 2000, 'Publisher': 'Berkley Publishing Group', 'Image-URL-S': 'http://images.amazon.com/images/P/0425176428.01.THUMBZZZ.jpg', 'Image-URL-M': 'http://images.amazon.com/images/P/0425176428.01.MZZZZZZZ.jpg', 'Image-URL-L': 'http://images.amazon.com/images/P/0425176428.01.LZZZZZZZ.jpg'}
{'_id': ObjectId('673cac4b73459dfbc0185384'), 'ISBN': '080652121X', 'Book-Title': "Hitler's Secret Bankers: The Myth of Swiss Neutrality During the Holocaust", 'Book-Author': 'Adam Lebor', 'Year-Of-Publication': 2000, 'Publisher': 'Citadel Press', 'Image-URL-S': 'http://images.amazon.com/images/P/080652121X.01.THUMBZZZ.jpg', 'Image-URL-M': 'http://images.amazon.com/images/P/080652121X.01.MZZZZZZZ.jpg', 'Image-URL-L': 'http://images.amazon.com/images/P/080652

In [None]:
ratings.update_one(
    {"User_ID": 1, "ISBN": "1234567890"},
    {"$set": {"Book_Rating": 5}}
)
print("Rating updated for User_ID 1 and ISBN 1234567890.")

Rating updated for User_ID 1 and ISBN 1234567890.


### Indexes

In [33]:
ratings.create_index([("User_ID", 1)])
ratings.create_index([("ISBN", 1)])
print("Indexes added to MongoDB ratings collection.")

Indexes added to MongoDB ratings collection.


# MySQL
## Connects to MySql database, create the schema and populate the tables

In [34]:
# Connect to mySQL locally
mydb = mysql.connector.connect(
    host=mysql_host,
    port=mysql_port,
    user=mysql_username,
    password=mysql_password
)

cursor = mydb.cursor()

In [35]:
cursor.execute("DROP DATABASE IF EXISTS project")

In [36]:
cursor.execute("CREATE DATABASE IF NOT EXISTS project")
cursor.execute("USE project")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS users (
        user_id INT PRIMARY KEY,
        location VARCHAR(255),
        age INT
    )
""")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS books (
        ISBN VARCHAR(20) PRIMARY KEY,
        Book_Title VARCHAR(255),
        Book_Author VARCHAR(255),
        Year_Of_Publication INT,
        Publisher VARCHAR(255),
        Image_URL_S VARCHAR(255),
        Image_URL_M VARCHAR(255),
        Image_URL_L VARCHAR(255),
        Global_Rating FLOAT
    )
""")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS ratings (
        User_ID INT,
        ISBN VARCHAR(20),
        Book_Rating INT,
        FOREIGN KEY (User_ID) REFERENCES users(user_id),
        FOREIGN KEY (ISBN) REFERENCES books(ISBN),
        PRIMARY KEY (User_ID, ISBN)
    )
""")

# added try - except statement to catch the problematic rows
for _, row in df_books.iterrows():
    try:
        cursor.execute(
            "INSERT IGNORE INTO books (ISBN, Book_Title, Book_Author, Year_Of_Publication, Publisher, Image_URL_S, Image_URL_M, Image_URL_L) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (row['ISBN'], row['Book-Title'], row['Book-Author'], row['Year-Of-Publication'], row['Publisher'], row['Image-URL-S'], row['Image-URL-M'], row['Image-URL-L'])
        )
    except:
        print(row)

for _, row in df_users.iterrows():
    try:
        cursor.execute(
            "INSERT IGNORE INTO users (user_id, location, age) VALUES (%s, %s, %s)",
            (row['User-ID'], row['Location'], row['Age']) # Tutaj wleciał FIX1
        )
    except:
        print(row)

for _, row in df_ratings.iterrows():
    try:
        cursor.execute(
            "INSERT IGNORE INTO ratings (User_ID, ISBN, Book_Rating) VALUES (%s, %s, %s)",
            (row['User-ID'], row['ISBN'], row['Book-Rating'])
        )
    except:
        print(row)

mydb.commit()

print("Data inserted successfully.")



Data inserted successfully.


### Queries

In [40]:
cursor.execute("SELECT user_id FROM users WHERE age > 30")
users_above_30 = cursor.fetchall()
print(f'There are {len(users_above_30)} users older than 30 years:')
for user in users_above_30:
    print(user)

There are 91816 users older than 30 years:
(6,)
(21,)
(25,)
(27,)
(33,)
(38,)
(40,)
(44,)
(46,)
(51,)
(54,)
(63,)
(64,)
(67,)
(70,)
(72,)
(75,)
(85,)
(89,)
(90,)
(93,)
(94,)
(99,)
(100,)
(103,)
(104,)
(105,)
(112,)
(114,)
(117,)
(119,)
(124,)
(125,)
(129,)
(132,)
(133,)
(139,)
(144,)
(148,)
(157,)
(158,)
(164,)
(165,)
(168,)
(172,)
(174,)
(176,)
(177,)
(182,)
(189,)
(190,)
(191,)
(196,)
(199,)
(206,)
(210,)
(216,)
(218,)
(221,)
(228,)
(242,)
(245,)
(251,)
(252,)
(255,)
(259,)
(260,)
(261,)
(281,)
(284,)
(299,)
(301,)
(302,)
(308,)
(311,)
(317,)
(319,)
(320,)
(328,)
(331,)
(332,)
(341,)
(347,)
(353,)
(360,)
(362,)
(367,)
(368,)
(374,)
(376,)
(378,)
(379,)
(388,)
(390,)
(392,)
(397,)
(399,)
(404,)
(406,)
(409,)
(410,)
(414,)
(416,)
(421,)
(425,)
(431,)
(433,)
(436,)
(438,)
(443,)
(444,)
(450,)
(451,)
(454,)
(455,)
(456,)
(457,)
(459,)
(471,)
(475,)
(477,)
(480,)
(482,)
(483,)
(485,)
(486,)
(492,)
(496,)
(497,)
(499,)
(502,)
(503,)
(505,)
(512,)
(524,)
(526,)
(530,)
(531,)
(532,)
(533,)
(

In [49]:
cursor.execute("""
    INSERT IGNORE INTO books (ISBN, Book_Title, Book_Author, Year_Of_Publication, Publisher, Image_URL_S, Image_URL_M, Image_URL_L)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""", ('1234567890', 'New Sci-Fi Book', 'Author Name', 2023, 'Sci-Fi Publisher', 'url_s', 'url_m', 'url_l'))

cursor.execute("INSERT IGNORE INTO ratings (User_ID, ISBN, Book_Rating) VALUES (%s, %s, %s)", (1, '1234567890', 4.5))
mydb.commit()
print("New book inserted and rated by a user.")

New book inserted and rated by a user.


### Indexes

In [52]:
cursor.execute("CREATE INDEX idx_user_id ON ratings(User_ID)")
cursor.execute("CREATE INDEX idx_isbn ON ratings(ISBN)")
print("Indexes added to Ratings table.")

Indexes added to Ratings table.


# Time Comparison

In [54]:
start_time = time.time()
cursor.execute("SELECT * FROM ratings WHERE User_ID = 1")
cursor.fetchall()
end_time = time.time()
print("MySQL query time:", end_time - start_time)

start_time = time.time()
list(ratings.find({"User_ID": 1}))
end_time = time.time()
print("MongoDB query time:", end_time - start_time)

MySQL query time: 0.011967897415161133
MongoDB query time: 0.0024831295013427734
