<h1 align="center"> Research: MongoDB </h1>

## 1. Preparing the storage cluster for operation

In [1]:
from pymongo import MongoClient

# Connect to the MongoDB server at mongo:27017 and bind `mongodb` to the
# database named in the URI path (testdb). The uuidRepresentation=standard
# option selects the standard UUID encoding for the uuid.UUID values used as ids.
mongodb = MongoClient(
    'mongodb://mongo:27017/testdb?uuidRepresentation=standard',
).get_default_database()

In [2]:
# Schema of a single bookmark entry: an object that must carry a binary film_id.
bookmark_schema = {
    'bsonType': 'object',
    'required': ['film_id'],
    'properties': {'film_id': {'bsonType': 'binData'}},
}

# Schema of a user document: a binary _id plus an array of bookmark entries.
user_schema = {
    'bsonType': 'object',
    'required': ['_id', 'bookmarks'],
    'properties': {
        '_id': {'bsonType': 'binData'},
        'bookmarks': {'bsonType': 'array', 'items': bookmark_schema},
    },
}

# Create the collection with the schema attached as a $jsonSchema validator.
mongodb.create_collection(name='users', validator={'$jsonSchema': user_schema})

Collection(Database(MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True, uuidrepresentation=4), 'testdb'), 'users')

In [3]:
# Schema of a single vote: an object with a binary user_id and a numeric score.
vote_schema = {
    'bsonType': 'object',
    'required': ['user_id', 'score'],
    'properties': {
        'user_id': {'bsonType': 'binData'},
        'score': {'bsonType': 'number'},
    },
}

# Schema of a film document: a binary _id plus an array of votes.
film_schema = {
    'bsonType': 'object',
    'required': ['_id', 'votes'],
    'properties': {
        '_id': {'bsonType': 'binData'},
        'votes': {'bsonType': 'array', 'items': vote_schema},
    },
}

# Create the collection with the schema attached as a $jsonSchema validator.
mongodb.create_collection(name='films', validator={'$jsonSchema': film_schema})

Collection(Database(MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True, uuidrepresentation=4), 'testdb'), 'films')

## 2. Scripts for data generation (movie bookmarks and user ratings)

In [4]:
from random import choice, randrange, sample


def gen_bookmarks(film_ids: list):
    """Build a random list of bookmark sub-documents for one user.

    A random number of distinct films, between 0 and ``len(film_ids) - 1``,
    is drawn from ``film_ids`` without replacement.

    Args:
        film_ids: Candidate film identifiers to pick bookmarks from.

    Returns:
        A list of ``{'film_id': <id>}`` dicts, one per selected film. An empty
        list is returned when ``film_ids`` is empty.
    """
    # randrange(0) raises ValueError, so an empty pool is answered directly.
    if not film_ids:
        return []
    bookmark_count = randrange(len(film_ids))
    return [
        {'film_id': film_id}
        for film_id in sample(population=film_ids, k=bookmark_count)
    ]


def gen_votes(user_ids: list):
    """Build a random list of vote sub-documents for one film.

    A random number of distinct users, between 0 and ``len(user_ids) - 1``,
    is drawn from ``user_ids`` without replacement; each selected user gets a
    score of either 0 or 10.

    Args:
        user_ids: Candidate user identifiers to pick voters from.

    Returns:
        A list of ``{'user_id': <id>, 'score': 0 | 10}`` dicts, one per
        selected user. An empty list is returned when ``user_ids`` is empty.
    """
    # randrange(0) raises ValueError, so an empty pool is answered directly.
    if not user_ids:
        return []
    voter_count = randrange(len(user_ids))
    return [
        {'user_id': user_id, 'score': choice([0, 10])}
        for user_id in sample(population=user_ids, k=voter_count)
    ]

## 3. Load test data (10 million users and 100 thousand movies)

In [5]:
from pymongo import InsertOne
from uuid import uuid4, UUID

# Each round inserts `users_batch_size` users and `films_batch_size` films;
# `multiplier` rounds give 10,000 * 1,000 users and 100 * 1,000 films overall.
users_batch_size = 10000
films_batch_size = 100
multiplier = 1000

for _ in range(multiplier):
    # Fresh identifiers for this round; bookmarks and votes only ever
    # reference ids generated within the same round.
    user_ids = [uuid4() for _ in range(users_batch_size)]
    film_ids = [uuid4() for _ in range(films_batch_size)]

    user_inserts = [
        InsertOne({'_id': user_id, 'bookmarks': gen_bookmarks(film_ids)})
        for user_id in user_ids
    ]
    mongodb.get_collection('users').bulk_write(requests=user_inserts, ordered=False)

    film_inserts = [
        InsertOne({'_id': film_id, 'votes': gen_votes(user_ids)})
        for film_id in film_ids
    ]
    mongodb.get_collection('films').bulk_write(requests=film_inserts, ordered=False)

In [6]:
# Count of all user documents (an empty filter matches every document).
mongodb.get_collection('users').count_documents(filter={})

10000000

In [7]:
# Count of all film documents (an empty filter matches every document).
mongodb.get_collection('films').count_documents(filter={})

100000

## 4. Testing the reading of already loaded data

In [8]:
def random_user_id():
    """Return the ``_id`` of one randomly sampled document from ``users``.

    Returns:
        The ``_id`` value of the sampled user document.

    Raises:
        IndexError: If the ``users`` collection yields no document.
    """
    # Only one id is needed, so ask the server for a single-document sample
    # rather than pulling a whole batch and choosing one client-side.
    cursor = mongodb.get_collection('users').aggregate(
        [
            {'$sample': {'size': 1}},
            {'$project': {'_id': 1}},
        ],
    )
    sampled = next(cursor, None)
    if sampled is None:
        raise IndexError('Cannot choose a random user from an empty collection')
    return sampled['_id']


def random_film_id():
    """Return the ``_id`` of one randomly sampled document from ``films``.

    Returns:
        The ``_id`` value of the sampled film document.

    Raises:
        IndexError: If the ``films`` collection yields no document.
    """
    # Only one id is needed, so ask the server for a single-document sample
    # rather than pulling a whole batch and choosing one client-side.
    cursor = mongodb.get_collection('films').aggregate(
        [
            {'$sample': {'size': 1}},
            {'$project': {'_id': 1}},
        ],
    )
    sampled = next(cursor, None)
    if sampled is None:
        raise IndexError('Cannot choose a random film from an empty collection')
    return sampled['_id']

### `User bookmarks list`

In [9]:
# Pick a random user; the id shown in the output is pasted into the next cell's filter.
random_user_id()

UUID('8d235240-8f4e-430e-b841-51b42585e5c8')

In [10]:
%%time
# Look the user up by _id and display only the bookmarks array.
# Kept as a single expression so %%time both times and displays it.
mongodb.get_collection('users').find_one(
    {'_id': UUID('8d235240-8f4e-430e-b841-51b42585e5c8')}
)['bookmarks']

CPU times: user 3.37 ms, sys: 2 ms, total: 5.37 ms
Wall time: 15.1 ms


[{'film_id': UUID('e825cfbf-1dc7-493b-adb6-a69cf10c20e1')},
 {'film_id': UUID('fdd63f3a-5bb9-43ab-994f-a4590e9dedce')},
 {'film_id': UUID('3ac55156-0fcf-430a-89ac-ff61efbab470')},
 {'film_id': UUID('147838a0-af24-49cd-ac71-15dc9893cc82')},
 {'film_id': UUID('a44639fe-3062-48bc-89c8-e7376f3a942b')},
 {'film_id': UUID('60776367-6961-46c6-947e-558b02c53783')},
 {'film_id': UUID('20378626-b153-48e4-902d-d037b9750c5a')},
 {'film_id': UUID('6130dc73-f3b9-4f19-8a9e-9adf789b76c4')},
 {'film_id': UUID('93604b53-6b71-46ee-b383-7673ba8dc562')},
 {'film_id': UUID('d3d19f36-9552-4d23-80cb-05893a360bad')},
 {'film_id': UUID('a71bf08c-8727-4e97-800a-af307c715a57')},
 {'film_id': UUID('e9ae839e-4238-4929-8e80-b891f46312e1')},
 {'film_id': UUID('0d971f06-82ac-4ca0-880c-821baa75c25f')},
 {'film_id': UUID('3ce888dc-07c5-42be-af3e-6c1072131b97')},
 {'film_id': UUID('93a58284-fdca-46bf-b0fb-486cb7c13cdf')},
 {'film_id': UUID('0921313b-729f-41e3-bcb1-b961370a7a46')},
 {'film_id': UUID('c75a9d55-4c83-4a96-b3

### `The number of likes on a particular movie`

In [11]:
# Pick a random film; the id shown in the output is pasted into the next cell's $match.
random_film_id()

UUID('15481862-4031-4320-b9c9-6c1c2d8f85c4')

In [12]:
%%time
# Count the votes with score 10 ("likes") of one film on the server:
# $filter keeps the matching votes and $size reports how many remain.
# Kept as a single expression so %%time both times and displays it.
next(mongodb.get_collection('films').aggregate([
    {'$match': {'_id': UUID('15481862-4031-4320-b9c9-6c1c2d8f85c4')}},
    {'$project': {'likes': {'$size': {'$filter': {
        'input': '$votes',
        'cond': {'$eq': ['$$this.score', 10]},
    }}}}},
]))['likes']

CPU times: user 3.86 ms, sys: 2.01 ms, total: 5.87 ms
Wall time: 34.4 ms


4741

### `Average user rating of the movie`

In [13]:
# Pick a random film; the id shown in the output is pasted into the next cell's $match.
random_film_id()

UUID('065fdd9d-c9d1-42cb-9225-872810632e36')

In [14]:
%%time
# Average of all vote scores stored on one film document, computed by the server.
# Kept as a single expression so %%time both times and displays it.
next(mongodb.get_collection('films').aggregate([
    {'$match': {'_id': UUID('065fdd9d-c9d1-42cb-9225-872810632e36')}},
    {'$project': {'average_rating': {'$avg': '$votes.score'}}},
]))['average_rating']

CPU times: user 2.65 ms, sys: 3 ms, total: 5.65 ms
Wall time: 5.77 ms


5.04148365056125

## 5. Testing writes (with read-back) of data arriving in real time

### `Adding a like`

In [15]:
# Pick a random user; the id shown in the output is used as the voter in the update cell below.
random_user_id()

UUID('ba1bbddd-42ad-4aac-b7e5-f97b1c10578d')

In [16]:
# Pick a random film; the id shown in the output is pasted into the next cell's filter.
random_film_id()

UUID('44370745-6ac7-4576-a028-67a0000af66f')

In [17]:
%%time
# Record a like (score 10) from one user on one film with a pipeline update:
# any previous vote by that user is filtered out, then the new vote is appended,
# so the user ends up with exactly one vote at the end of the array.
# return_document=True asks for the document as it is after the update; the last
# vote is displayed. Kept as a single expression so %%time times and displays it.
mongodb.get_collection('films').find_one_and_update(
    {'_id': UUID('44370745-6ac7-4576-a028-67a0000af66f')},
    [{'$set': {'votes': {'$concatArrays': [
        {'$filter': {
            'input': '$votes',
            'cond': {'$ne': ['$$this.user_id', UUID('ba1bbddd-42ad-4aac-b7e5-f97b1c10578d')]},
        }},
        [{'user_id': UUID('ba1bbddd-42ad-4aac-b7e5-f97b1c10578d'), 'score': 10}],
    ]}}}],
    return_document=True,
).get('votes')[-1]

CPU times: user 34.9 ms, sys: 2 ms, total: 36.9 ms
Wall time: 60.3 ms


{'user_id': UUID('ba1bbddd-42ad-4aac-b7e5-f97b1c10578d'), 'score': 10}

### `Adding a bookmark`

In [18]:
# Pick a random user; the id shown in the output is pasted into the update cell's filter below.
random_user_id()

UUID('afa28e34-fb5a-4d35-88ba-31b648146d9f')

In [19]:
# Pick a random film; the id shown in the output is the film bookmarked in the next cell.
random_film_id()

UUID('ef298b87-d651-4945-9402-207f65922e9d')

In [20]:
%%time
# Add a bookmark for one film to one user with $addToSet, then display the last
# bookmark of the document returned after the update.
# NOTE(review): $addToSet does not append when an equal entry already exists, so
# the displayed last element is the new bookmark only if it was not already
# present. Confirm this is acceptable for the benchmark.
# Kept as a single expression so %%time both times and displays it.
mongodb.get_collection('users').find_one_and_update(
    {'_id': UUID('afa28e34-fb5a-4d35-88ba-31b648146d9f')},
    {'$addToSet': {'bookmarks': {'film_id': UUID('ef298b87-d651-4945-9402-207f65922e9d')}}},
    return_document=True,
).get('bookmarks')[-1]

CPU times: user 2.83 ms, sys: 999 µs, total: 3.83 ms
Wall time: 8.07 ms


{'film_id': UUID('ef298b87-d651-4945-9402-207f65922e9d')}