In [None]:
# MetaData:

# id: unique identifier for each listing
# name: name of the listing
# host_id: unique identifier for the host
# host_name: name of the host
# neighbourhood_group: grouping of neighbourhoods
# neighbourhood: name of the neighbourhood
# latitude: latitude coordinate of the listing
# longitude: longitude coordinate of the listing
# room_type: type of room in the listing (e.g., Entire home/apt, Private room)
# price: price per night for the listing
# minimum_nights: minimum number of nights required for booking
# number_of_reviews: total number of reviews for the listing
# last_review: date of the last review
# reviews_per_month: average number of reviews per month
# calculated_host_listings_count: total number of listings by the host
# availability_365: number of days the listing is available within the next 365 days

# Example: first 5 rows of the dataset (some column headers and values are truncated with "..." to fit the table)
# | id   | name             | host_id | host_name | neighbourho| neighbourhood     | latitude | longitude | room_type  | price | mini...| number| last_review | review...| calc...| availability_365 |
# |------|------------------|---------|-----------|------------|-------------------|----------|-----------|------------|-------|--------|-------|-------------|----------|--------|------------------|
# | 2015 | Berlin-Mitte...  | 2217    | Ian       | Mitte      | Brunnenstr. Süd   | 52.5345  | 13.4026   | Entire...  | 60    | 4      | 118   | 28-10-18    | 3.76     | 4      | 141              |
# | 2695 | Prenzlauer...    | 2986    | Michael   | Pankow     | Prenzlauer Berg...| 52.5485  | 13.4046   | Private... | 17    | 2      | 6     | 1/10/2018   | 1.42     | 1      | 0                |
# | 3176 | Fabulous...      | 3718    | Britta    | Pankow     | Prenzlauer Berg...| 52.535   | 13.4176   | Entire...  | 90    | 62     | 143   | 20-03-17    | 1.25     | 1      | 220              |
# | 3309 | BerlinSpot...    | 4108    | Jana      | Tempelhof  | Schöneberg-Nord   | 52.4989  | 13.3491   | Private... | 26    | 5      | 25    | 16-08-18    | 0.39     | 1      | 297              |
# | 7071 | BrightRoom...    | 17391   | Bright    | Pankow     | Helmholtzplatz    | 52.5432  | 13.4151   | Private... | 42    | 2      | 197   | 4/11/2018   | 1.75     | 1      | 26               |


In [None]:
import pymongo
from pymongo import MongoClient
import urllib.parse
from bson.json_util import dumps

# CREDENTIALS
# Fill in the placeholders below with your own MongoDB Atlas details.
USERNAME = "<<USERNAME>>"
PASSWORD = "<<PASSWORD>>"
CLUSTERNAME = "<<CLUSTERNAME>>" # replace <<CLUSTERNAME>> with your cluster's name
DATABASE_NAME = CLUSTERNAME # replace with your database's name if it differs from the cluster's name

# The username and password are percent-encoded before being embedded in the
# URI: reserved characters such as '@', ':', '/' or '%' would otherwise make
# the connection string ambiguous (or be rejected outright by the driver).
url = "mongodb+srv://%s:%s@%s.qgup7.mongodb.net/%s?retryWrites=true&w=majority" % (
    urllib.parse.quote_plus(USERNAME),
    urllib.parse.quote_plus(PASSWORD),
    CLUSTERNAME,
    CLUSTERNAME,
)
cluster = MongoClient(url)

# Handle on the database and on the 'berlin' collection used by every later cell.
db = cluster[DATABASE_NAME]
collection = db["berlin"]

In [None]:
# -✔️- Load the listings CSV into a dataframe

import pandas as pd

# Edit this path so it points at wherever your copy of listings.csv lives.
csv_location = 'C:/Users/asus/Downloads/listings.csv'

df = (
    pd.read_csv(csv_location, encoding='unicode_escape')
    # 'Unnamed: 0' is presumably an index column saved into the CSV — it carries no listing data.
    .drop(columns=['Unnamed: 0'])
    # 'name' is renamed to 'title'; presumably so attribute-style access on a row
    # does not collide with the row's own `.name` attribute — TODO confirm.
    .rename(columns={'name': "title"})
)

# i = number of rows (listings), j = number of columns.
i, j = df.shape

In [None]:
# -✔️- Inserting the listings as documents into DB
#
# Every dataframe row becomes one MongoDB document. Cells that are missing
# (NaN/None) or, for text columns, blank are stored as None.

import math  # retained: not needed here any more, but a later cell may rely on it


def _int_or_none(value):
    """Return ``value`` as a built-in int, or None when the cell is missing."""
    return None if pd.isna(value) else int(value)


def _text_or_none(value):
    """Return ``value`` as a str, or None when the cell is missing or blank.

    A missing text cell is read by pandas as NaN, and ``str(NaN)`` is the
    non-blank string "nan" — so checking only ``str(value).strip()`` would
    store the literal text "nan" instead of None. The explicit ``pd.isna``
    test prevents that.
    """
    if pd.isna(value) or not str(value).strip():
        return None
    return str(value)


def _listing_document(row):
    """Build the MongoDB document for one listing.

    ``row`` is a mapping from column name to cell value (one record of
    ``df.to_dict('records')``). Numeric values are converted to built-in
    int/float so they can be BSON-encoded.
    """
    # NOTE(review): a missing id yields '_id': None; more than one such row
    # would collide on the unique _id index — confirm ids are always present.
    return {
        '_id': _int_or_none(row['id']),
        'listing_title': _text_or_none(row['title']),
        'host_id': _int_or_none(row['host_id']),
        'host_name': _text_or_none(row['host_name']),
        'neighbourhood': _text_or_none(row['neighbourhood']),
        'neighbourhood_group': _text_or_none(row['neighbourhood_group']),
        'coordinates': {'longitude': float(row['longitude']), 'latitude': float(row['latitude'])},
        'room_type': _text_or_none(row['room_type']),
        'price': _int_or_none(row['price']),
        'minimum_nights': _int_or_none(row['minimum_nights']),
        'availability': _int_or_none(row['availability_365']),
    }


# Build all documents in a single pass (no repeated row lookups and no
# list re-allocation per row), then insert them with one bulk call.
listings = [_listing_document(row) for row in df.to_dict('records')]
num = len(listings)

# insert_many rejects an empty document list, so skip the call for an empty frame.
if listings:
    collection.insert_many(listings)
print(num,"listings were inserted")

In [None]:
# -✔️- Delete documents whose 'price' or 'availability' is None, then those where either is 0.
# Each filter uses the $or operator: a document is deleted as soon as it matches any one of the listed conditions.

missing_value_filter = { '$or': [ { 'price': None }, { 'availability': None } ] }
zero_value_filter = { '$or': [ { 'price': 0 }, { 'availability': 0 } ] }
for cleanup_filter in (missing_value_filter, zero_value_filter):
    collection.delete_many(cleanup_filter)



# -✔️- The same clean-up expressed as one delete: price missing or zero, or availability missing or zero

unusable_listing_filter = {
    '$or': [
        {field: unusable_value}
        for field in ('price', 'availability')
        for unusable_value in (None, 0)
    ]
}
collection.delete_many(unusable_listing_filter)


In [None]:
# -✔️- Speeding up searches on neighbourhood_group (server-side execution time shown before and after creating the index)

def _mitte_execution_millis():
    """Return the executionTimeMillis that explain() reports for the Mitte query."""
    query_plan = collection.find({"neighbourhood_group":"Mitte"}).explain()
    return query_plan['executionStats']['executionTimeMillis']


mitte_listings = _mitte_execution_millis()
print("Before indexing:",mitte_listings)

# Index the field the query filters on.
collection.create_index('neighbourhood_group')

mitte_listings = _mitte_execution_millis()
print("After indexing:",mitte_listings)



# --OR--
# Alternative: measure client-side wall-clock time while fetching every
# matching document, once before and once after creating the index.
# NOTE: the first measurement is only a true "without indexing" figure if no
# index on neighbourhood_group exists yet — drop it first (see the bottom of
# this section) when it has already been created.


import time

start_time = time.time()
mitte_listings = list(collection.find({'neighbourhood_group': 'Mitte'}))
end_time = time.time()

print(f"Time taken to execute Mitte neighborhood group query without indexing: {end_time - start_time} seconds")
print(mitte_listings[0])



# The index key must be spelled exactly like the document field
# ('neighbourhood_group'); an index on any other key is never used by the query.
collection.create_index([('neighbourhood_group', 1)])



start_time = time.time()
mitte_listings = list(collection.find({'neighbourhood_group': 'Mitte'}))
end_time = time.time()

print(f"Time taken to execute Mitte neighborhood group query with indexing: {end_time - start_time} seconds")
print(mitte_listings[0])




# drop the index:
# collection.drop_index('neighbourhood_group_1')

In [None]:
# -✔️- Top 3 private rooms with the lowest amount to pay (zero-price listings must have been removed first)
#      Shows the listing id, the cost of the minimum stay, the minimum nights and the neighbourhood.

# Keep private rooms only.
private_room_filter = { "$match" : { "room_type" : "Private room" } }

# Keep the fields needed for the report and compute the cost of the shortest
# possible stay: price per night multiplied by the minimum number of nights.
stay_cost_projection = {
    "$project" : {
        "_id" : "$_id",
        "neighbourhood" : "$neighbourhood",
        "duration_rent" : { "$multiply": [ '$price', '$minimum_nights' ]},
        "price" : "$price",
        "minimum_nights" : "$minimum_nights",
        "room_type" : "$room_type"
    }
}

# Cheapest stays first, then keep the first three.
cheapest_first = { "$sort" : { "duration_rent" : 1 } }
first_three = { "$limit" : 3 }

private_room_stages = [private_room_filter, stay_cost_projection, cheapest_first, first_three]
result = collection.aggregate(private_room_stages)

for listing in result:
    listing_id = listing['_id']
    stay_cost = listing['duration_rent']
    nights = listing['minimum_nights']
    area = listing['neighbourhood']
    print("Listing", listing_id, "costs $", stay_cost, "for", nights, "day(s) and is in", area)
#---------------------------- RESTART ----------------------------

# collection.delete_many({})
# collection.drop_index("neighbourhood_group_1")




# -- OR --



# -✔️- Top 3 private rooms with the lowest amount to pay, keeping every original field of each listing
import pprint

# Stage 1 ($match): keep only documents whose room_type is 'Private room' and
# whose price and minimum_nights are both greater than ($gt) 0.
positive_private_rooms = {
    '$match': {'room_type': 'Private room', 'price': {'$gt': 0}, 'minimum_nights': {'$gt': 0}}
}

# Stage 2 ($addFields): add 'total_cost', the product of price and minimum_nights.
with_total_cost = {'$addFields': {'total_cost': {'$multiply': ['$price', '$minimum_nights']}}}

# Stage 3 ($sort): ascending by total_cost, so the cheapest stays come first.
cheapest_first = {'$sort': {'total_cost': 1}}

# Stage 4 ($limit): keep the first three documents.
first_three = {'$limit': 3}

pipeline = [positive_private_rooms, with_total_cost, cheapest_first, first_three]

# Run the pipeline and materialise the cursor into a list.
result = list(collection.aggregate(pipeline))

# Pretty-print the resulting documents.
pprint.pprint(result)
