In [206]:
from pprint import PrettyPrinter
import pandas as pd
from pymongo import MongoClient

In [207]:
pp = PrettyPrinter(indent=1)

In [208]:
host = "192.55.17.2"
client = MongoClient("mongodb+srv://luvratantechengineer_db_user:dk0saU4SS2fuJ1qw@datascience.f8ymijf.mongodb.net/?retryWrites=true&w=majority&appName=DataScience")

In [209]:
pp.pprint(list(client.list_databases()))

[{'empty': False, 'name': 'air-quality', 'sizeOnDisk': 44662784},
 {'empty': False, 'name': 'admin', 'sizeOnDisk': 364544},
 {'empty': False, 'name': 'local', 'sizeOnDisk': 4686675968}]


In [210]:
db = client["air-quality"]

In [211]:
pp.pprint(list(db.list_collections()))

[{'idIndex': {'key': {'_id': 1}, 'name': '_id_', 'v': 2},
  'info': {'readOnly': False,
           'uuid': Binary(b'+9\xcb\x11\x04\x18@\x87\xaa\x85\x1e\xd0A\xe2\x9a\r', 4)},
  'name': 'lagos',
  'options': {},
  'type': 'collection'},
 {'idIndex': {'key': {'_id': 1}, 'name': '_id_', 'v': 2},
  'info': {'readOnly': False,
           'uuid': Binary(b'6e\xb6\xb6?\xd5O\xc3\xad\x87\xf4`\xc0\xde\xd3\xe9', 4)},
  'name': 'dar-es-salaam',
  'options': {},
  'type': 'collection'},
 {'idIndex': {'key': {'_id': 1}, 'name': '_id_', 'v': 2},
  'info': {'readOnly': False,
           'uuid': Binary(b'\x821\xc3\xbf\xfa\xebK\x00\x8c\xe1d\xb9.\xbcJ(', 4)},
  'name': 'nairobi',
  'options': {},
  'type': 'collection'}]


In [212]:
nairobi = db["nairobi"]

In [213]:
nairobi.count_documents({}) # Count the number of documents in the mongoDb database. 1 document = 1 row 

202212

In [214]:
result = nairobi.find_one({}) # Get a sample data 
pp.pprint(result) 

{'_id': '6525d772f44bfedd842a6fcc',
 'metadata': {'lat': -1.3,
              'lon': 36.785,
              'measurement': 'temperature',
              'sensor_id': 58,
              'sensor_type': 'DHT22',
              'site': 29},
 'temperature': 16.5,
 'timestamp': '2018-09-01 00:00:04.301000'}


- _id': '6525d772f44bfedd842a6fcc', the ID is the unique identifier for that record also known as the primary key
- 'timestamp': '2018-09-01 00:00:04.301000', since this is a time series data, we have a timestamp associated with it.
- 'temperature': 16.5, This is the actual temperatur recorded at that time stamp.
- 'metadata': it is the information about the data. For example, in this case, it is the information of the temperature and timestamp where this data was recorded.

<p style="color:orange"> Data as a whole </p>

In [215]:
# I can see that one document has site as a meta data. So let us check how many sites are their.
keys = result["metadata"].keys()
print(keys)

dict_keys(['lat', 'lon', 'measurement', 'sensor_id', 'sensor_type', 'site'])


In [216]:
nairobi.distinct("metadata.site") #This is similar to unique() function in pandas. We can use it to create groups just like groupby methods using the aggreate
for i in result["metadata"]:
    print(f"{i},  {nairobi.distinct(f"metadata.{i}")}")
    print("-"*30)

lat,  [-1.3, -1.259]
------------------------------
lon,  [36.785, 36.799]
------------------------------
measurement,  ['P1', 'P2', 'humidity', 'temperature']
------------------------------
sensor_id,  [47, 48, 57, 58]
------------------------------
sensor_type,  ['DHT22', 'SDS011']
------------------------------
site,  [6, 29]
------------------------------


In [217]:
# Count documents where the particular property lies
nairobi.count_documents({"metadata.site": 6})

70360

In [218]:
print(f"Documents from site 6: {nairobi.count_documents({"metadata.site": 6})}")
print(f"Documents from site 29: {nairobi.count_documents({"metadata.site": 29})}")

Documents from site 6: 70360
Documents from site 29: 131852


<p style="color:orange; font-size:30px">Aggregate Data</p>

- Aggregation operations process data records and return computed results.
- The $group stage combines multiple documents with the same field, fields, or expression into a single document according to a group key. The result is one document per unique group key.



In [219]:
result = nairobi.aggregate(
    [
        {   "$group":{"_id": "$metadata.site", "count": {"$count":{}}}        }
    ]
)
pp.pprint(list(result))

[{'_id': 6, 'count': 70360}, {'_id': 29, 'count': 131852}]


<p style="color:orange; font-size:30px">Find Data using a query</p>

In [220]:
result = nairobi.find({"metadata.measurement": "P2"}).limit(3)
pp.pprint(list(result))

[{'P2': 34.43,
  '_id': '6525d775f44bfedd842bf24d',
  'metadata': {'lat': -1.3,
               'lon': 36.785,
               'measurement': 'P2',
               'sensor_id': 57,
               'sensor_type': 'SDS011',
               'site': 29},
  'timestamp': '2018-09-01 00:00:02.472000'},
 {'P2': 30.53,
  '_id': '6525d775f44bfedd842bf24e',
  'metadata': {'lat': -1.3,
               'lon': 36.785,
               'measurement': 'P2',
               'sensor_id': 57,
               'sensor_type': 'SDS011',
               'site': 29},
  'timestamp': '2018-09-01 00:05:03.941000'},
 {'P2': 22.8,
  '_id': '6525d775f44bfedd842bf24f',
  'metadata': {'lat': -1.3,
               'lon': 36.785,
               'measurement': 'P2',
               'sensor_id': 57,
               'sensor_type': 'SDS011',
               'site': 29},
  'timestamp': '2018-09-01 00:10:04.374000'}]


In [221]:
result = nairobi.aggregate(
    [
        {"$match":{"metadata.site":29}},
        {"$group":{"_id":"$metadata.measurement", "count":{"$count":{}}}}
    ]
)
pp.pprint(list(result))

[{'_id': 'P1', 'count': 32907},
 {'_id': 'P2', 'count': 32907},
 {'_id': 'temperature', 'count': 33019},
 {'_id': 'humidity', 'count': 33019}]


#### <p style="color:orange"> Importing Data from Mongo DB </>

In [222]:
result = nairobi.find(
    {
        "metadata.site":29,
        "metadata.measurement": "P2"
    },

    projection={"P2":1,"timestamp":1, "_id":0}
)
pp.pprint(result.next()) #.next is same as limit(1)

{'P2': 34.43, 'timestamp': '2018-09-01 00:00:02.472000'}


In [223]:
df = pd.DataFrame(result).set_index("timestamp")
df

Unnamed: 0_level_0,P2
timestamp,Unnamed: 1_level_1
2018-09-01 00:05:03.941000,30.53
2018-09-01 00:10:04.374000,22.80
2018-09-01 00:15:04.245000,13.30
2018-09-01 00:20:04.869000,16.57
2018-09-01 00:25:04.659000,14.07
...,...
2018-12-31 23:35:06.313000,21.92
2018-12-31 23:40:05.904000,20.83
2018-12-31 23:45:05.307000,19.12
2018-12-31 23:50:05.451000,19.10
