### Importing Libraries for Extraction and Exploration

In [71]:
import pandas as pd
from pprint import PrettyPrinter
from pymongo import MongoClient

# Library for password access from env
import os
from dotenv import load_dotenv 

# Libraries for Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [72]:
# Initializing a PrettyPrinter instnace
p = PrettyPrinter(indent=2)

### Accessing DB password

In [73]:
# Fetching DB password from `.env` file

load_dotenv(".env")
password = os.environ["mongodb_password"] 

### Database Extraction

In [74]:
# Creating a client connection to the MongoDB Server and listing the available databases
connection_string = f"mongodb+srv://usorodave1:{password}@cluster0.ftsywnw.mongodb.net/"
client = MongoClient(connection_string)

p.pprint(list(client.list_databases()))

[ {'empty': False, 'name': 'air-quality', 'sizeOnDisk': 344064},
  {'empty': False, 'name': 'air_quality_data', 'sizeOnDisk': 45056},
  {'empty': False, 'name': 'sample_airbnb', 'sizeOnDisk': 55259136},
  {'empty': False, 'name': 'sample_analytics', 'sizeOnDisk': 9412608},
  {'empty': False, 'name': 'sample_geospatial', 'sizeOnDisk': 1294336},
  {'empty': False, 'name': 'sample_guides', 'sizeOnDisk': 40960},
  {'empty': False, 'name': 'sample_mflix', 'sizeOnDisk': 118423552},
  {'empty': False, 'name': 'sample_restaurants', 'sizeOnDisk': 6836224},
  {'empty': False, 'name': 'sample_supplies', 'sizeOnDisk': 1126400},
  {'empty': False, 'name': 'sample_training', 'sizeOnDisk': 50421760},
  {'empty': False, 'name': 'sample_weatherdata', 'sizeOnDisk': 2711552},
  {'empty': False, 'name': 'admin', 'sizeOnDisk': 237568},
  {'empty': False, 'name': 'local', 'sizeOnDisk': 8924831744}]


### Fetching the Dataset of interest

Since the focus of this project is to predict the Particulate matter `PM2.5` readings in Lagos state, We'll be focusing on fetching the `air-quality` database and further drill down to the `Lagos` collection and use aggregate and find functions to understand the dataset and extract the needed information necessary for model training.

In [75]:
# Assigning the `air-quality` database to a vairable
air_quality = client['air-quality']

In [76]:
# Listing the available collections in the database
p.pprint(list(air_quality.list_collection_names()))

['Lagos', 'system.buckets.Lagos']


In [77]:
# Fetching the `Lagos` collection
lagos = air_quality["Lagos"]


##### Data Exploration

In [78]:
# Number of documents 
lagos.count_documents({})

33152

In [79]:
# To see what the documents look like
first_3 = lagos.find({}).limit(3)

p.pprint(list(first_3))

[ { '_id': ObjectId('658ac93b0196ae65093a11d5'),
    'lat': 6.428,
    'location': 3629,
    'lon': 3.435,
    'sensor_id': 4856,
    'sensor_type': 'DHT22',
    'timestamp': datetime.datetime(2023, 11, 1, 16, 44, 12, 409000),
    'value': 82.6,
    'value_type': 'humidity'},
  { '_id': ObjectId('658ac93b0196ae65093a11d6'),
    'lat': 6.428,
    'location': 3629,
    'lon': 3.435,
    'sensor_id': 4856,
    'sensor_type': 'DHT22',
    'timestamp': datetime.datetime(2023, 11, 1, 16, 44, 12, 409000),
    'value': 29.4,
    'value_type': 'temperature'},
  { '_id': ObjectId('658ac93b0196ae65093a11d7'),
    'lat': 6.428,
    'location': 3629,
    'lon': 3.435,
    'sensor_id': 4855,
    'sensor_type': 'pms5003',
    'timestamp': datetime.datetime(2023, 11, 1, 16, 44, 52, 764000),
    'value': 19.0,
    'value_type': 'P2'}]


From the printed documents, It's obvious that this collection contains different types of air-quality measurements. We need to list different types of readings in `value-type` alongside the total number of readings for each type.

In [80]:
# Find the number of readings for each type
count_reading = lagos.aggregate([
    {"$group": {"_id": "$value_type",
                "count": {"$count": {}}
               }
               }, 
     ])
p.pprint(list(count_reading))

[ {'_id': 'humidity', 'count': 6112},
  {'_id': 'temperature', 'count': 6112},
  {'_id': 'P2', 'count': 6976},
  {'_id': 'P1', 'count': 6976},
  {'_id': 'P0', 'count': 6976}]


We'll now narrow our interest to the particulate matter 2.5 readings, denoted as `P2` in the document

In [81]:
# Extracting the `P2` readings from the documents along with the timestamp

result = lagos.find({"value_type": "P2"},
                   projection={"_id": 0, "value": 1, "timestamp": 1}
                   )

# Changing the outout to a list of dictionaries
result = list(result)
p.pprint(result)

[ { 'timestamp': datetime.datetime(2023, 11, 1, 16, 44, 52, 764000),
    'value': 19.0},
  { 'timestamp': datetime.datetime(2023, 11, 1, 17, 52, 8, 924000),
    'value': 3.5},
  { 'timestamp': datetime.datetime(2023, 11, 1, 17, 53, 13, 805000),
    'value': 3.0},
  { 'timestamp': datetime.datetime(2023, 11, 1, 18, 16, 42, 515000),
    'value': 40.33},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 38, 39, 664000),
    'value': 47.0},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 42, 21, 703000),
    'value': 46.71},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 49, 56, 864000),
    'value': 42.31},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 51, 5, 756000),
    'value': 40.5},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 52, 23, 770000),
    'value': 40.0},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 53, 37, 339000),
    'value': 40.0},
  { 'timestamp': datetime.datetime(2023, 11, 3, 7, 54, 48, 196000),
    'value': 39.0},
  { 'timestamp': datetime.dat

In [82]:
df = pd.DataFrame(result).set_index("timestamp")
df.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2023-11-01 16:44:52.764,19.0
2023-11-01 17:52:08.924,3.5
2023-11-01 17:53:13.805,3.0
2023-11-01 18:16:42.515,40.33
2023-11-03 07:38:39.664,47.0


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6976 entries, 2023-11-01 16:44:52.764000 to 2023-12-21 00:57:56.236000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   value   6976 non-null   float64
dtypes: float64(1)
memory usage: 109.0 KB
