In [2]:
# Import the required libraries
import os
import csv
import json
import yaml
import pandas as pd

# Locate and parse the notebook configuration.
# os.path.join keeps the relative path portable: the previous r'.\config.yaml'
# literal only resolves on Windows, because a backslash is not a path
# separator on other platforms.
config_file = os.path.join('.', 'config.yaml')
with open(config_file) as f:
    # NOTE(review): FullLoader can build more than plain scalars, lists and
    # mappings. Prefer yaml.safe_load if this file could ever come from a
    # source you do not control.
    config = yaml.load(f, Loader=yaml.FullLoader)
# yaml.load returns None for an empty document, so stop here with a clear message.
assert config is not None, f"{config_file} is empty"

# Build the dataset paths from the 'air_pollution' section: each file name
# under 'data' is joined onto 'root'. A missing key raises KeyError right here,
# which is why no further None checks are needed afterwards.
air_pollution_cfg = config['air_pollution']
data_root = air_pollution_cfg['root']
data_files = air_pollution_cfg['data']

air_quality_path = os.path.join(data_root, data_files['air_quality'])
epi_path = os.path.join(data_root, data_files['epi'])
output_path = os.path.join(data_root, data_files['output'])

# Helper for loading a delimited text file
def get_csv(path, sep):
    """Read the delimited file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the file, passed straight to ``pd.read_csv``.
        sep: Field separator used in the file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the Python parsing engine.
    """
    return pd.read_csv(path, sep=sep, engine='python')

# Get csv files

In [None]:
# Retrieve air_quality file
air_raw = get_csv(air_quality_path, ',')

# Organise data.
# The needed columns are selected *before* grouping so that max() is only
# computed on the three year columns instead of on every column in the file.
# Steps: per-country maximum -> sort by the 2022 value (highest first) ->
# drop rows with a missing value -> rename the key column to 'Country'.
air_csv = (
    air_raw[['country', '2020', '2021', '2022']]
    .groupby(by=['country'], as_index=False)
    .max()
    .sort_values('2022', ascending=False)
    .dropna()
    .rename(columns={'country': 'Country'})
)

# Preview the first rows (a bare last expression gets the notebook's rich display)
air_csv.head()

In [None]:
# Load the EPI result file, keep the three columns used later on and give the
# country / ISO columns the names shared by the other datasets.
epi_csv = (
    get_csv(epi_path, ',')[['country', 'iso', 'EPI']]
    .rename(columns={'country':'Country', 'iso':'ISO_Code'})
)

# Preview the first rows
print(epi_csv.head())

In [None]:
# Load the manufacturing output file, keep the country identifiers plus the
# 2020-2022 columns, and rename the identifiers to the shared column names.
output_columns = ['Country Name','Country Code','2020','2021','2022']
output_csv = (
    get_csv(output_path, ',')[output_columns]
    .rename(columns={'Country Name':'Country', 'Country Code':'ISO_Code'})
)

# Preview the first rows
print(output_csv.head())

# MongoDB connection

In [None]:
import os
import urllib.parse

from pymongo.errors import ConnectionFailure
from pymongo.mongo_client import MongoClient

# Credentials are read from the environment so they are never stored in the
# notebook (or in its version history). Export MONGODB_USERNAME and
# MONGODB_PASSWORD before starting the kernel.
try:
    # quote_plus percent-encodes characters that are not allowed verbatim in a URI
    username = urllib.parse.quote_plus(os.environ['MONGODB_USERNAME'])
    password = urllib.parse.quote_plus(os.environ['MONGODB_PASSWORD'])
except KeyError as missing:
    raise RuntimeError(
        f"Set the {missing.args[0]} environment variable before connecting to MongoDB"
    ) from missing

url = f"mongodb+srv://{username}:{password}@cluster0.ogshahn.mongodb.net/test?authSource=admin&retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(url)

try:
    # The ping command is cheap and does not require auth.
    client.admin.command('ping')
    print("Server available!")
except ConnectionFailure:
    # Best effort: report the problem; later cells will fail if the server
    # really is unreachable.
    print("Server not available")

In [None]:
# Sanity check: list the database names visible to this client and print them
dbs = client.list_database_names()
print(dbs)

# Store datasets to MongoDB database

In [None]:
# Work in the 'test' database
db = client.test

# Remove the three collections first so that the insert cell below starts
# from empty collections instead of appending to earlier runs.
for stale_name in ("air_quality", "epi", "output"):
    db.drop_collection(stale_name)

In [None]:
def _store_frame(database, collection_name, frame):
    """Insert every row of ``frame`` as one document into a collection.

    Args:
        database: pymongo database that owns the target collection.
        collection_name: Name of the collection to write to.
        frame: DataFrame whose rows become the documents.

    Returns:
        The result object returned by ``insert_many``.

    Raises:
        ValueError: If ``frame`` has no rows. ``insert_many`` rejects an empty
            document list, so the problem is reported here with the collection
            name attached.
    """
    records = frame.to_dict(orient='records')
    if not records:
        raise ValueError(f"No rows to insert into collection '{collection_name}'")
    return database.get_collection(collection_name).insert_many(records)


# One call per dataset; each result is kept under its own name.
air_quality_inserted = _store_frame(db, 'air_quality', air_csv)
epi_inserted = _store_frame(db, 'epi', epi_csv)
output_inserted = _store_frame(db, 'output', output_csv)

# Get MongoDB data

In [None]:
# Handles to the three collections (despite the *_db suffix, each one is a
# collection inside `db`, not a separate database).
air_quality_db = db["air_quality"]
epi_db = db["epi"]
output_db = db["output"]

### 1. Using `find_one`, show a single document from each dataset

In [None]:
# Show a single document from the air_quality collection
# (no filter is passed; the bare expression is rendered as the cell output).
air_quality_db.find_one()

In [None]:
# Show a single document from the epi collection (no filter)
epi_db.find_one()

In [None]:
# Show a single document from the output collection (no filter)
output_db.find_one()

### 2. Find the 'Pakistan' row in each of the three datasets

In [None]:
# Air-quality document whose Country field equals "Pakistan"
air_quality_db.find_one({"Country": "Pakistan"})

In [None]:
# EPI document whose Country field equals "Pakistan"
epi_db.find_one({"Country": "Pakistan"})

In [None]:
# Output document whose Country field equals "Pakistan"
output_db.find_one({"Country": "Pakistan"})

### 3. Find the rows whose values are greater than or less than a given threshold

In [None]:
# Print every air-quality document whose "2022" value is greater than 90
high_2022_filter = {"2022": {"$gt": 90}}
for doc in air_quality_db.find(high_2022_filter):
    print(doc)

In [None]:
# Print every EPI document whose "EPI" value is less than 20
low_epi_filter = {"EPI": {"$lt": 20}}
for doc in epi_db.find(low_epi_filter):
    print(doc)

In [None]:
# Print every output document whose "2022" value is greater than 8e12
large_output_filter = {"2022": {"$gt": 8_000_000_000_000}}
for doc in output_db.find(large_output_filter):
    print(doc)

# Group three datasets into a single table

In [None]:
# Join the three collections on the country name, starting from air_quality.
pipeline = [
    # Group by Country, keeping the largest 2022 value per country
    {"$group": {"_id":"$Country", "2022":{"$max":"$2022"}}},
    # Join with EPI; $unwind turns the matched array into one document per
    # match and drops countries that have no EPI entry
    {"$lookup": {"from":"epi", "localField":"_id", "foreignField":"Country", "as":"test"}},
    {"$unwind":"$test"},
    {"$project": {"ISOCode":"$test.ISO_Code", "Country":"$_id", "AveragePM":"$2022", "EPI":"$test.EPI"}},
    # Join with Output; again only countries with a match survive the $unwind
    {"$lookup": {"from":"output", "localField":"Country", "foreignField":"Country", "as":"test2"}},
    {"$unwind":"$test2"},
    {"$project": {"ISOCode":"$test2.ISO_Code", "Country":"$Country", "AveragePM":"$AveragePM", "EPI":"$EPI", "Output":"$test2.2022"}},
    # Sort on 'AveragePM' (the 2022 value) in descending order
    {"$sort": {"AveragePM":-1}},
    # Limit the number of documents
    {"$limit": 5},
]

# aggregate() returns a cursor that can only be iterated once. Materialising it
# as a list lets later cells be re-run without getting an empty result.
rst = list(air_quality_db.aggregate(pipeline))

In [None]:
# Turn the aggregation result into a DataFrame and display it
df = pd.DataFrame(rst)
df