In [1]:
import requests
import json
import codecs
from bs4 import BeautifulSoup
import time
import re
import os 
from os import listdir
from os.path import isfile, join
import datetime
import pprint

import pymongo
from pymongo import MongoClient
from bson.code import Code

In [2]:
# Open the local MongoDB server and keep handles to the database and the
# collection that the rest of the notebook reads from and writes to.
# NOTE(review): the previous comment here read "Time zone difference (in the
# morning)"; nothing in this cell deals with time zones.
myclient = MongoClient("mongodb://localhost:27017/")
mydb = myclient.get_database("AQICN")
mycol = mydb.get_collection("cities2")

## Data collection process

In [3]:
def clean(text):
    """Reduce a scraped value to its bare text.

    The argument is converted with ``str()``, so a BeautifulSoup result list
    such as ``[<td id="cur_pm25">107</td>]`` becomes ``'107'``.

    Args:
        text: Any object; typically the list returned by ``doc.select(...)``.

    Returns:
        The text with square brackets, HTML tags and surrounding whitespace
        removed, or the integer ``0`` when the remaining text is the ``'-'``
        placeholder.
    """
    # Raw strings: '\[' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    without_brackets = re.sub(r'[\[\]]', '', str(text))
    # Strip so that a padded placeholder such as ' - ' is still recognised.
    without_tags = re.sub(r'</?[^>]*?>', '', without_brackets).strip()
    if without_tags == '-':
        return 0
    return without_tags

In [4]:
def collect_info(city):
    """Scrape today's air quality and weather for one city and store the record.

    Downloads the aqicn.org page for ``city`` and reads the current PM2.5, O3
    and CO cells, requests the same day's summary from the weatherapi.com
    history endpoint, and inserts one combined document into ``mycol``.

    Args:
        city: City slug as used in the aqicn.org URL (e.g. ``'shanghai'``).
            It must also be a key of the module-level ``my_dict``, which
            supplies the coordinates stored with the record.

    Raises:
        KeyError: ``city`` is not in ``my_dict`` or the weather response lacks
            an expected field.
        requests.HTTPError: one of the two HTTP requests returned an error
            status.
        ValueError: a scraped pollutant value cannot be converted to ``int``.
    """
    # Date the record is filed under (local time of the machine running this).
    day = datetime.datetime.now().strftime("%Y-%m-%d")

    # Use the function argument, not a module-level loop variable, so the
    # function works no matter what the caller names its loop variable.
    cap_city = city.title()
    coordinates = my_dict[city]

    # Scrape air quality from the website.
    URL = "http://aqicn.org/city/" + city
    user_agent = {'User-agent': 'Mozilla/5.0'}
    # `headers=` is required: the second positional argument of requests.get
    # is `params`, which would put the User-Agent into the query string.
    page = requests.get(URL, headers=user_agent, timeout=30)
    page.raise_for_status()
    doc = BeautifulSoup(page.content, "html.parser")

    pm25 = doc.select("td[id^=cur_pm25]")
    o3 = doc.select("td[id^=cur_o3]")
    co = doc.select("td[id^=cur_co]")

    # Weather info for the day.
    # NOTE(security): prefer supplying the key through the WEATHERAPI_KEY
    # environment variable; the literal is only kept as a fallback so existing
    # runs keep working.
    API_key = os.environ.get('WEATHERAPI_KEY', '7de6d658fa1c42bda1882604211203')
    page2 = requests.get(
        'http://api.weatherapi.com/v1/history.json',
        params={'key': API_key, 'q': cap_city, 'dt': day},
        timeout=30,
    )
    page2.raise_for_status()
    # Decode the JSON body directly; routing it through an HTML parser first
    # can alter characters that are special in HTML.
    day_summary = page2.json()['forecast']['forecastday'][0]['day']

    weather_fields = ('maxtemp_c', 'mintemp_c', 'avgtemp_c',
                      'maxwind_kph', 'totalprecip_mm', 'avghumidity')

    # Build the document (key order matches what the analysis cells expect).
    city_dict = {'name': city, 'coordinates': coordinates}
    day_dict = {'city': city_dict,
                'date': day,
                'pm25': int(clean(pm25)),
                'o3': int(clean(o3)),
                'co': int(clean(co))}
    day_dict.update((field, day_summary[field]) for field in weather_fields)

    # Save to MongoDB.
    mycol.insert_one(day_dict)

In [5]:
# Get geographic info for each city from the WAQI feed API and keep it in
# my_dict, keyed by city name.
cities = ['shanghai','beijing','chongqing']

my_dict = { }
# NOTE(security): prefer supplying the token through the WAQI_TOKEN environment
# variable; the literal is only kept as a fallback so existing runs keep working.
token = os.environ.get("WAQI_TOKEN", "0931488b1a26be1810b012b19538919778762329")
for i in cities:
    time.sleep(5) # 5 second pause between queries
    url = "http://api.waqi.info/feed/" + str(i) + "/"
    print(url)  # progress only; the token is sent separately so it is not echoed into the cell output
    page = requests.get(url, params={"token": token}, timeout=30)
    page.raise_for_status()
    # Decode the JSON body directly instead of passing it through an HTML parser.
    json_dict = page.json()
    geo_info = json_dict['data']['city']['geo']
    my_dict[i] = geo_info

http://api.waqi.info/feed/shanghai/?token=0931488b1a26be1810b012b19538919778762329
http://api.waqi.info/feed/beijing/?token=0931488b1a26be1810b012b19538919778762329
http://api.waqi.info/feed/chongqing/?token=0931488b1a26be1810b012b19538919778762329


In [11]:
# Run this for 7 consecutive days
# Each run calls collect_info once per entry in `cities`; collect_info stamps
# the record with the current date and inserts it into the database.
# NOTE(review): keep the loop variable named `i` unless collect_info is
# confirmed to use only its `city` argument and no module-level `i`.
for i in cities:
    collect_info(i)

In [50]:
# Build a 2dsphere index on the `coordinates` field of the `cities`
# collection; create_index returns the index name, which the cell displays.
# NOTE(review): records written by this notebook go to `mycol` ("cities2") and
# nest the coordinates under `city.coordinates` -- confirm that this index
# targets the intended collection and field.
geo_index_spec = [("coordinates", "2dsphere")]
mydb["cities"].create_index(geo_index_spec)

'coordinates_2dsphere'

## Data analysis process

In [12]:
# Pull a single stored document and pretty-print it (narrow width, original
# key order) to check the shape of the data.
a = mycol.find_one()
printer = pprint.PrettyPrinter(width=2, sort_dicts=False)
printer.pprint(a)

# Field names used by the analysis cells further down. Those cells slice this
# list by position ([4:] and [5:]), so the order of entries matters.
# NOTE(review): 'coordinates' is listed as its own entry although the document
# printed above nests it under 'city'.
mycol_keys = [
    '_id', 'city', 'coordinates', 'date',
    'pm25', 'o3', 'co',
    'maxtemp_c', 'mintemp_c', 'avgtemp_c',
    'maxwind_kph', 'totalprecip_mm', 'avghumidity',
]

{'_id': ObjectId('604d620a66fb6ee0345bacf9'),
 'city': {'name': 'shanghai',
          'coordinates': [31.2047372,
                          121.4489017]},
 'date': '2021-03-12',
 'pm25': 107,
 'o3': 23,
 'co': 8,
 'maxtemp_c': 14.1,
 'mintemp_c': 10.8,
 'avgtemp_c': 12.5,
 'maxwind_kph': 15.8,
 'totalprecip_mm': 0.0,
 'avghumidity': 75.0}


In [13]:
# Comparing the average PM2.5 across the three cities.
#
# This uses the aggregation framework rather than map-reduce because:
#   * Collection.map_reduce is no longer available in PyMongo 4;
#   * a reduce function that returns an average is not safe to re-apply to its
#     own output, which MongoDB is allowed to do.
# It reads from `mycol` and groups on `city.name`, matching the documents that
# collect_info inserts (the city is stored as a nested {'name', 'coordinates'}).
pm25_pipeline = [
    {'$group': {'_id': '$city.name', 'value': {'$avg': '$pm25'}}},
    # Persist the result in the "pm25_avg" collection, as the map-reduce did.
    {'$out': 'pm25_avg'},
]
mycol.aggregate(pm25_pipeline)
pm25_avg = mydb['pm25_avg']

# Print every per-city average
for row in pm25_avg.find():
    print(row)

{'_id': 'shanghai', 'value': 91.42857142857143}
{'_id': 'chongqing', 'value': 128.14285714285714}
{'_id': 'beijing', 'value': 167.85714285714286}


The average PM2.5 values over the seven days show that, among the three cities, Beijing has the worst air quality and Shanghai has the best.

In [30]:
# sort pm2.5 ascending
# Pass a different `city` to explore whether other features of that city show
# a similar trend to its PM2.5 values.
def sorted_pm25(value, city='shanghai'):
    """Print one field of a city's records, ordered by ascending PM2.5.

    Args:
        value: Name of the document field to print (e.g. ``'o3'``).
        city: Value matched against ``city.name``; defaults to ``'shanghai'``.

    Raises:
        KeyError: a matching document does not contain ``value``.
    """
    # Use the cursor's sort() instead of the deprecated $query/$orderby
    # meta-operators embedded in the filter document.
    sort_pm25 = mycol.find({'city.name': city}).sort('pm25', pymongo.ASCENDING)
    print(value)
    for record in sort_pm25:
        print(record[value])

In [31]:
# Walk through every measurement field (the slice starts at 'pm25') and list
# its values in PM2.5 order, printing a separator line after each field.
for field in mycol_keys[4:]:
    sorted_pm25(field)
    print(' ')

pm25
67
77
84
89
105
107
111
 
o3
33
30
43
25
36
23
24
 
co
5
5
5
5
6
8
7
 
maxtemp_c
14.8
10.8
16.7
13.7
11.3
14.1
12.4
 
mintemp_c
3.1
8.3
9.0
10.1
8.6
10.8
10.1
 
avgtemp_c
12.6
9.7
14.0
12.0
10.6
12.5
11.5
 
maxwind_kph
13.7
13.0
15.1
10.1
16.6
15.8
12.2
 
totalprecip_mm
0.0
4.5
0.0
0.0
2.0
0.0
3.7
 
avghumidity
73.0
86.0
63.0
70.0
75.0
75.0
85.0
 


From these sorted lists alone, it is hard to tell whether PM2.5 has any relationship with the other features.
We will divide the PM2.5 values into high and low and set them as conditions for comparison.

According to https://blissair.com/what-is-pm-2-5.htm, a PM2.5 reading above 150 means the air quality is unhealthy. Thus, we will use this value as the threshold for splitting the PM2.5 values in our database.

### For PM2.5 greater than or equal to 150

In [32]:
# per-city averages on high-PM2.5 days (aggregation pipeline)
def unhealthy(value, threshold=150):
    """Print the per-city mean of a field over days with PM2.5 >= threshold.

    Uses the aggregation framework instead of map-reduce:
    ``Collection.map_reduce`` is not available in PyMongo 4, and a reduce
    function that returns an average is not safe to re-apply to its own
    output. Results are printed straight from the cursor, so this function
    does not write to (or overwrite) any collection.

    Args:
        value: Name of the document field to average (e.g. ``'co'``).
        threshold: PM2.5 cut-off; documents with ``pm25 >= threshold`` are
            included. Defaults to 150.
    """
    pipeline = [
        {'$match': {'pm25': {'$gte': threshold}}},
        # Group on the nested city name, matching how collect_info stores it.
        {'$group': {'_id': '$city.name', 'value': {'$avg': '$' + value}}},
    ]

    # Print every per-city result
    for row in mycol.aggregate(pipeline):
        print(row)

### For PM2.5 less than 150

In [33]:
# per-city averages on lower-PM2.5 days (aggregation pipeline)
def healthier(value, threshold=150):
    """Print the per-city mean of a field over days with PM2.5 < threshold.

    Uses the aggregation framework instead of map-reduce:
    ``Collection.map_reduce`` is not available in PyMongo 4, and a reduce
    function that returns an average is not safe to re-apply to its own
    output. Results are printed straight from the cursor, so this function
    does not write to (or overwrite) any collection.

    Args:
        value: Name of the document field to average (e.g. ``'co'``).
        threshold: PM2.5 cut-off; documents with ``pm25 < threshold`` are
            included. Defaults to 150.
    """
    pipeline = [
        {'$match': {'pm25': {'$lt': threshold}}},
        # Group on the nested city name, matching how collect_info stores it.
        {'$group': {'_id': '$city.name', 'value': {'$avg': '$' + value}}},
    ]

    # Print every per-city result
    for row in mycol.aggregate(pipeline):
        print(row)

### Compare results

In [35]:
# For each feature after 'pm25' in mycol_keys, print its name followed by the
# per-city averages on high-PM2.5 days, then a separator line.
for field in mycol_keys[5:]:
    print(field)
    unhealthy(field)
    print(' ')

o3
{'_id': 'chongqing', 'value': 23.0}
{'_id': 'beijing', 'value': 30.2}
 
co
{'_id': 'beijing', 'value': 9.0}
{'_id': 'chongqing', 'value': 11.0}
 
maxtemp_c
{'_id': 'beijing', 'value': 13.6}
{'_id': 'chongqing', 'value': 13.9}
 
mintemp_c
{'_id': 'chongqing', 'value': 12.1}
{'_id': 'beijing', 'value': 6.540000000000001}
 
avgtemp_c
{'_id': 'chongqing', 'value': 13.3}
{'_id': 'beijing', 'value': 10.48}
 
maxwind_kph
{'_id': 'beijing', 'value': 10.72}
{'_id': 'chongqing', 'value': 9.0}
 
totalprecip_mm
{'_id': 'chongqing', 'value': 0.2}
{'_id': 'beijing', 'value': 0.2}
 
avghumidity
{'_id': 'chongqing', 'value': 77.0}
{'_id': 'beijing', 'value': 47.0}
 


In [36]:
# For each feature after 'pm25' in mycol_keys, print its name followed by the
# per-city averages on lower-PM2.5 days, then a separator line.
for field in mycol_keys[5:]:
    print(field)
    healthier(field)
    print(' ')

o3
{'_id': 'shanghai', 'value': 30.571428571428573}
{'_id': 'chongqing', 'value': 24.166666666666668}
{'_id': 'beijing', 'value': 23.5}
 
co
{'_id': 'shanghai', 'value': 5.857142857142857}
{'_id': 'beijing', 'value': 6.0}
{'_id': 'chongqing', 'value': 6.666666666666667}
 
maxtemp_c
{'_id': 'shanghai', 'value': 13.400000000000002}
{'_id': 'beijing', 'value': 12.1}
{'_id': 'chongqing', 'value': 17.366666666666664}
 
mintemp_c
{'_id': 'shanghai', 'value': 8.571428571428571}
{'_id': 'beijing', 'value': 2.35}
{'_id': 'chongqing', 'value': 13.799999999999999}
 
avgtemp_c
{'_id': 'chongqing', 'value': 15.666666666666666}
{'_id': 'beijing', 'value': 7.8}
{'_id': 'shanghai', 'value': 11.842857142857143}
 
maxwind_kph
{'_id': 'chongqing', 'value': 9.116666666666667}
{'_id': 'beijing', 'value': 9.149999999999999}
{'_id': 'shanghai', 'value': 13.785714285714286}
 
totalprecip_mm
{'_id': 'shanghai', 'value': 1.457142857142857}
{'_id': 'beijing', 'value': 0.0}
{'_id': 'chongqing', 'value': 1.0666666

1. Higher average CO when the PM2.5 is high;
2. Less rain when the PM2.5 is high;
3. Higher humidity when the PM2.5 is high;
4. We thought wind would have an effect, but the difference is not obvious from the data collected.

More days of data need to be collected for a more accurate analysis.