In [1]:
!pip install pymongo




In [2]:
!pip install pymongo[srv]




**Import Necessary Libraries**

In [3]:
import json
import os
from getpass import getpass
from pprint import pprint
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import pymongo
import requests
import seaborn as sns
%matplotlib inline

**Function to request the current station status and save it to a local JSON file**

In [4]:
def get_bike_data_store_json(destination_filename, timeout=30):
    """
    Download the current Citi Bike station status feed and save its station records as JSON.

    destination_filename: path (including the file name) of the JSON file to write
    timeout: seconds to wait for the server before giving up, so a stalled request cannot hang the notebook

    Raises requests.HTTPError when the server answers with an error status, so an error page is
    never parsed or written to disk as if it were station data.
    """
    response = requests.get("https://gbfs.citibikenyc.com/gbfs/en/station_status.json", timeout=timeout)
    response.raise_for_status()
    # The feed nests the list of station records under data -> stations.
    stations = response.json()['data']['stations']
    with open(destination_filename, "w") as write_file:
        json.dump(stations, write_file)

In [5]:
#get_bike_data_store_json('citibike1.json') #2:53pm 3/12/21

In [6]:
#get_bike_data_store_json('citibike2.json') #7:00pm 3/12/21

In [7]:
#get_bike_data_store_json('citibike3.json') #~7:30am 3/13/21

**Function to extract json data from json source file**

In [8]:
def get_json_data_from_file(source_filename):
    """
    Read the JSON document stored at source_filename and return it as the equivalent Python object
    (for example a list or a dictionary, depending on what the file contains)
    """
    with open(source_filename, "r") as json_file:
        return json.load(json_file)

In [9]:
# Load the three saved snapshots, in the order listed, into stations1..stations3.
stations1, stations2, stations3 = map(
    get_json_data_from_file,
    ('citibike1.json', 'citibike2.json', 'citibike3.json'),
)
# Show the first snapshot as a table (the value displayed by this cell).
pd.DataFrame(stations1)

Unnamed: 0,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys,valet,eightd_active_station_services
0,0,1,72,0,active,1615578491,25,72,30,1,0,1,False,,
1,0,1,79,0,active,1615578142,10,79,23,1,0,1,False,,
2,1,1,82,0,active,1615578634,3,82,23,1,0,1,False,,
3,0,1,83,0,active,1615578557,17,83,45,1,0,1,False,,
4,1,1,116,0,active,1615578531,29,116,20,1,0,1,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,0,0,4424,0,out_of_service,86400,0,4424,0,0,0,0,False,,
1383,2,1,4428,0,active,1615577230,24,4428,4,1,0,1,False,,
1384,1,1,4429,0,active,1615577386,22,4429,7,1,0,1,False,,
1385,0,0,4435,0,out_of_service,86400,0,4435,0,0,0,0,False,,


**Create connection to MongoDB cluster using PyMongo**

In [10]:
# Credentials come from the environment so the password is never stored in the notebook.
# If MONGODB_PASSWORD is unset, the password is asked for interactively instead.
user = os.environ.get("MONGODB_USER", "yanbron")
password = os.environ.get("MONGODB_PASSWORD") or getpass("MongoDB password: ")
# Percent-encode both parts: characters such as '@', ':' or '/' would otherwise corrupt the URI.
conn_string = f"mongodb+srv://{quote_plus(user)}:{quote_plus(password)}@cluster0.foouk.mongodb.net/"

**Create db for citi bike data**

In [11]:
# Connect to the cluster and list the databases that currently exist on it.
client = pymongo.MongoClient(conn_string)
dbnames = client.list_database_names()
print(dbnames)

# Start from a clean slate: if a database named "db" already exists, drop it
# (this permanently deletes everything stored in it).
if "db" in dbnames:
    print("db exists. Will be deleted...")
    client.drop_database("db")
# Handle to the "db" database used by the rest of the notebook.
db = client["db"]

['db', 'admin', 'local']
db exists. Will be deleted...


**Create a station collection**

In [12]:
# List the collections currently in the database and remove a leftover 'stations_col', if any.
col_names = db.list_collection_names()
print(col_names)
if "stations_col" in col_names:
    print("Stations exists. Will be deleted...")
    # Drop the collection that was just found ('stations_col'), so the existence check
    # and the drop target the same collection.
    db.stations_col.drop()
station_col = db["stations_col"]

[]


**Prepare the station data for insertion**

In [13]:
# Prepare the stations1 records for insertion: wrap 'last_reported' and 'num_bikes_available' in
# single-element lists so that values from later snapshots can be appended to them.
# The isinstance guard keeps this cell idempotent: re-running it does not nest the lists again.
for station in stations1:
    for field in ('last_reported', 'num_bikes_available'):
        if not isinstance(station[field], list):
            station[field] = [station[field]]
pd.DataFrame(stations1)

Unnamed: 0,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys,valet,eightd_active_station_services
0,0,1,72,0,active,[1615578491],25,72,[30],1,0,1,False,,
1,0,1,79,0,active,[1615578142],10,79,[23],1,0,1,False,,
2,1,1,82,0,active,[1615578634],3,82,[23],1,0,1,False,,
3,0,1,83,0,active,[1615578557],17,83,[45],1,0,1,False,,
4,1,1,116,0,active,[1615578531],29,116,[20],1,0,1,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,0,0,4424,0,out_of_service,[86400],0,4424,[0],0,0,0,False,,
1383,2,1,4428,0,active,[1615577230],24,4428,[4],1,0,1,False,,
1384,1,1,4429,0,active,[1615577386],22,4429,[7],1,0,1,False,,
1385,0,0,4435,0,out_of_service,[86400],0,4435,[0],0,0,0,False,,


In [14]:
# Empty the collection first (an empty filter matches every document),
# then bulk-insert the records of the first snapshot.
db["stations_col"].delete_many({})
db["stations_col"].insert_many(stations1)

<pymongo.results.InsertManyResult at 0x7fe7326905a0>

In [15]:
# 'stations_col' is expected to be listed now that documents have been inserted into it.
print(db.list_collection_names())

# Read every stored document back and show the result as a table.
cursor = db.stations_col.find({})
pd.DataFrame(cursor)

['stations_col']


Unnamed: 0,_id,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys,valet,eightd_active_station_services
0,605e041e5f3869ec51937efb,0,1,72,0,active,[1615578491],25,72,[30],1,0,1,False,,
1,605e041e5f3869ec51937efc,0,1,79,0,active,[1615578142],10,79,[23],1,0,1,False,,
2,605e041e5f3869ec51937efd,1,1,82,0,active,[1615578634],3,82,[23],1,0,1,False,,
3,605e041e5f3869ec51937efe,0,1,83,0,active,[1615578557],17,83,[45],1,0,1,False,,
4,605e041e5f3869ec51937eff,1,1,116,0,active,[1615578531],29,116,[20],1,0,1,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,605e041e5f3869ec51938461,0,0,4424,0,out_of_service,[86400],0,4424,[0],0,0,0,False,,
1383,605e041e5f3869ec51938462,2,1,4428,0,active,[1615577230],24,4428,[4],1,0,1,False,,
1384,605e041e5f3869ec51938463,1,1,4429,0,active,[1615577386],22,4429,[7],1,0,1,False,,
1385,605e041e5f3869ec51938464,0,0,4435,0,out_of_service,[86400],0,4435,[0],0,0,0,False,,


In [16]:
def display_station_info(station_ids=('72', '119')):
    """
    Return a DataFrame with the documents in db.stations_col whose 'station_id' is one of station_ids.

    station_ids: iterable of station id strings to look up; defaults to the two stations
                 ('72' and '119') tracked throughout this notebook
    Uses the notebook-level `db` database handle.
    """
    res = db.stations_col.find({"station_id": {"$in": list(station_ids)}})
    return pd.DataFrame(res)

In [17]:
# Show the stored documents for the two tracked stations.
display_station_info()

Unnamed: 0,_id,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys
0,605e041e5f3869ec51937efb,0,1,72,0,active,[1615578491],25,72,[30],1,0,1,False
1,605e041e5f3869ec51937f00,3,1,119,0,active,[1615578301],26,119,[24],1,0,1,False


In [21]:
def update_db(db, stations):
    """
    Merge a snapshot of station records into the stations_col collection of a PyMongo database.

    db: PyMongo database object that exposes the stations_col collection
    stations: iterable of station records (dicts) holding scalar 'last_reported' and
              'num_bikes_available' values; the records themselves are not modified

    For a station_id that is already stored, the new 'last_reported' and 'num_bikes_available'
    values are appended to the stored lists. Otherwise a copy of the record is inserted with
    those two fields wrapped in single-element lists.
    """
    for station in stations:
        query = {"station_id": station['station_id']}
        # $push appends on the server in one atomic step, so the stored lists never have to be
        # read back, extended locally and rewritten in full.
        append_latest = {"$push": {'last_reported': station['last_reported'],
                                   'num_bikes_available': station['num_bikes_available']}}
        result = db.stations_col.update_one(query, append_latest)
        # No document matched: this station has not been stored yet, so insert it.
        if result.matched_count == 0:
            new_document = dict(station)  # copy so the caller's record is left untouched
            new_document['last_reported'] = [station['last_reported']]
            new_document['num_bikes_available'] = [station['num_bikes_available']]
            db.stations_col.insert_one(new_document)

In [22]:
# Merge the second snapshot into the collection.
update_db(db, stations2)

In [23]:
# Re-check the two tracked stations: their list fields should now hold one more entry.
display_station_info()

Unnamed: 0,_id,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys
0,605e041e5f3869ec51937efb,0,1,72,0,active,"[1615578491, 1615593605]",25,72,"[30, 36]",1,0,1,False
1,605e041e5f3869ec51937f00,3,1,119,0,active,"[1615578301, 1615593902]",26,119,"[24, 27]",1,0,1,False


In [24]:
# Merge the third snapshot into the collection.
update_db(db, stations3)

In [25]:
# Re-check the two tracked stations after merging the third snapshot.
display_station_info()

Unnamed: 0,_id,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys
0,605e041e5f3869ec51937efb,0,1,72,0,active,"[1615578491, 1615593605, 1615640345]",25,72,"[30, 36, 40]",1,0,1,False
1,605e041e5f3869ec51937f00,3,1,119,0,active,"[1615578301, 1615593902, 1615633367]",26,119,"[24, 27, 32]",1,0,1,False


In [26]:
def get_diff_list(my_list):
    """
    Return the absolute differences between each pair of neighbouring elements of my_list,
    in order. The result has one element fewer than the input and is empty when the input
    has fewer than two elements
    """
    return [
        abs(my_list[left + 1] - my_list[left])
        for left in range(len(my_list) - 1)
    ]

In [29]:
def update_activity(db):
    """
    Add (or refresh) an 'activity' field on every document in db.stations_col.

    db: PyMongo database object that exposes the stations_col collection

    For each document, 'activity' is set to get_diff_list() of its 'num_bikes_available' list,
    i.e. the absolute change in available bikes between consecutive snapshots. A document
    without 'num_bikes_available' gets an empty activity list.
    """
    # Only the bike counts are needed; the projection still returns `_id` alongside them.
    for document in db.stations_col.find({}, {"num_bikes_available": 1}):
        activity = get_diff_list(document.get("num_bikes_available", []))
        # Address the document by its unique `_id` so exactly the document that was read is
        # updated, even if two documents were to share a station_id.
        db.stations_col.update_one({"_id": document["_id"]}, {"$set": {"activity": activity}})


In [30]:
# Compute and store the 'activity' field for every station document.
update_activity(db)

In [31]:
# Show the two tracked stations again, now including the 'activity' field.
display_station_info()

Unnamed: 0,_id,num_bikes_disabled,is_returning,legacy_id,num_ebikes_available,station_status,last_reported,num_docks_available,station_id,num_bikes_available,is_renting,num_docks_disabled,is_installed,eightd_has_available_keys,activity
0,605e041e5f3869ec51937efb,0,1,72,0,active,"[1615578491, 1615593605, 1615640345]",25,72,"[30, 36, 40]",1,0,1,False,"[6, 4]"
1,605e041e5f3869ec51937f00,3,1,119,0,active,"[1615578301, 1615593902, 1615633367]",26,119,"[24, 27, 32]",1,0,1,False,"[3, 5]"
