# Data Acquisition

## Loading Data
- Data was acquired and released by Twitter user @donk_enby

In [1]:
# File read libraries
import glob
import json

In [1]:
# Note: output cleared to remove identifying information

# Create list of filenames in metadata directory
# (the pattern is relative, so it resolves against the current working directory)
filenames = glob.glob('metadata/*.json')
print('Number of json files:', len(filenames))
# filenames[0] raises IndexError if the pattern matched nothing
print('Sample filename:', filenames[0])

In [3]:
# Function to load file and read json as Python dictionary
def get_json(filename):
    """Load a metadata file and return element 0 of its top-level JSON value.

    The rest of this notebook treats the returned element as a dict
    (it calls `.keys()` on it and assigns `_id` into it).

    Raises whatever `open` / `json.load` raise for an unreadable or invalid
    file, and IndexError / KeyError if the top-level value has no element 0.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)[0]

# Function to get number of keys in document
def get_key_len(filename):
    """Return how many top-level keys the document loaded from `filename` has."""
    document = get_json(filename)
    return len(document.keys())

In [4]:
# Check whether every metadata file has the same number of keys as the first file
keys = get_key_len(filenames[0])
l = len(filenames)
for i in range(1, l):
    current = get_key_len(filenames[i])
    # Stops at the first file whose key count differs from the first file's
    assert current == keys, f'{current} is not equal to {keys}'

AssertionError: 61 is not equal to 85

### Notes:
- The metadata is not consistent (the number of keys varies between files), so it might be best to use a NoSQL DB
- MongoDB is a document-oriented NoSQL DB that can store JSON-like data

## Create MongoDB Database

In [10]:
# Official MongoDB driver
import pymongo
from pymongo import MongoClient

# Regex
import re

In [7]:
# Instantiate client connection
# (no arguments: relies on pymongo's default host/port — presumably a local mongod; confirm)
client = MongoClient()

# Set db
db = client.parler_db

# Set collection
collection = db.metadata

In [19]:
# Function to get unique metadata id
def get_id(filename):
    """Extract the unique metadata id from a path shaped like `dir/PREFIX-ID.json`.

    The id is everything between the first hyphen of the file's base name and
    the trailing `.json` extension (the id itself may contain hyphens).

    Raises ValueError if `filename` does not have that shape.
    """
    # [^/\\]*? keeps the match inside the base name, so a hyphen in a directory
    # name is skipped. \. matches a literal dot: the previous unescaped `.`
    # matched any character and truncated ids containing e.g. "xjson".
    # $ pins the extension to the end of the path.
    match = re.search(r'(?<=-)([^/\\]*?)(?=\.json$)', filename)
    if match is None:
        raise ValueError(f'cannot extract id from filename: {filename!r}')
    return match[0]

In [21]:
# Read and insert metadata into database
# NOTE: one round trip per file, and not idempotent — every document gets an
# explicit _id, so re-running against a populated collection should fail on the
# first duplicate _id (confirm before re-running)
for filename in filenames:
    j = get_json(filename)
    # Use the id embedded in the filename as the document's primary key
    j['_id'] = get_id(filename)
    collection.insert_one(j)

In [29]:
# Check number of filenames
print('JSON files to insert:', len(filenames))

# Check number of documents inserted
# (an empty filter counts every document in the collection)
print('Documents in MongoDB:', collection.count_documents({}))

JSON files to insert: 1032523
Documents in MongoDB: 1032523


## GPS Data

In [174]:
# For Bulk Write operations
from pymongo import UpdateOne

In [50]:
# Get a sample GPS coordinate
# Filter on the field: only a fraction of the documents carry GPS data, so an
# unfiltered find_one() can return a document without these keys and the
# lookups below would raise KeyError
sample = collection.find_one({'GPSLatitude': {'$exists': True}})
print('Lat:', sample['GPSLatitude'])
print('Long:', sample['GPSLongitude'])

Lat: 33 deg 29' 25.80" N
Long: 117 deg 40' 5.88" W


In [167]:
# Function to convert DMS to decimal
def dms2dec(dms):
    """Convert a DMS string such as `33 deg 29' 25.80" N` to decimal degrees.

    The result is negated when the trailing cardinal letter is S or W.
    """
    def _grab(pattern):
        # Whole regex match within `dms`, with surrounding whitespace removed
        return re.search(pattern, dms)[0].strip()

    degrees = float(_grab(r'(.*?)(?=deg)'))
    minutes = float(_grab(r'(?<=deg)(.*?)(?=\')')) / 60
    seconds = float(_grab(r'(?<=\')(.*?)(?=")')) / 3600
    hemisphere = _grab(r'(?<=")(.*)')

    decimal = degrees + minutes + seconds
    return -decimal if hemisphere in ('S', 'W') else decimal

# Function to convert lat, long to decimal
def gps_dec(doc):
    """Convert a document's DMS GPS fields to rounded decimal degrees.

    Reads `GPSLatitude` and `GPSLongitude` from `doc` and returns
    `[longitude, latitude]`, each rounded to 4 decimal places. A coordinate
    that is missing or cannot be parsed is returned as None.
    """
    def _to_decimal(dms):
        # dms2dec raises TypeError when the value is not a string or one of its
        # regexes finds no match, and ValueError when a captured part is not
        # numeric. Catch only those so KeyboardInterrupt and genuine bugs
        # still surface instead of silently becoming None.
        try:
            return round(dms2dec(dms), 4)
        except (TypeError, ValueError):
            return None

    # .get(): a document carrying only one of the two fields yields None for
    # the other instead of raising KeyError
    lat = _to_decimal(doc.get('GPSLatitude'))
    long = _to_decimal(doc.get('GPSLongitude'))
    return [long, lat]

In [168]:
# Sanity check: convert the sample document's coordinates
gps_dec(sample)

[-117.6683, 33.4905]

In [169]:
# Count documents that have a GPSLatitude field at all
gps_lat = collection.count_documents({'GPSLatitude': {'$exists': True}})
print('Docs with GPS Lat:', gps_lat)

Docs with GPS Lat: 68463


In [180]:
# Get all docs with GPS data
# Project only the fields the conversion needs (_id is included by default)
# instead of transferring every metadata key. The cursor is lazy and can be
# iterated only once.
docs = collection.find(
    {'GPSLatitude': {'$exists': True}},
    {'GPSLatitude': 1, 'GPSLongitude': 1},
)

In [171]:
# Test: write the converted coordinate back to the sample document only
x = sample['_id']
coord = gps_dec(sample)
result = collection.update_one(
    {'_id': x},
    # $set adds the field or overwrites an existing value
    {'$set': {'geolocation': coord}}
)

In [172]:
# Raw server response for the single update above (an update result, not a cursor)
result.raw_result

{'n': 1, 'nModified': 0, 'ok': 1.0, 'updatedExisting': True}

In [3]:
# Note: output cleared to remove identifying information

# Field set in document
# (the projection limits the result to _id and geolocation)
collection.find_one({'_id': x}, {'geolocation': 1})

In [181]:
# List of bulk write operations: one UpdateOne per GPS document, matched by _id
# NOTE: iterating `docs` consumes the cursor — re-create it before re-running this cell
requests = [
    UpdateOne({'_id': doc['_id']}, {'$set': {'geolocation': gps_dec(doc)}})
    for doc in docs
]

In [2]:
# Note: output cleared to remove identifying information

# Samples: first and last queued operations
print(requests[0])
print(requests[-1])

In [186]:
# Bulk Write
# ordered=False: the updates are independent of each other, so the server need
# not apply them in list order or stop at the first failure
result = collection.bulk_write(requests, ordered=False)

In [189]:
# Summary of the bulk write as reported by the server (a result document, not a cursor)
result.bulk_api_result

{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 68463,
 'nModified': 68462,
 'nRemoved': 0,
 'upserted': []}