# A Look into the Formats and GPS Locations of Files

In [18]:
import numpy as np
import pandas as pd

import pymongo
from pymongo import MongoClient

In [19]:
# Open a client with PyMongo's default connection settings, then select
# the 'metadata' collection of the 'parler_db' database.
client = MongoClient()
db = client['parler_db']
collection = db['metadata']

In [37]:
# Projection for the query: include each analysis field (1) and
# exclude '_id' (0) so no identifying information is returned.
projected_fields = (
    'CreateDate',
    'FileType',
    'geolocation',
    'Duration',
    'ImageWidth',
    'ImageHeight',
)
column_dict = dict.fromkeys(projected_fields, 1)
column_dict['_id'] = 0  # Remove identifying information

# Filter that requires every projected field (all keys except '_id')
# to be present in a document.
exists_dict = {
    field: {'$exists': True}
    for field in column_dict
    if field != '_id'
}

In [38]:
# Count documents that contain every projected field
# (not only 'geolocation', the GPS position in decimal form)
collection.count_documents(filter=exists_dict)

68454

In [46]:
# Run query: every document that has a 'geolocation' field, projected
# down to the analysis columns in column_dict.
geo_filter = {'geolocation': {'$exists': True}}
result = collection.find(geo_filter, column_dict)

# Cursor.count() is deprecated (and removed in PyMongo 4), so count the
# matching documents through the collection with the same filter.
collection.count_documents(geo_filter)

  result.count()


68463

## EDA

In [47]:
# Materialise the cursor into a list of documents and load them
# into a DataFrame (one row per document)
df = pd.DataFrame(data=list(result))

In [50]:
# Preview the first three rows
df.head(n=3)

Unnamed: 0,FileType,CreateDate,Duration,ImageWidth,ImageHeight,geolocation
0,MOV,2020:07:25 17:00:40,0:03:55,1920,1080,"[-117.6683, 33.4905]"
1,MP4,2020:06:10 09:30:48,0:05:20,1280,660,"[-1.3391, 52.0465]"
2,MOV,2021:01:01 06:04:07,12.00 s,1920,1080,"[-97.3518, 27.6804]"


In [48]:
# Number of rows per file format
df.loc[:, 'FileType'].value_counts()

MOV     50509
MP4     16865
3GP      1080
JPEG        8
AVI         1
Name: FileType, dtype: int64

In [52]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68463 entries, 0 to 68462
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   FileType     68463 non-null  object
 1   CreateDate   68460 non-null  object
 2   Duration     68455 non-null  object
 3   ImageWidth   68463 non-null  int64 
 4   ImageHeight  68463 non-null  int64 
 5   geolocation  68463 non-null  object
dtypes: int64(2), object(4)
memory usage: 3.1+ MB


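### Notes:
- MOV: QuickTime container format; typically recorded on Apple (iOS) devices
- 3GP: 3GPP multimedia container; typically from older, non-Apple 3G phones
- MP4: MPEG-4 container; typically non-Apple, likely Android
- JPEG: Still image
- AVI: Microsoft's Audio Video Interleave format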

In [53]:
# Drop rows with any missing value. Minimal data loss: the info()
# summaries show 68,463 rows before and 68,454 after (9 rows dropped).
# Reassign instead of inplace=True so the step reads as an explicit
# transformation of df rather than a hidden mutation.
df = df.dropna()

# Re-display the summary to confirm every column is now fully non-null
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68454 entries, 0 to 68462
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   FileType     68454 non-null  object
 1   CreateDate   68454 non-null  object
 2   Duration     68454 non-null  object
 3   ImageWidth   68454 non-null  int64 
 4   ImageHeight  68454 non-null  int64 
 5   geolocation  68454 non-null  object
dtypes: int64(2), object(4)
memory usage: 3.7+ MB
