In [62]:
from pyspark.sql import SparkSession

In [63]:
# Start (or reuse) the Spark session named 'data_process' that the rest of
# the notebook reads and transforms data with.
spark = (
    SparkSession.builder
    .appName('data_process')
    .getOrCreate()
)

In [64]:
# Load the two raw CSV exports, using the first line of each file as the
# column names. No schema is supplied here.
df_camera = spark.read.csv(
    "deployment_metadata_20201014181326.csv",
    header=True,
)
df_image = spark.read.csv(
    "sianctapi-selected-observations-5f87781821a06.csv",
    header=True,
)

In [65]:
# Narrow both frames to the columns used later: the camera's ID and
# coordinates, and the per-sequence observation details.
df_camera_data = df_camera.select("deployment_id", "actual_lat", "actual_long")
df_image_data = df_image.select(
    "Deployment ID",
    "Sequence ID",
    "End Time",
    "Species Name",
    "Common Name",
    "Age",
    "Sex",
)

In [66]:
# "Species Name" labels to exclude from the observations.
# NOTE(review): several spellings ("Unkown", "Samll") look like typos. They are
# kept verbatim because the filter matches on exact strings — confirm against
# the labels that actually appear in the CSV before correcting any of them.
del_species_names = [
    "Camera Trapper",
    "No Animal",
    "Other Bird species",
    "Unkown Animal",
    "Unknown Cervid",
    "Vehicle",
    "Homo sapiens",
    "Unkown Canid",
    "Unkown Bear",
    "Unknown Small Rodent",
    "Unknown Rabbit_Hare",
    "Unknown Samll Weasel",
]

In [67]:
# Drop every observation whose "Species Name" is in the exclusion list.
df_useful_images = df_image_data.filter(
    ~df_image_data["Species Name"].isin(del_species_names)
)

# Distinct deployment IDs that still have at least one kept observation.
# The IDs are read straight from the collected Spark rows; converting the
# frame to pandas first is unnecessary for a single column of values.
camera_id = [
    row["Deployment ID"]
    for row in df_useful_images.select("Deployment ID").distinct().collect()
]

# Keep only the cameras that are referenced by a kept observation.
df_useful_cameras = df_camera_data.filter(
    df_camera_data["deployment_id"].isin(camera_id)
)

In [68]:
# Attach each camera's coordinates to its observations by matching the two
# deployment-ID columns. The left join keeps every observation row, including
# any that have no matching camera row.
df_combined_data = df_useful_images.join(
    df_useful_cameras,
    on=df_useful_images["Deployment ID"] == df_useful_cameras["deployment_id"],
    how="left",
)

In [69]:
# Preview the first five joined rows.
df_combined_data.show(n=5)

+-------------+-----------+-------------------+--------------------+-----------------+-------+-------+-------------+----------+-----------+
|Deployment ID|Sequence ID|           End Time|        Species Name|      Common Name|    Age|    Sex|deployment_id|actual_lat|actual_long|
+-------------+-----------+-------------------+--------------------+-----------------+-------+-------+-------------+----------+-----------+
|       d46071|  d46071s10|2018-06-19T19:03:30|      Cervus elaphus| Elk aka Red Deer|  Adult| Female|       d46071|  47.04886| -113.23121|
|       d46071| d46071s100|2018-07-13T05:19:57|      Cervus elaphus| Elk aka Red Deer|Unknown|Unknown|       d46071|  47.04886| -113.23121|
|       d46071| d46071s107|2018-07-25T11:25:28|Odocoileus virgin...|White-tailed Deer|  Adult|   Male|       d46071|  47.04886| -113.23121|
|       d46071| d46071s108|2018-07-25T13:49:59|Odocoileus virgin...|White-tailed Deer|  Adult|   Male|       d46071|  47.04886| -113.23121|
|       d46071| d460

In [47]:
import requests
import json

In [48]:
# Endpoint that the collected observation data is uploaded to with an HTTP PUT.
url = (
    "https://dsci551-project.firebaseio.com"
    "/project.json"
)

In [49]:
# Build one nested record per observation row, keyed by its "Sequence ID".
# If two rows share a "Sequence ID", the later row replaces the earlier one.
image_dict = {}

# DataFrame.collect() already returns the rows as Row objects, so the extra
# conversion through .rdd is not needed.
for row in df_combined_data.collect():
    image_dict[row["Sequence ID"]] = {
        "ID": row["Sequence ID"],
        "time": row["End Time"],
        "formal name": row["Species Name"],
        "common name": row["Common Name"],
        "age": row["Age"],
        "sex": row["Sex"],
        # Location of the camera that captured this sequence.
        "camera_info": {
            "ID": row["Deployment ID"],
            "lat": row["actual_lat"],
            "long": row["actual_long"],
        },
    }

In [25]:
# PUT the whole observation dictionary, serialized as JSON, to `url`.
# The timeout stops the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting a failed upload pass unnoticed.
response = requests.put(url, data=json.dumps(image_dict), timeout=60)
response.raise_for_status()

In [24]:
# Keep a local JSON copy of the observation dictionary.
file_path = "project_data.json"
with open(file_path, mode="w") as f:
    f.write(json.dumps(image_dict))

In [57]:
# Shut down the Spark session.
# NOTE(review): a later cell calls df_useful_cameras.show(). On a top-to-bottom
# run that call comes after this stop and will likely fail — confirm the
# intended cell order.
spark.stop()

### Save all species names (both formal and common) to a text file

In [58]:
# Distinct "age" values across all records, in the order they first appear.
# dict.fromkeys() de-duplicates while keeping first-seen order.
age = list(dict.fromkeys(record["age"] for record in image_dict.values()))

In [59]:
# Display the distinct age labels found in image_dict.
age

['Adult', 'Unknown', 'Juvenile']

In [70]:
# Preview the cameras that have at least one kept observation.
# NOTE(review): this needs a live Spark session, but an earlier cell calls
# spark.stop(); on a top-to-bottom run this cell will likely fail — confirm
# the intended cell order.
df_useful_cameras.show()

+-------------+----------+-----------+
|deployment_id|actual_lat|actual_long|
+-------------+----------+-----------+
|       d46071|  47.04886| -113.23121|
|       d46072|  47.06821| -113.26897|
|       d47100|  47.09457| -113.23200|
|       d46069|  47.07061| -113.24006|
|       d66450|  47.09565| -113.20647|
|       d46074|  47.06059| -113.26067|
|       d44312|  47.06399| -113.25818|
|       d66455|  47.09335| -113.23933|
|       d66441|  47.10624| -113.19959|
|       d33769|  47.06470| -113.24599|
|       d33761|  47.10530| -113.23927|
|       d33751|  47.07180| -113.23661|
|       d33758|  47.10150| -113.22168|
|       d33760|  47.09970| -113.23070|
|       d33759|  47.09350| -113.23580|
|       d33770|  47.07660| -113.26166|
|       d33762|  47.09540| -113.22334|
|       d33752|  47.06780| -113.22416|
|       d33753|  47.06610| -113.23476|
|       d33749|  47.07590| -113.25140|
+-------------+----------+-----------+
only showing top 20 rows



In [54]:
# Every distinct species name, formal and common mixed in one list.
# For each record the formal name is considered before the common name, and
# dict.fromkeys() keeps only the first occurrence of each value in that order.
species_list = list(
    dict.fromkeys(
        name
        for record in image_dict.values()
        for name in (record["formal name"], record["common name"])
    )
)

In [56]:
# Write the collected names to species_name.txt, one name per line.
with open('species_name.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % name for name in species_list)