In [1]:
import pandas as pd
import sqlite3
import os

In [4]:
# Define the path to the database file
db_path = "../iNaturalist/data/db/inat_open_data.sq3db"

# Create the URI connection string with read-only mode
uri = f"file:{db_path}?mode=ro"

# Look up the taxon by name. The name is bound as a parameter: a
# double-quoted "Araneae" is identifier syntax in SQL and only works as a
# string through SQLite's legacy fallback.
sq_query = """
SELECT *
FROM taxa
WHERE name = ?
LIMIT 10
"""

# Attempt to connect to the database in read-only mode
try:
    conn = sqlite3.connect(uri, uri=True)
except sqlite3.OperationalError as e:
    print(f"Error: {e}")
    print("The database does not exist or cannot be opened in read-only mode.")
    # Re-raise so the cell stops here instead of failing below with a
    # NameError (or silently reusing a stale `conn` from an earlier run).
    raise
print("Database connected successfully.")

# Run the query and release the connection even if the query fails.
try:
    df = pd.read_sql(
        sq_query,
        conn,
        params=("Araneae",),
    )
finally:
    conn.close()
df


Database connected successfully.


Unnamed: 0,taxon_id,ancestry,rank_level,rank,name,active
0,47118,48460/1/47120/245097/47119,40.0,order,Araneae,1


In [3]:
# Read the spider photo-URL table from its CSV export; the bare name on the
# last line renders the frame as the cell output.
spider_urls_csv = "../data/csvs/spider_urls.csv"
spidersDF = pd.read_csv(spider_urls_csv)
spidersDF

Unnamed: 0,observer_id,login,name,photo_url,taxon_name,ancestry,taxon_id,rank,photo_id,photo_uuid,extension
0,505,dloarie,Don Loarie,http://inaturalist-open-data.s3.amazonaws.com/...,Steatoda grossa,48460/1/47120/245097/47119/47118/120474/342614...,61997,species,22295,12e6ab9f-51cf-4b58-bc11-53d709a3ff96,JPG
1,505,dloarie,Don Loarie,http://inaturalist-open-data.s3.amazonaws.com/...,Steatoda grossa,48460/1/47120/245097/47119/47118/120474/342614...,61997,species,22296,77d68fba-8b99-47a7-83e9-f38d588b4623,jpg
2,1,kueda,Ken-ichi Ueda,http://inaturalist-open-data.s3.amazonaws.com/...,Alopecosa kochi,48460/1/47120/245097/47119/47118/120474/342614...,143470,species,22618,47306769-93a6-46b2-8ef5-247fef2fe19f,jpg
3,505,dloarie,Don Loarie,http://inaturalist-open-data.s3.amazonaws.com/...,Calisoga longitarsis,48460/1/47120/245097/47119/47118/84719/1467321...,50989,species,22839,d261f084-6510-47fd-8f3e-ecaa2ec06d71,jpg
4,505,dloarie,Don Loarie,http://inaturalist-open-data.s3.amazonaws.com/...,Arctosa littoralis,48460/1/47120/245097/47119/47118/120474/342614...,67712,species,22840,a2209413-c6a5-476b-b6e2-0afea5a46237,jpg
...,...,...,...,...,...,...,...,...,...,...,...
2451461,6416295,tohrus,Laine Romain,http://inaturalist-open-data.s3.amazonaws.com/...,Zilla diodia,48460/1/47120/245097/47119/47118/120474/342614...,486472,species,386783110,4c7e0943-f4e9-49de-b33a-456a8d6f8a67,jpg
2451462,6416295,tohrus,Laine Romain,http://inaturalist-open-data.s3.amazonaws.com/...,Zilla diodia,48460/1/47120/245097/47119/47118/120474/342614...,486472,species,386783135,4bdb1ba7-49c0-4902-b8ec-684e449abbcf,jpg
2451463,1144503,davidgeorge,David George,http://inaturalist-open-data.s3.amazonaws.com/...,Dolomedes albineus,48460/1/47120/245097/47119/47118/120474/342614...,362543,species,386783530,a0befe8b-6048-44bd-bfc9-cd9306c77f96,jpeg
2451464,4189735,wonderbearruzka,Andy Wolfe,http://inaturalist-open-data.s3.amazonaws.com/...,Dolomedes tenebrosus,48460/1/47120/245097/47119/47118/120474/342614...,82117,species,386784154,564b83b8-827c-4b8e-95c1-fe4f56cfb8bf,jpg


In [4]:
# Count rows per taxon_name and order the species from most to fewest rows.
countDf = (
    spidersDF
    .groupby('taxon_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Keep the 20 species with the highest counts.
topSpecies = countDf.head(20)
topSpecies


Unnamed: 0,taxon_name,count
575,Araneus diadematus,77315
5454,Phidippus audax,73085
710,Argiope aurantia,60106
4436,Misumena vatia,44611
6197,Salticus scenicus,40350
7197,Trichonephila clavipes,36679
5772,Pisaura mirabilis,35069
5804,Platycryptus undatus,34145
2542,Gasteracantha cancriformis,32534
709,Argiope argentata,31746


In [8]:
# Maximum number of rows (photos) kept per species in the training subset.
IMAGES_PER_SPECIES = 1000

# Keep only the rows whose species is one of the selected top species.
filteredDf = spidersDF[spidersDF['taxon_name'].isin(topSpecies['taxon_name'])]

# Take the first IMAGES_PER_SPECIES rows of each species.
# GroupBy.head replaces groupby().apply(lambda x: x.head(...)): it avoids a
# Python-level call per group and does not rely on the grouping column being
# passed to the applied function (deprecated since pandas 2.2), so
# 'taxon_name' is always retained. The stable sort reproduces the original
# layout: species in sorted order, rows in their original order within each.
spiderTrainDf = (
    filteredDf
    .groupby('taxon_name')
    .head(IMAGES_PER_SPECIES)
    .sort_values('taxon_name', kind='stable')
    .reset_index(drop=True)
)
spiderTrainDf

Unnamed: 0,observer_id,login,name,photo_url,taxon_name,ancestry,taxon_id,rank,photo_id,photo_uuid,extension
0,873,tapbirds,Scott Cox,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,34568,c822cb62-cd40-4417-9c82-6fe6cd7b9484,jpg
1,3161,greenrosettas,Chris Cook,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,62248,87999043-a1ee-454a-bc96-43ebb326dbf0,JPG
2,3161,greenrosettas,Chris Cook,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,62249,1a44ee1f-8cc2-4e54-8528-94698abd573f,JPG
3,1620,cyric,,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,64633,c867fba7-dc58-43f0-ba27-fe3392aa4bc1,jpg
4,357,annetanne,,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,9354,d60fc711-48ae-46d6-a933-8decd02ddcf2,jpg
...,...,...,...,...,...,...,...,...,...,...,...
19995,153098,fabbyg,,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6552608,f064a167-1711-479d-a252-dbe0240b8fa3,jpg
19996,285603,rmcminds,Ryan McMinds,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6558130,6812ed28-573a-4235-87e3-1d853c3195ac,jpg
19997,308396,armando_forest_al13,Forest_13,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6564928,91373588-59b4-4fb3-90c4-a1881b46eef1,jpeg
19998,308396,armando_forest_al13,Forest_13,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6564973,8d665386-4b92-4349-ab6d-0139ce11743b,jpeg


In [9]:
# Persist the training subset for the image downloader. The frame's index is
# a fresh RangeIndex (reset_index(drop=True)) carrying no information, so it
# is not written; otherwise it would reappear as an "Unnamed: 0" column when
# the file is read back.
spiderTrainDf.to_csv("spiderTraining1000.csv", index=False)

In [11]:
%%bash
# Run the image download script on the training CSV and report wall-clock time.
# --url_column / --name_column name the CSV columns passed to the script.
time python ../scripts/ImgDownload.py \
    --input_path spiderTraining1000.csv \
    --output_folder training20-1000 \
    --url_column photo_url \
    --name_column taxon_name


100%|██████████| 20000/20000 [00:44<00:00, 452.58it/s]


Completed with 3 errors.



real	0m47.212s
user	0m34.102s
sys	0m25.360s
