In [2]:
import pandas as pd
import sqlite3
import os

In [4]:
# Define the path to the database file
db_path = "../data/db/inat_open_data.sq3db"

# Create the URI connection string with read-only mode
uri = f"file:{db_path}?mode=ro"

# Attempt to connect to the database in read-only mode
try:
    conn = sqlite3.connect(uri, uri=True)
    print("Database connected successfully.")
except sqlite3.OperationalError as e:
    print(f"Error: {e}")
    print("The database does not exist or cannot be opened in read-only mode.")

sq_query = """
SELECT * 
FROM taxa
WHERE name = "Culicidae" 
LIMIT 10
"""

df = pd.read_sql(
    sq_query,
    conn
)
df

sql_query = """
WITH urls AS (SELECT 'http://inaturalist-open-data.s3.amazonaws.com/photos/' AS photo) 
SELECT B.*, U.photo || A.photo_id || '/' || 'medium.' || A.extension AS photo_url, D.name AS taxon_name, D.ancestry, D.taxon_id, D.rank, A.photo_id, A.photo_uuid, A.extension 
FROM urls U 
CROSS JOIN photos A 
JOIN observers B 
ON A.observer_id = B.observer_id 
JOIN observations C 
ON A.observation_uuid = C.observation_uuid 
LEFT JOIN taxa D 
ON C.taxon_id = D.taxon_id 
WHERE (D.ancestry LIKE '%"""+str(df['taxon_id'][0])+"""%' AND C.quality_grade = 'research' AND D.active = 1 AND D.rank = 'species')
"""

print(sql_query)

url_df = pd.read_sql_query(
    sql_query,
    conn
)

url_df

url_df.to_csv("../data/csv/mosquito_url.csv", index=False)

Database connected successfully.

WITH urls AS (SELECT 'http://inaturalist-open-data.s3.amazonaws.com/photos/' AS photo) 
SELECT B.*, U.photo || A.photo_id || '/' || 'medium.' || A.extension AS photo_url, D.name AS taxon_name, D.ancestry, D.taxon_id, D.rank, A.photo_id, A.photo_uuid, A.extension 
FROM urls U 
CROSS JOIN photos A 
JOIN observers B 
ON A.observer_id = B.observer_id 
JOIN observations C 
ON A.observation_uuid = C.observation_uuid 
LEFT JOIN taxa D 
ON C.taxon_id = D.taxon_id 
WHERE (D.ancestry LIKE '%52134%' AND C.quality_grade = 'research' AND D.active = 1 AND D.rank = 'species')



In [6]:
mosq_df = pd.read_csv("../data/csv/mosquito_url.csv")
mosq_df

Unnamed: 0,observer_id,login,name,photo_url,taxon_name,ancestry,taxon_id,rank,photo_id,photo_uuid,extension
0,533,vanhoutan,Kyle Van Houtan,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes albopictus,48460/1/47120/372739/47158/184884/47822/154259...,62984,species,42868,04dd66de-89ab-4d97-89e2-c39f55f7a747,jpg
1,169,flapack,,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes albopictus,48460/1/47120/372739/47158/184884/47822/154259...,62984,species,18278,4d4c3c87-b7a9-4046-bb3f-ae4e04695709,jpg
2,3847,rpayne,Ron Payne,http://inaturalist-open-data.s3.amazonaws.com/...,Anopheles punctipennis,48460/1/47120/372739/47158/184884/47822/154259...,213719,species,92260,5312ef2b-771a-46b1-9198-a58a6cc46dff,jpg
3,3785,treegrow,Katja Schulz,http://inaturalist-open-data.s3.amazonaws.com/...,Psorophora ferox,48460/1/47120/372739/47158/184884/47822/154259...,132756,species,141498,a35fdf55-66e2-4b1d-9b45-17e561d9c381,jpg
4,3785,treegrow,Katja Schulz,http://inaturalist-open-data.s3.amazonaws.com/...,Psorophora ferox,48460/1/47120/372739/47158/184884/47822/154259...,132756,species,143675,d5fcf478-4c8f-4d25-84bf-355785e93eac,jpg
...,...,...,...,...,...,...,...,...,...,...,...
58314,1037709,apistopanchax,Josh Emm,http://inaturalist-open-data.s3.amazonaws.com/...,Psorophora ciliata,48460/1/47120/372739/47158/184884/47822/154259...,259131,species,435647984,f3aacb9a-1331-4cb3-9bd1-177af89879b7,jpeg
58315,4251266,wet_specimen,hagfish,http://inaturalist-open-data.s3.amazonaws.com/...,Psorophora ferox,48460/1/47120/372739/47158/184884/47822/154259...,132756,species,435814342,d5688006-ceee-4fbf-98be-5b1a574061e7,jpeg
58316,6932624,fishmansf4,Sean Ford,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes japonicus,48460/1/47120/372739/47158/184884/47822/154259...,1112094,species,435672608,01cc3af1-eed7-4dc4-89c4-1233cd4c8c21,jpeg
58317,6932624,fishmansf4,Sean Ford,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes japonicus,48460/1/47120/372739/47158/184884/47822/154259...,1112094,species,435672616,4dde5a80-5056-4ec5-90fa-240c105ea144,jpeg


In [8]:
countDf = mosq_df.groupby('taxon_name').size().reset_index(name='count').sort_values(by='count', ascending=False)
topSpecies = countDf[0:30]
topSpecies


Unnamed: 0,taxon_name,count
9,Aedes albopictus,10662
5,Aedes aegypti,5682
279,Psorophora ciliata,3443
136,Aedes vexans,2269
89,Aedes notoscriptus,2258
286,Psorophora ferox,1977
70,Aedes japonicus,1975
320,Toxorhynchites rutilus,1896
228,Culex quinquefasciatus,1541
322,Toxorhynchites speciosus,1324


In [9]:
topSpecies["count"] / 100

9      106.62
5       56.82
279     34.43
136     22.69
89      22.58
286     19.77
70      19.75
320     18.96
228     15.41
322     13.24
242     13.22
180     13.02
196     12.37
117     11.67
133     10.65
283      9.59
134      9.01
281      7.52
8        6.24
247      5.95
Name: count, dtype: float64

In [9]:
import pandas as pd

# Assuming `df` is your original dataframe
# Example: df = pd.read_csv("your_data.csv")

# Step 1: Identify the top 10 most represented `taxon_name`
top_10_taxa = (
    mosq_df['taxon_name']
    .value_counts()
    .head(30)
    .index.tolist()
)

# Step 2: Filter the dataframe to include only the top 10 taxa
top_10_df = mosq_df[mosq_df['taxon_name'].isin(top_10_taxa)]

# Step 3: Proportional sampling by dividing each species count by 100
def sample_proportional(group):
    sample_size = max(1, len(group) // 100)  # Ensure at least one row is sampled
    return group.sample(n=sample_size, random_state=42)  # Random but reproducible

# Apply the proportional sampling
top_10_sampled_df = (
    top_10_df.groupby('taxon_name', group_keys=False)
    .apply(sample_proportional)
)

# Step 4: Reset index for the new dataframe (optional)
top_10_sampled_df.reset_index(drop=True, inplace=True)

# Example output: Save to a new CSV or view the result
# top_10_sampled_df.to_csv("top_10_sampled.csv", index=False)
top_10_sampled_df


Unnamed: 0,observer_id,login,name,photo_url,taxon_name,ancestry,taxon_id,rank,photo_id,photo_uuid,extension
0,5123391,lgarriga,Lucas Garriga,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes aegypti,48460/1/47120/372739/47158/184884/47822/154259...,155453,species,260271777,13e7a968-fa94-4e9e-8b4c-24087ab5d934,jpeg
1,2585297,jurisdiccion_huejutla,Jurisdiccion_Huejutla,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes aegypti,48460/1/47120/372739/47158/184884/47822/154259...,155453,species,93144173,7695b6b4-e227-45b4-880e-1550fd5d1933,jpg
2,4573202,gatorhawk,Josiah Londerée,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes aegypti,48460/1/47120/372739/47158/184884/47822/154259...,155453,species,239803722,3dc30f44-91e5-403a-b960-e83ce11f9dbc,jpeg
3,231635,waimeamiddleschool,,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes aegypti,48460/1/47120/372739/47158/184884/47822/154259...,155453,species,61048070,dfbacf92-b289-419c-997f-b922f9cde261,jpg
4,6532702,fer_mtz05,,http://inaturalist-open-data.s3.amazonaws.com/...,Aedes aegypti,48460/1/47120/372739/47158/184884/47822/154259...,155453,species,258328322,276c0f47-980a-4afe-ab9c-52b46141577b,jpg
...,...,...,...,...,...,...,...,...,...,...,...
457,1390240,aynature,,http://inaturalist-open-data.s3.amazonaws.com/...,Toxorhynchites speciosus,48460/1/47120/372739/47158/184884/47822/154259...,460871,species,259777837,b120821b-3085-432e-8c9e-d6e618a99d52,jpeg
458,2702713,juliesarna,,http://inaturalist-open-data.s3.amazonaws.com/...,Toxorhynchites speciosus,48460/1/47120/372739/47158/184884/47822/154259...,460871,species,176456341,495d5d41-a0b3-4d2e-8730-e9ec599add71,jpg
459,7657489,gina_jones_explorer,Gina and Bella The Explorers,http://inaturalist-open-data.s3.amazonaws.com/...,Toxorhynchites speciosus,48460/1/47120/372739/47158/184884/47822/154259...,460871,species,348359298,aa399341-2091-4fbc-b69c-ae6742917553,jpeg
460,2329106,cesdamess,,http://inaturalist-open-data.s3.amazonaws.com/...,Toxorhynchites speciosus,48460/1/47120/372739/47158/184884/47822/154259...,460871,species,165079992,2a1aa75a-9816-4b0b-b57b-b73ba82f950e,jpeg


In [10]:
top_10_sampled_df.to_csv("../data/csv/longtailed_mosq30.csv")

In [13]:
%%bash
time python ../bin/ImgDownload.py --input_path ../data/csv/longtailed_mosq30.csv --output_folder ../data/datasets/longtailed_mosq30s --url_column photo_url --name_column taxon_name


100%|██████████| 462/462 [00:02<00:00, 229.79it/s]


Completed with 0 errors.



real	0m2.269s
user	0m0.276s
sys	0m0.753s


In [16]:
from huggingface_hub import HfApi, Repository, login
import os
import csv

In [17]:
# Log in to the Hugging Face Hub
repo_name = "zkdeng/top30mosq-300"
local_dir = "longtailed_mosq30"
csv_file_path = "longtailed_mosq.csv"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/zideng/.cache/huggingface/token
Login successful


In [19]:
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir='../data/dataset/longtailed_mosq30')

Resolving data files:   0%|          | 0/462 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/462 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
dataset.push_to_hub("zkdeng/top30mosq-300")

Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

In [8]:
# filteredDf = mosq_df[mosq_df['taxon_name'].isin(topSpecies['taxon_name'])]
# spiderTrainDf = filteredDf.groupby('taxon_name').apply(lambda x: x.head(1000)).reset_index(drop=True)
# spiderTrainDf

Unnamed: 0,observer_id,login,name,photo_url,taxon_name,ancestry,taxon_id,rank,photo_id,photo_uuid,extension
0,873,tapbirds,Scott Cox,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,34568,c822cb62-cd40-4417-9c82-6fe6cd7b9484,jpg
1,3161,greenrosettas,Chris Cook,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,62248,87999043-a1ee-454a-bc96-43ebb326dbf0,JPG
2,3161,greenrosettas,Chris Cook,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,62249,1a44ee1f-8cc2-4e54-8528-94698abd573f,JPG
3,1620,cyric,,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,64633,c867fba7-dc58-43f0-ba27-fe3392aa4bc1,jpg
4,357,annetanne,,http://inaturalist-open-data.s3.amazonaws.com/...,Araneus diadematus,48460/1/47120/245097/47119/47118/120474/342614...,52628,species,9354,d60fc711-48ae-46d6-a933-8decd02ddcf2,jpg
...,...,...,...,...,...,...,...,...,...,...,...
19995,153098,fabbyg,,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6552608,f064a167-1711-479d-a252-dbe0240b8fa3,jpg
19996,285603,rmcminds,Ryan McMinds,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6558130,6812ed28-573a-4235-87e3-1d853c3195ac,jpg
19997,308396,armando_forest_al13,Forest_13,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6564928,91373588-59b4-4fb3-90c4-a1881b46eef1,jpeg
19998,308396,armando_forest_al13,Forest_13,http://inaturalist-open-data.s3.amazonaws.com/...,Trichonephila clavipes,48460/1/47120/245097/47119/47118/120474/342614...,904336,species,6564973,8d665386-4b92-4349-ab6d-0139ce11743b,jpeg
