In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
tqdm.pandas()

In [4]:
df1 = pd.read_csv(
    Path(os.getenv("AUTOFOCUS_DATA_DIR")) / "lpz_2012-2014" / "raw" / "labels.csv",
    parse_dates=["Date"],
    usecols=["Date", "Species", "filename_build"],
)

In [5]:
df1.head()

Unnamed: 0,Date,Species,filename_build
0,2012-01-24,Coyote,WI12/DPT/D10-VHC1-WI12/D10-VHC1-WI12 (23).JPG
1,2012-01-24,Coyote,WI12/DPT/D10-VHC1-WI12/D10-VHC1-WI12 (24).JPG
2,2012-01-24,Coyote,WI12/DPT/D10-VHC1-WI12/D10-VHC1-WI12 (25).JPG
3,2012-01-24,Coyote,WI12/DPT/D10-VHC1-WI12/D10-VHC1-WI12 (26).JPG
4,2012-01-26,Rabbit,WI12/DPT/D10-VHC1-WI12/D10-VHC1-WI12 (27).JPG


In [6]:
df2 = pd.read_csv(
    Path(os.getenv("AUTOFOCUS_DATA_DIR"))
    / "lpz_2016_2017"
    / "processed"
    / "labels.csv",
    parse_dates=["date"],
    usecols=["filename", "label", "date", "location"],
)

In [7]:
def get_path(filename_build):
    if filename_build.startswith("FA"):
        filename_build = Path(filename_build)
        return (
            Path(os.getenv("AUTOFOCUS_DATA_DIR"))
            / "lpz_2012-2014"
            / "raw"
            / Path(*filename_build.parts[:2])
            / "-".join(filename_build.parts[2].split("-")[:2])
            / Path(*filename_build.parts[3:])
        )
    else:
        return (
            Path(os.getenv("AUTOFOCUS_DATA_DIR"))
            / "lpz_2012-2014"
            / "raw"
            / filename_build
        )

In [8]:
df1.loc[:, "path"] = df1.loc[:, "filename_build"].progress_apply(get_path)

100%|██████████| 74560/74560 [00:02<00:00, 30866.77it/s]


In [9]:
df1 = df1.loc[df1.loc[:, "path"].apply(lambda x: x.exists()), :]

In [10]:
len(df1)

69346

In [11]:
df2.head()

Unnamed: 0,filename,label,date,location
0,CHIL - D02-BMT1-JU16_00128.JPG,human,2016-07-12,BMT1
1,CHIL - D02-BMT1-JU16_00129.JPG,human,2016-07-12,BMT1
2,CHIL - D02-BMT1-JU16_00130.JPG,empty,2016-07-12,BMT1
3,CHIL - D02-BMT1-JU16_00131.JPG,human,2016-07-12,BMT1
4,CHIL - D02-BMT1-JU16_00132.JPG,human,2016-07-12,BMT1


In [12]:
df2.loc[:, "path"] = df2.loc[:, "filename"].progress_apply(
    lambda x: (
        Path(os.getenv("AUTOFOCUS_DATA_DIR"))
        / "lpz_2016_2017"
        / "processed"
        / "images"
        / x
    )
)

100%|██████████| 84130/84130 [00:02<00:00, 34905.40it/s]


In [13]:
df2 = df2.drop("filename", axis="columns").dropna(subset=["label"])

In [14]:
df2.head()

Unnamed: 0,label,date,location,path
0,human,2016-07-12,BMT1,/autofocus/data/lpz_2016_2017/processed/images...
1,human,2016-07-12,BMT1,/autofocus/data/lpz_2016_2017/processed/images...
2,empty,2016-07-12,BMT1,/autofocus/data/lpz_2016_2017/processed/images...
3,human,2016-07-12,BMT1,/autofocus/data/lpz_2016_2017/processed/images...
4,human,2016-07-12,BMT1,/autofocus/data/lpz_2016_2017/processed/images...


In [15]:
df1 = df1.loc[:, ["Date", "Species", "path"]]

In [16]:
df1.columns = ["date", "label", "path"]

In [17]:
df1.head()

Unnamed: 0,date,label,path
0,2012-01-24,Coyote,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
1,2012-01-24,Coyote,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
2,2012-01-24,Coyote,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
3,2012-01-24,Coyote,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
4,2012-01-26,Rabbit,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...


In [18]:
pd.set_option("display.max_rows", 200)
pd.concat((df1.loc[:, "label"], df2.loc[:, "label"])).value_counts()

empty                                  46824
Empty                                  21902
Human                                  19383
human                                  16310
Gray squirrel                           5434
bird                                    3929
raccoon                                 3525
gray squirrel                           3132
Dom dog                                 3103
Raccoon                                 2680
Deer                                    2632
Opossum                                 2415
dog                                     1454
Coyote                                  1401
Rabbit                                  1318
w. t. deer                              1218
v. opossum                              1158
e. cottontail                           1048
Unknown                                  951
Car                                      949
unknown                                  912
Squirrel                                 893
Canada goo

In [19]:
df1.loc[:, "label"].unique()

array(['Coyote', 'Rabbit', 'Empty', 'Unknown', 'Skunk', 'Opossum', 'Deer',
       'Raccoon', 'Gray squirrel', 'Squirrel', 'Fox squirrel', 'Dom dog',
       'Human', 'Red fox', 'Canada goose', 'Dom cat', 'Bird', 'Bike',
       'Rat', 'Crow', 'Fox Squirrel', 'Gray Squirrel', 'Mallard', 'human',
       'Robin', 'House sparrow', 'Hermit thrush', 'Pigeon', 'Starling',
       'Grackle', 'Mower', 'Gray Fox', 'Mouse', 'Sparrow',
       'Yellow-Rumped Warbler', 'Flying squirrel', 'Mourning dove',
       'Hairy woodpecker', 'Cardinal', 'Chipmunk', 'Woodchuck', 'Car',
       'Truck', 'Other vehicle', 'Not visible', 'Domestic Cat', 'empty',
       'Red-winged blackbird', 'Tractor', 'mower', 'bike', 'car', '`',
       'CAr', '1', 'Mink', 'Beaver', 'Red winged blackbird',
       'Brown thrasher', 'Northern flicker', 'Great blue heron',
       'Blue jay', 'Blue Jay', 'Duck', 'Red winged black bird', 'Coyote ',
       'Great Horned Owl', 'gray squirrel', 'Red-tailed hawk',
       'American Robin', 'Wh

In [20]:
df2.loc[:, "label"].unique()

array(['human', 'empty', 'dog', 'unknown', 'fox squirrel', 'bird',
       'squirrel', 'gray squirrel', 'e. cottontail', 'raccoon', 'cat',
       "squirrel (can'\nsquirrel (can't ID)", 'chipmunk', 'coyote',
       'mouse', 'lawn mower', 'gray fox', 'w. t. deer', 'woodchuck',
       'rat', 'melanistic grey squirrel', 'flying squirrel', 'v. opossum',
       'striped skunk', 'Mower', 'beaver', 'mink', 'red fox', 'muskrat'],
      dtype=object)

In [21]:
{
    item: item
    for item in sorted(
        set(df2.loc[:, "label"].unique()) | set(df1.loc[:, "label"].unique())
    )
}

{'1': '1',
 'American Robin': 'American Robin',
 'American robin': 'American robin',
 'American woodcock': 'American woodcock',
 'Beaver': 'Beaver',
 'Bike': 'Bike',
 'Bird': 'Bird',
 'Blue Jay': 'Blue Jay',
 'Blue jay': 'Blue jay',
 'Brown thrasher': 'Brown thrasher',
 'CAr': 'CAr',
 'Canada Goose': 'Canada Goose',
 'Canada goose': 'Canada goose',
 'Car': 'Car',
 'Cardinal': 'Cardinal',
 'Chipmunk': 'Chipmunk',
 'Chipping Sparrow': 'Chipping Sparrow',
 'Coyote': 'Coyote',
 'Coyote ': 'Coyote ',
 'Crow': 'Crow',
 'Deer': 'Deer',
 'Dom Cat': 'Dom Cat',
 'Dom cat': 'Dom cat',
 'Dom dog': 'Dom dog',
 'Domestic Cat': 'Domestic Cat',
 'Downy woodpecker': 'Downy woodpecker',
 'Duck': 'Duck',
 'Empty': 'Empty',
 'Eurasian Collared Dove': 'Eurasian Collared Dove',
 'Eurasian Collared-Dove': 'Eurasian Collared-Dove',
 'Flying squirrel': 'Flying squirrel',
 'Fox Squirrel': 'Fox Squirrel',
 'Fox squirrel': 'Fox squirrel',
 'Grackle': 'Grackle',
 'Gray Catbird': 'Gray Catbird',
 'Gray Fox': 'Gray 

In [22]:
labelmap = {
    "1": None,
    "American Robin": "bird",
    "American robin": "bird",
    "American woodcock": "bird",
    "Beaver": "beaver_muskrat_woodchuck",
    "Bike": "human",
    "Bird": "bird",
    "Blue Jay": "bird",
    "Blue jay": "bird",
    "Brown thrasher": "bird",
    "CAr": "empty",
    "Canada Goose": "bird",
    "Canada goose": "bird",
    "Car": "empty",
    "Cardinal": "bird",
    "Chipmunk": "chipmunk",
    "Chipping Sparrow": "bird",
    "Coyote": "coyote",
    "Coyote ": "coyote",
    "Crow": "bird",
    "Deer": "deer",
    "Dom Cat": "cat",
    "Dom cat": "cat",
    "Dom dog": "dog",
    "Domestic Cat": "cat",
    "Downy woodpecker": "bird",
    "Duck": "bird",
    "Empty": "empty",
    "Eurasian Collared Dove": "bird",
    "Eurasian Collared-Dove": "bird",
    "Flying squirrel": "squirrel",
    "Fox Squirrel": "squirrel",
    "Fox squirrel": "squirrel",
    "Grackle": "bird",
    "Gray Catbird": "bird",
    "Gray Fox": "fox",
    "Gray Squirrel": "squirrel",
    "Gray catbird": "bird",
    "Gray squirrel": "squirrel",
    "Great Horned Owl": "bird",
    "Great blue heron": "bird",
    "Hairy woodpecker": "bird",
    "Hermit thrush": "bird",
    "House sparrow": "bird",
    "Human": "human",
    "Livestock": "livestock",
    "Mallard": "bird",
    "Mink": "mink",
    "Mourning dove": "bird",
    "Mouse": "mouse",
    "Mower": "human",
    "Muskrat": "beaver_muskrat_woodchuck",
    "Northern flicker": "bird",
    "Not Visible": "empty",
    "Not visible": "empty",
    "Opossum": "opossum",
    "Other vehicle": "empty",
    "Pigeon": "bird",
    "Rabbit": "rabbit",
    "Raccoon": "raccoon",
    "Rat": "rat",
    "Red fox": "fox",
    "Red winged black bird": "bird",
    "Red winged blackbird": "bird",
    "Red-tailed hawk": "bird",
    "Red-winged blackbird": "bird",
    "Ring-billed gull": "bird",
    "Robin": "bird",
    "Skunk": "skunk",
    "Song Sparrow": "bird",
    "Sparrow": "bird",
    "Squirrel": "squirrel",
    "Starling": "bird",
    "Swainson's thrush": "bird",
    "Tractor": "empty",
    "Truck": "empty",
    "Unknown": None,
    "White-crowned sparrow": "bird",
    "White-throated sparrow": "bird",
    "Woodchuck": "beaver_muskrat_woodchuck",
    "Yellow-Rumped Warbler": "bird",
    "`": None,
    "beaver": "beaver_muskrat_woodchuck",
    "bike": "human",
    "bird": "bird",
    "canada goose": "bird",
    "car": "empty",
    "cat": "cat",
    "chipmunk": "chipmunk",
    "coyote": "coyote",
    "dog": "dog",
    "domestic cat": "cat",
    "e. cottontail": "rabbit",
    "empty": "empty",
    "flying squirrel": "squirrel",
    "fox squirrel": "squirrel",
    "gray fox": "fox",
    "gray squirrel": "squirrel",
    "human": "human",
    "lawn mower": "human",
    "melanistic grey squirrel": "squirrel",
    "mink": "mink",
    "mouse": "mouse",
    "mower": "human",
    "muskrat": "beaver_muskrat_woodchuck",
    "raccoon": "raccoon",
    "rat": "rat",
    "red fox": "fox",
    "squirrel": "squirrel",
    "squirrel (can'\nsquirrel (can't ID)": "squirrel",
    "striped skunk": "skunk",
    "unknown": None,
    "v. opossum": "opossum",
    "w. t. deer": "deer",
    "woodchuck": "beaver_muskrat_woodchuck",
}
pd.concat((df1.loc[:, "label"], df2.loc[:, "label"])).map(labelmap).value_counts(
    dropna=False
)

empty                       69852
human                       36620
squirrel                    11219
bird                         6865
raccoon                      6205
dog                          4557
deer                         3850
opossum                      3573
rabbit                       2366
NaN                          1866
coyote                       1853
skunk                        1009
cat                           783
rat                           243
mouse                         227
chipmunk                      168
fox                           141
beaver_muskrat_woodchuck       29
mink                           12
livestock                       3
Name: label, dtype: int64

In [23]:
df1.loc[:, "location"] = df1.loc[:, "path"].progress_apply(
    lambda x: x.parts[7].split("-")[1]
)

100%|██████████| 69346/69346 [00:00<00:00, 308868.32it/s]


In [24]:
df = pd.concat((df1, df2))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [25]:
df.loc[:, "label"] = df.loc[:, "label"].map(labelmap)

In [26]:
df.loc[:, "path"].value_counts().value_counts()

1    143612
2      3728
3       119
4         4
Name: path, dtype: int64

In [27]:
df.head()

Unnamed: 0,date,label,location,path
0,2012-01-24,coyote,VHC1,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
1,2012-01-24,coyote,VHC1,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
2,2012-01-24,coyote,VHC1,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
3,2012-01-24,coyote,VHC1,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...
4,2012-01-26,rabbit,VHC1,/autofocus/data/lpz_2012-2014/raw/WI12/DPT/D10...


In [28]:
df = df.dropna(subset=["label"])

In [29]:
df_grouped = df.groupby("path").agg(
    {"date": "first", "label": set, "location": "first"}
)

In [30]:
{"thing"} - {"empty"}

{'thing'}

In [31]:
df_grouped.loc[:, "label"] = df_grouped.loc[:, "label"].progress_apply(
    lambda x: x - {"empty"} if len(x) > 1 else x
)

100%|██████████| 145848/145848 [00:00<00:00, 524442.17it/s]


In [32]:
df_grouped.loc[:, "label"].progress_apply(len).value_counts()

100%|██████████| 145848/145848 [00:00<00:00, 556058.29it/s]


1    144861
2       981
3         6
Name: label, dtype: int64

In [33]:
df_grouped.loc[
    df_grouped.loc[:, "label"].progress_apply(len) > 1, "label"
].progress_apply(tuple).value_counts()

100%|██████████| 145848/145848 [00:00<00:00, 564614.35it/s]
100%|██████████| 987/987 [00:00<00:00, 369722.07it/s]


(human, dog)                            742
(bird, squirrel)                         71
(opossum, raccoon)                       44
(squirrel, rabbit)                       16
(human, bird)                            15
(human, squirrel)                        15
(bird, rabbit)                           11
(coyote, raccoon)                         4
(squirrel, skunk)                         3
(rabbit, raccoon)                         3
(deer, raccoon)                           3
(bird, rat)                               3
(squirrel, opossum)                       3
(squirrel, raccoon)                       3
(bird, dog)                               3
(rabbit, rat)                             3
(opossum, skunk)                          2
(rabbit, skunk)                           2
(dog, squirrel)                           2
(squirrel, deer)                          2
(dog, bird)                               2
(human, bird, dog)                        2
(dog, cat)                      

In [34]:
df_grouped.loc[:, "label"] = df_grouped.loc[:, "label"].progress_apply(list)

100%|██████████| 145848/145848 [00:00<00:00, 295574.70it/s]


In [35]:
df_grouped.index = df_grouped.index.astype(str)

In [37]:
df_grouped.loc[:, "label"] = df_grouped.loc[:, "label"].progress_apply(
    lambda x: [] if x[0] == "empty" else x
)

100%|██████████| 145848/145848 [00:00<00:00, 555586.10it/s]


In [38]:
np.random.seed(2)

In [39]:
val_locs = set(np.random.choice(df.loc[:, "location"].unique(), size=12))

In [40]:
df_grouped.loc[:, "is_val"] = df_grouped.loc[:, "location"].apply(
    lambda x: x in val_locs
)

In [41]:
outpath_template = "s3://autofocus/lpz_data/labels_2012_2016_2017__{}.parquet"

In [49]:
df_grouped.loc[df_grouped.loc[:, "is_val"]].rename(
    columns={"label": "tags"}
).to_parquet(outpath_template.format("val"))

In [51]:
outpath_template.format("val")

's3://autofocus/lpz_data/labels_2012_2016_2017__val.parquet'

In [50]:
df_grouped.loc[~df_grouped.loc[:, "is_val"]].rename(
    columns={"label": "tags"}
).to_parquet(outpath_template.format("train"))