In [1]:
!pip install opencv-python -U

Requirement already up-to-date: opencv-python in /home/shivaram/anaconda3/lib/python3.8/site-packages (4.5.5.64)


In [2]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import os
#import shutil
import urllib.request
import cv2
import pathlib
import random

# Create Hawaii Fish Table

In [3]:
# Fetch the FishBase country checklist page and parse it with BeautifulSoup.
# The timeout keeps the notebook from hanging forever on a stalled connection;
# requests waits indefinitely when no timeout is given.
html = requests.get(
    "https://www.fishbase.se/Country/CountryChecklist.php?showAll=yes&c_code=840B&vhabitat=all2&cpresence=present",
    timeout=60,
)
soup = bs(html.content, "html.parser")

In [4]:
# Load the cached species table when fish_df.csv exists; otherwise rebuild it
# from the checklist page already parsed into `soup` and write the cache.
try:
    fish_df = pd.read_csv("fish_df.csv")
except FileNotFoundError:
    # Only a missing cache triggers the rebuild (one HTTP request per species).
    # Other read errors now surface instead of being hidden by a bare except.
    print("Fish Dataframe not Found. Regenerating...")

    # `table` is not assigned in any visible cell, so this branch failed with
    # NameError on a fresh kernel.
    # NOTE(review): assumes the checklist is the first <table> on the page - confirm.
    table = soup.find("table")
    if table is None:
        raise RuntimeError("No <table> element found in the FishBase checklist page")

    fish_df = pd.read_html(str(table))[0]

    # Add fish page Links
    # NOTE(review): assumes exactly one anchor per table row; pandas raises on a
    # length mismatch when the column is assigned.
    links = []
    for anchor in table.find_all('a', href=True):
        links.append("https://www.fishbase.se/Country/" + anchor["href"])
    fish_df["Link"] = links

    # Add fishbase ids (the text after the last '=' of each link)
    fishbase_ids = [link.split('=')[-1] for link in links]
    fish_df.insert(0, "fishbase_id", fishbase_ids)

    # Add image counts and image page url for each species
    pic_links = ["https://www.fishbase.se/photos/thumbnailssummary.php?ID=" + fid for fid in fishbase_ids]
    fish_df["image_page_link"] = pic_links

    image_counts = []
    for link in pic_links:
        page = requests.get(link, timeout=60)
        p_soup = bs(page.content, "html.parser")
        # Counts every <img> tag on the species photo page.
        images = p_soup.find_all('img')
        image_counts.append(len(images))
    fish_df["image_counts"] = image_counts

    fish_df.to_csv("fish_df.csv", index = False)

In [5]:
# Preview the first rows of the species table.
fish_df.head()

Unnamed: 0,fishbase_id,Order,Family,Species,Occurrence,FishBase name,Name,Link,image_counts,image_page_link
0,972,Beloniformes,Belonidae,Ablennes hians,native,Flat needlefish,'Aha'aha,https://www.fishbase.se/Country/CountrySpecies...,30,https://www.fishbase.se/photos/thumbnailssumma...
1,6652,Ovalentaria/misc,Pomacentridae,Abudefduf abdominalis,native,Green damselfish,Mamamo,https://www.fishbase.se/Country/CountrySpecies...,10,https://www.fishbase.se/photos/thumbnailssumma...
2,5689,Ovalentaria/misc,Pomacentridae,Abudefduf sordidus,native,Blackspot sergeant,Ao'aonui,https://www.fishbase.se/Country/CountrySpecies...,69,https://www.fishbase.se/photos/thumbnailssumma...
3,6630,Ovalentaria/misc,Pomacentridae,Abudefduf vaigiensis,native,Indo-Pacific sergeant,Indo-Pacific sergeant,https://www.fishbase.se/Country/CountrySpecies...,145,https://www.fishbase.se/photos/thumbnailssumma...
4,89,Scombriformes,Scombridae,Acanthocybium solandri,native,Wahoo,Ono,https://www.fishbase.se/Country/CountrySpecies...,111,https://www.fishbase.se/photos/thumbnailssumma...


# Download Images from Fishbase

In [6]:
# Root folder for the downloaded FishBase images. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of an exists() guard.
os.makedirs("fishbase_images/", exist_ok=True)

In [7]:
# Download the non-thumbnail images listed on each species' photo page into
# fishbase_images/<fishbase_id>/.
base_url = "https://www.fishbase.se/"
image_extensions = {"jpg", "gif", "png", "jpeg"}  # built once, not per URL

for ind, row in fish_df.iterrows():
    id_path = "fishbase_images/" + str(row["fishbase_id"])
    # An existing folder is treated as "already downloaded".
    # NOTE(review): a run interrupted during the downloads below leaves a partial
    # folder that is skipped on every later run.
    if os.path.exists(id_path):
        print(str(row["fishbase_id"]) + " already generated. Skipping...")
        continue

    # Fetch the photo page before creating the folder, so a failed request does
    # not leave behind an empty folder that later runs would skip.
    html = requests.get(row["image_page_link"], timeout=60)
    p_soup = bs(html.content, "html.parser")

    os.makedirs(id_path)

    urls = []
    for img in p_soup.find_all('img'):
        # <img> tags without a src are ignored instead of raising KeyError.
        url = img.get('src')
        if not url:
            continue
        parts = url.split('/')
        # parts[2] is read here, so at least three components are required; the
        # previous `< 2` guard let two-component paths through to an IndexError.
        if len(parts) < 3 or parts[2] == "thumbnails":
            continue
        if url.split('.')[-1] not in image_extensions:
            continue
        urls.append(url)

    # Download Images
    for url in urls:
        # The first three characters of src are dropped before joining with the
        # site root - presumably a leading "../"; confirm against the page markup.
        full_url = base_url + url[3:]
        save_path = id_path + "/" + url.split('/')[-1]
        try:
            urllib.request.urlretrieve(full_url, save_path)
        except Exception:
            # Best-effort per image. Unlike a bare except, this still lets
            # KeyboardInterrupt stop the loop.
            print(f"{full_url} did not download successfully. Skipping")

972 already generated. Skipping...
6652 already generated. Skipping...
5689 already generated. Skipping...
6630 already generated. Skipping...
89 already generated. Skipping...
4306 already generated. Skipping...
4750 already generated. Skipping...
1256 already generated. Skipping...
4736 already generated. Skipping...
4737 already generated. Skipping...
1258 already generated. Skipping...
6011 already generated. Skipping...
4739 already generated. Skipping...
4738 already generated. Skipping...
4744 already generated. Skipping...
4734 already generated. Skipping...
1260 already generated. Skipping...
1261 already generated. Skipping...
58525 already generated. Skipping...
12600 already generated. Skipping...
22983 already generated. Skipping...
11512 already generated. Skipping...
63908 already generated. Skipping...
7502 already generated. Skipping...
9012 already generated. Skipping...
988 already generated. Skipping...
10 already generated. Skipping...
99 already generated. Skippin

# Generate Fish Classifier Dataset

In [8]:
# Source folders for the two classes of the fish / not-fish dataset.
cifar_path = "cifar_images/"
fishbase_path = "fishbase_images/"

# Walk each folder recursively and keep the joined paths whose text after the
# last '.' is an accepted extension (jpg only for CIFAR; jpg/png/jpeg for FishBase).
cifar_images = [
    full_path
    for root, _dirs, filenames in os.walk(cifar_path)
    for full_path in [os.path.join(root, fname) for fname in filenames]
    if full_path.rsplit('.', 1)[-1] == "jpg"
]
fishbase_images = [
    full_path
    for root, _dirs, filenames in os.walk(fishbase_path)
    for full_path in [os.path.join(root, fname) for fname in filenames]
    if full_path.rsplit('.', 1)[-1] in {"jpg", "png", "jpeg"}
]

In [17]:
# Create the train/test x fish/not_fish output tree for the classifier dataset.
# One loop replaces four copy-pasted guards; exist_ok=True keeps the cell
# re-runnable without a separate exists() check.
for dataset_dir in (
    "is_fish_images/test/fish",
    "is_fish_images/test/not_fish",
    "is_fish_images/train/fish",
    "is_fish_images/train/not_fish",
):
    os.makedirs(dataset_dir, exist_ok=True)

In [14]:
# Generate small images of fish
# Each FishBase image is assigned to train (draws 0-3) or test (draw 4), stretched
# to 32x32 and written to is_fish_images/<split>/fish/<file name>.
# NOTE(review): the split is unseeded, so a run from an empty output folder
# produces a different train/test assignment each time.
for path in fishbase_images:
    train = (random.randint(0, 4) != 4)
    file_name = path.split('/')[-1]
    if train:
        save_path = "is_fish_images/train/fish/" + file_name
        alt_path = "is_fish_images/test/fish/" + file_name
    else:
        save_path = "is_fish_images/test/fish/" + file_name
        alt_path = "is_fish_images/train/fish/" + file_name
    # Skip files already placed in either split by an earlier run.
    # NOTE(review): only the file name is kept, so same-named files from different
    # species folders collide and only the first one is written.
    if os.path.exists(save_path) or os.path.exists(alt_path):
        continue

    img = cv2.imread(path, cv2.IMREAD_COLOR)
    # cv2.imread returns None for files it cannot decode instead of raising, so
    # check for that explicitly rather than relying on resize failing inside a
    # bare except.
    if img is None:
        print(f"{path} resize failed. Skipping...")
        continue
    try:
        img_stretch = cv2.resize(img, (32, 32))
    except cv2.error:
        print(f"{path} resize failed. Skipping...")
        continue

    cv2.imwrite(save_path, img_stretch)

    


fishbase_images/1261/acxan_u3.jpg resize failed. Skipping...
fishbase_images/457/1378126376_50.150.156.71.jpg resize failed. Skipping...


In [18]:
# Copy cifar images to not_fish directory
# Each image is assigned to train (draws 0-3) or test (draw 4), resized to 32x32
# and saved as <parent folder>_<file name> so names from different folders stay
# distinct.
for path in cifar_images:
    train = (random.randint(0, 4) != 4)
    path_parts = path.split('/')
    file_name = path_parts[-2] + "_" + path_parts[-1]
    if train:
        save_path = "is_fish_images/train/not_fish/" + file_name
        alt_path = "is_fish_images/test/not_fish/" + file_name
    else:
        save_path = "is_fish_images/test/not_fish/" + file_name
        alt_path = "is_fish_images/train/not_fish/" + file_name
    # Skip files already placed in either split by an earlier run.
    if os.path.exists(save_path) or os.path.exists(alt_path):
        continue

    img = cv2.imread(path, cv2.IMREAD_COLOR)
    # cv2.imread returns None for unreadable files; without this check a single
    # bad file made cv2.resize raise and aborted the whole loop. Skipping with a
    # message matches how the fish-image loop treats unreadable files.
    if img is None:
        print(f"{path} resize failed. Skipping...")
        continue
    img = cv2.resize(img, (32, 32))
    cv2.imwrite(save_path, img)

In [30]:
# Scratch cell: draws one integer with random.randint(0, 5); no visible cell uses the result.
# NOTE(review): randint's bounds are inclusive, so 0..5 cannot yield the recorded
# output of 6 - the saved output looks stale. Re-run or delete this cell.
random.randint(0, 5)

6

- Layers: is it a fish? Filter out extremely low-probability images (e.g. https://www.fishbase.se/photos/thumbnailssummary.php?ID=1261)
- Take a curriculum-learning-like approach for Google Images? (Train an initial model without them, then get confidences for the Google images.)
- Mask images with blue filters during augmentation
NOTE: will need to understand the GPU limit, how best to train, and the best input size


In [12]:
# Closes any open OpenCV GUI windows.
# NOTE(review): no visible cell opens a window (there is no cv2.imshow call), so
# this looks like a leftover - confirm before removing.
cv2.destroyAllWindows()