# Setting up the database for training the model

This short tutorial runs through the necessary steps to set up the Koster database.

In [25]:
import pandas as pd

In [180]:
# Enter the path to movie files

In [3]:
# Directory that holds the movie files. It is listed in step 1 and passed to
# static.py (--movies_path) and add.add_new_movies() further down.
movies_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/movies"

## 0. Video pre-processing (If your video is sufficiently compressed already and does not contain any sensitive information, you may skip this step)

In [178]:
%cd "/your/movie/directory"

[Errno 2] No such file or directory: '/your/movie/directory'
/usr/src/app/koster_data_management/db_setup


In [179]:
# This script will loop through the movie files you provided, compress them and blur areas
# with potential sensitive information
# NOTE(review): the script path below is relative, so this cell only works when the current
# working directory contains ingestion_scripts/. The preceding %cd moves into the movie
# directory -- confirm which directory this cell is meant to run from.
!bash ingestion_scripts/add_blur.sh /your/movie/directory

In [None]:
# Now link this folder symbolically to the rest of the movies in the Koster Library

In [None]:
!ln -s /your/movie/directory/* movies_path

In [None]:
# Enter the correct path to the database preparation scripts

In [1]:
%cd "/usr/src/app/koster_data_management/db_setup"

/usr/src/app/koster_data_management/db_setup


## 1. Ensure all movie files are uploaded to the movie directory on the SNIC server

In [4]:
!ls $movies_path

'000114 TMBL-ROV 2000 Säckenrevet EJ numrerade band.mov'
'000114 TMBL-ROV 2000 Säckenrevet Tape 55.mov'
'000203 TMBL-ROV 2000 Säcken revet EJ numrerade band.mov'
'000203 TMBL-ROV 2000 Säcken EJ numrerade band.mov'
'000203 TMBL-ROV 2000 Säckenrevet Tape 56.mov'
'000203 TMBL-ROV 2000 säcken Tape 56.mov'
'010424 Säckenrevet alfa Tape 74.mov'
'010424 Säckenrevet beta Tape 74.mov'
 01112003.mp4
 01112003_orig.mov
 01112004.mp4
 01112004_orig.mov
 01112005.mp4
 01112005_orig.mov
 01179001.MOV
 01180001.MOV
 01181001.MOV
 01182001.MOV
 01182002.MOV
 01182005.MOV
 01182007.MOV
 01448002.mov
 01448005.mov
 01448006.mov
 01450003.mov
 01450004.mov
 01451001.mov
 01451004.mov
 01451005.mov
 01451006.mov
 01451007.mov
 01451008_1.mov
 01451009.mov
 01451010.mov
 01451011.mov
 01451011_1.mov
 01453002.mov
 01453003.mov
 01453004.mov
 01453005.mov
 01454001.mov
 01454002.mov
 01454003.mov
 01454004.mov
 01454005.mov
 01454006_1.mov
 01455001.mov
 01455

In [227]:
# Define a path for the db to be stored (example /cephyr/NOBACKUP/groups/snic2021-6-9/db_files)
# Note: you should have permissions to write to this location

In [5]:
db_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/db_files/tutorial_demo.db"

## 2. Setup the db file by using the batch script that creates the initial database from the original movies (skip to step 3 if only adding to existing database)

In [6]:
# Zooniverse credentials

In [7]:
# Specify username and password of a valid zooniverse account.
# The values are prompted for instead of being typed into the cell, so the credentials
# are never stored in the notebook file (and the password is not echoed while typing).
import getpass

user_zoo = input("Zooniverse username: ")
pass_zoo = getpass.getpass("Zooniverse password: ")

In [260]:
# Define the input csv files and suitable thresholds

In [8]:
# Specify Google Drive links of the csv files with information about the species choices,
# original movies and duplicated clips.
# Note: despite the `_file_id` suffix, each variable holds the full sharing URL.
# Species choices -> passed to static.py as --species_file_id
sp_file_id = "https://drive.google.com/file/d/1dnueH3BjJrMK8buVjfyFbxfu0E-5dX7Z/view?usp=sharing"
# Original movies -> passed to static.py as --movies_file_id
mv_file_id = "https://drive.google.com/file/d/1xYcmMUjAawnYIyti9QNTs-oBf8XshJvs/view?usp=sharing"
# Duplicated clips -> passed to the subjects/clips/frames scripts as --duplicates_file_id
dp_file_id = "https://drive.google.com/file/d/1z72CqTtEBtqk6936H1YNrCjc5NRopF0g/view?usp=sharing"

In [9]:
# Frame thresholds
# Both values are handed to process_frames.py in step 4 (as -obj and -eps respectively);
# their exact meaning is defined by that script.
obj_thresh = 0.8
eps_thresh = 0.5

In [10]:
# Specify the Zooniverse workflows of interest and their versions
# NOTE(review): these four values are not referenced by any later cell visible in this
# notebook -- confirm whether the step 2 scripts are expected to receive them.
workflow_clip = 11767
workflow_clip_version = 227
workflow_frame = 12852
workflow_frame_version = 21.85

# Specify the agreement threshold required among citizen scientists
# (passed in step 4 as -thr to process_clips.py and -iua to process_frames.py)
agg_user_clip = 0.5
agg_user_frames = 0.5

# Specify the min number of different Zooniverse users required per subject
# (passed in step 4 as -nu to process_clips.py and process_frames.py)
min_users_clip = 3
min_users_frames = 5

In [11]:
## Run the setup script to populate the db with original movies

In [12]:
# Remove current db (optional)
!rm $db_path

In [17]:
# The scripts below are run in order against the same db file. `%run -i` executes each
# script inside the notebook's own namespace, so names defined by a script remain
# available to later cells.
# NOTE(review): the Zooniverse credentials are passed on the command line, and none of the
# thresholds/workflow settings defined above are passed here -- confirm the scripts' defaults.

# Initiate the db
%run -i "init.py" --db_path $db_path

# Populate the db with info from the csv files
%run -i "static.py" --species_file_id $sp_file_id --movies_file_id $mv_file_id --db_path $db_path --movies_path $movies_path

# Populate the db with info of subjects uploaded to Zooniverse
%run -i "subjects_uploaded.py" --user $user_zoo --password $pass_zoo --db_path $db_path --duplicates_file_id $dp_file_id

# Process the clips that have been classified in Zooniverse
%run -i "process_clips.py" --user $user_zoo --password $pass_zoo --db_path $db_path --duplicates_file_id $dp_file_id

# Process the frames that have been classified in Zooniverse
%run -i "process_frames.py" --user $user_zoo --password $pass_zoo --db_path $db_path --duplicates_file_id $dp_file_id

Updated sites
Updated movies
Updated species
7305
Updated subjects
Updated agg_annotations_clip
Updated agg_annotations_frame
Frame Aggregation Complete: 1350 annotations added


In [18]:
from utils import summary_utils

In [19]:
summary_utils.clips_summary(db_path)

Unnamed: 0_level_0,species_id,how_many
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Bivalves (any species),1,1.0
Black brittle star,15,51.0
Common sea pen,8,8.0
Common sea urchin,21,21.0
Common sunstar,1,1.0
Coral (any species),3,9.0
Crustacean (any species),19,19.0
Cushin star,15,15.0
Deep sea king crab,43,47.0
Deep water coral,163,361.0


## 3. Add new species and/or movies to initial database

In [20]:
import add

In [21]:
# New movies
add.add_new_movies("https://drive.google.com/file/d/1IBBm4GqZGUZvnVJ3DbIHwJ_JQQW6sCEl/view?usp=sharing", 
                   db_path,
                   movies_path)

Updated movies


In [22]:
add.add_new_movies("https://drive.google.com/file/d/19VlIQo749P__8EO-ZAL89Jk4m-1aM-s0/view?usp=sharing",
                   db_path,
                   movies_path)

Updated movies


In [23]:
add.add_new_movies("https://drive.google.com/file/d/161gRRNivEUJnkLCh7Wea7f0YXEyEJ6cv/view?usp=sharing",
                   db_path,
                   movies_path)

Updated movies


In [276]:
# New species

In [24]:
add.add_species("https://drive.google.com/file/d/18_5h4fzX7zau-JltIRPoFrltJm_cbxyb/view?usp=sharing",
                db_path)

Updated species


In [26]:
# db_utils is not imported anywhere else in this notebook; import it explicitly so this
# cell does not depend on the name having leaked in from an earlier `%run -i` script.
# NOTE(review): module path assumed to mirror `from utils import summary_utils` -- confirm.
from utils import db_utils

# Open a connection to the db and load the whole movies table for inspection
conn = db_utils.create_connection(db_path)
movies_table = pd.read_sql_query("SELECT * FROM movies", conn)

In [27]:
movies_table.head()

Unnamed: 0,id,filename,created_on,fps,duration,author,site_id,fpath
0,1,000114 TMBL-ROV 2000 Säckenrevet EJ numrerade...,14/01/2000,25.0,2836.84,,1,/cephyr/NOBACKUP/groups/snic2021-6-9/movies/00...
1,2,000114 TMBL-ROV 2000 Säckenrevet Tape 55,14/01/2000,25.0,9534.16,,1,/cephyr/NOBACKUP/groups/snic2021-6-9/movies/00...
2,3,000203 TMBL-ROV 2000 Säcken EJ numrerade band,03/02/2000,25.0,629.44,,1,/cephyr/NOBACKUP/groups/snic2021-6-9/movies/00...
3,4,000203 TMBL-ROV 2000 Säcken revet EJ numrerad...,03/02/2000,,,,1,/cephyr/NOBACKUP/groups/snic2021-6-9/movies/00...
4,5,000203 TMBL-ROV 2000 säcken Tape 56,03/02/2000,25.0,2392.0,,1,/cephyr/NOBACKUP/groups/snic2021-6-9/movies/00...


## 4. Enrich original database file with new subject sets (frames) from different species 

In [278]:
# Specify workflow id and workflow versions to be incorporated. Note that any versions higher than the specified
# version will be included for a specific workflow
# clips_zoo_workflow / clips_zoo_workflow_version are passed to process_clips.py as -zw / -zwv;
# frames_zoo_workflow_version is passed to process_frames.py as -zwv (no frames workflow id is set here).
clips_zoo_workflow = 17719
clips_zoo_workflow_version = 0
frames_zoo_workflow_version = 31

In [250]:
# Run the aggregation script
run = f"python process_clips.py -u {user_zoo} -p {pass_zoo} -db {db_path} -thr {agg_user_clip} \
     -nu {min_users_clip} -zw {clips_zoo_workflow} -zwv {clips_zoo_workflow_version} \
     -du https://drive.google.com/file/d/1z72CqTtEBtqk6936H1YNrCjc5NRopF0g/view?usp=sharing"
!{run}

Updated subjects
Updated agg_annotations_clip


In [251]:
summary_utils.clips_summary(db_path)

Unnamed: 0_level_0,species_id,how_many
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Bivalves (any species),2,2.0
Black brittle star,15,51.0
Cnidarian (any species),3,9.0
Common sea pen,8,8.0
Common sea urchin,21,21.0
Common sunstar,1,1.0
Coral (any species),3,9.0
Crustacean (any species),27,28.0
Cushin star,15,15.0
Dahlia anemone,5,6.0


In [252]:
# Run the aggregation script
run = f"python process_frames.py -u {user_zoo} -p {pass_zoo} -db {db_path} -obj {obj_thresh} \
     -zwv {frames_zoo_workflow_version} -eps {eps_thresh} -iua {agg_user_frames} -nu {min_users_frames} \
     -du https://drive.google.com/file/d/1z72CqTtEBtqk6936H1YNrCjc5NRopF0g/view?usp=sharing"
!{run}

UNIQUE constraint failed: subjects.id
Updated subjects
UNIQUE constraint failed: agg_annotations_frame.species_id, agg_annotations_frame.x_position, agg_annotations_frame.y_position, agg_annotations_frame.width, agg_annotations_frame.height, agg_annotations_frame.subject_id
Updated agg_annotations_frame
Frame Aggregation Complete: 1206 annotations added


In [253]:
# db_utils is not imported anywhere else in this notebook; import it explicitly so this
# cell also works when step 2 (whose `%run -i` scripts may leak the name) was skipped.
# NOTE(review): module path assumed to mirror `from utils import summary_utils` -- confirm.
from utils import db_utils

conn = db_utils.create_connection(db_path)

In [254]:
frame_data = pd.read_sql_query("SELECT * FROM agg_annotations_frame", conn)

In [255]:
frame_data.species_id.value_counts()

5     702
8     551
25     69
2      28
Name: species_id, dtype: int64

## 5. Move on to training_tutorial.ipynb for model training

In [None]:
# END