# Github Setup

In [None]:
!git clone https://github.com/vpaulino26/fish-predict

Cloning into 'fish-predict'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 28 (delta 5), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (28/28), 23.75 MiB | 10.97 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [None]:
%cd fish-predict
%cd datasets
!unzip 22560_NEFSCFallFisheriesIndependentBottomTrawlData.zip

/content/fish-predict
/content/fish-predict/datasets
Archive:  22560_NEFSCFallFisheriesIndependentBottomTrawlData.zip
  inflating: 22560_SVDBS_CRUISES.csv  
  inflating: 22560_UNION_FSCS_SVBIO.csv  
  inflating: 22560_UNION_FSCS_SVCAT.csv  
  inflating: 22560_UNION_FSCS_SVLEN.csv  
  inflating: 22560_UNION_FSCS_SVSTA.csv  


# Make DataFrames

### We only need to worry about the SVCAT and SVSTA files

In [None]:
import pandas as pd

svc_cat_df = pd.read_csv('22560_UNION_FSCS_SVCAT.csv', dtype={'CRUISE STRATUM': str, 'STATUS_CODE': str, 'CATCH_COMMENT': str})
display(svc_cat_df.head())

  svc_cat_df = pd.read_csv('22560_UNION_FSCS_SVCAT.csv', dtype={'CRUISE STRATUM': str, 'STATUS_CODE': str, 'CATCH_COMMENT': str})


Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,SVSPP,CATCHSEX,EXPCATCHNUM,EXPCATCHWT,SCIENTIFIC_NAME,CATCH_COMMENT
0,196307,6307,1260,1,1,10,196307012600010001,15,0,10.0,19.5,Squalus acanthias (spiny dogfish),
1,196307,6307,1260,1,1,10,196307012600010001,23,0,53.0,142.0,Leucoraja ocellata (winter skate),
2,196307,6307,1260,1,1,10,196307012600010001,26,0,11.0,7.7,Leucoraja erinacea (little skate),
3,196307,6307,1260,1,1,10,196307012600010001,31,0,1.0,0.0,Etrumeus teres (round herring),
4,196307,6307,1260,1,1,10,196307012600010001,33,0,2.0,0.5,Alosa pseudoharengus (alewife),


In [None]:
sv_sta_df = pd.read_csv('22560_UNION_FSCS_SVSTA.csv', encoding='latin-1')
display(sv_sta_df.head())

  sv_sta_df = pd.read_csv('22560_UNION_FSCS_SVSTA.csv', encoding='latin-1')


Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,STATYPE,HAUL,GEARCOND,...,SURFTEMP,SURFSALIN,BOTTEMP,BOTSALIN,FULD,NO_DETAIL_SVSPP,BOTSPEED,WATCH_CHIEF_COMMENTS,STATION_COMMENTS,HABITAT_COMMENTS
0,196307,6307,1260,1,1,10,196307012600010001,1.0,2.0,3.0,...,11.0,,10.1,,,,0.0,,,
1,196307,6307,1260,2,2,10,196307012600020002,1.0,3.0,5.0,...,9.9,,9.9,,,,0.0,,,
2,196307,6307,1260,3,3,10,196307012600030003,1.0,2.0,3.0,...,9.9,,8.1,,,,0.0,,,
3,196307,6307,1260,4,4,10,196307012600040004,1.0,3.0,5.0,...,10.2,,10.1,,,,0.0,,,
4,196307,6307,1260,5,5,10,196307012600050005,1.0,2.0,3.0,...,10.0,,9.4,,,,0.0,,,


# Clean DataFrames

These are the top 10 commercial species in RI we want to focus on:
1. LONGFIN SQUID
2. SEA SCALLOP
3. AMERICAN LOBSTER
4. SHORTFIN SQUID (ILLEX)
5. SUMMER FLOUNDER
6. QUAHOG
7. SCUP
8. JONAH CRAB
9. BUTTERFISH
10. SILVER HAKE

Notes:
- the SVCCAT sheet has 'northern shortfin squid' instead of shortfin squid (illex)

### Clean SVCAT

In [None]:
# filter out non top species
top_species = ['LONGFIN SQUID', 'SEA SCALLOP', 'AMERICAN LOBSTER', 'SHORTFIN SQUID', 'SUMMER FLOUNDER', 'QUAHOG', 'SCUP', 'JONAH CRAB', 'BUTTERFISH', 'SILVER HAKE']

# Step 3: Filter only rows where the Species column contains any of these names
# (case-insensitive and matches even if common/scientific names are combined)
filtered_svcat = svc_cat_df[svc_cat_df["SCIENTIFIC_NAME"].apply(
    lambda x: any(SCIENTIFIC_NAME.lower() in str(x).lower() for SCIENTIFIC_NAME in top_species)
)]

In [None]:
# now see if it worked
# find a row that has the any of the top species in the scientific_name column by changing what's inside the quotes in the .contains("")
test_species_rows = filtered_svcat[filtered_svcat["SCIENTIFIC_NAME"].str.contains("silver hake", case=False)]
display(test_species_rows)

# it works! yay

Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,SVSPP,CATCHSEX,EXPCATCHNUM,EXPCATCHWT,SCIENTIFIC_NAME,CATCH_COMMENT
5,196307,6307,1260,1,1,10,196307012600010001,72,0,52.0,9.500,Merluccius bilinearis (silver hake),
27,196307,6307,1260,2,2,10,196307012600020002,72,0,35.0,18.100,Merluccius bilinearis (silver hake),
36,196307,6307,1260,3,3,10,196307012600030003,72,0,164.0,10.400,Merluccius bilinearis (silver hake),
56,196307,6307,1260,4,4,10,196307012600040004,72,0,7.0,3.200,Merluccius bilinearis (silver hake),
69,196307,6307,1260,5,5,10,196307012600050005,72,0,534.0,60.800,Merluccius bilinearis (silver hake),
...,...,...,...,...,...,...,...,...,...,...,...,...,...
338043,202404,202404,3610,3,808,10,202404036100030808,72,0,268.0,26.938,Merluccius bilinearis (silver hake),
338073,202404,202404,1260,2,810,10,202404012600020810,72,0,743.0,92.997,Merluccius bilinearis (silver hake),
338101,202404,202404,1260,4,812,10,202404012600040812,72,0,29.0,4.268,Merluccius bilinearis (silver hake),
338129,202404,202404,1260,6,814,10,202404012600060814,72,0,4992.0,538.639,Merluccius bilinearis (silver hake),


In [None]:
# drop columns we won't use
filtered_svcat = filtered_svcat.drop(columns=['CATCHSEX', 'CATCH_COMMENT'])
display(filtered_svcat.head())

Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,SVSPP,EXPCATCHNUM,EXPCATCHWT,SCIENTIFIC_NAME
5,196307,6307,1260,1,1,10,196307012600010001,72,52.0,9.5,Merluccius bilinearis (silver hake)
27,196307,6307,1260,2,2,10,196307012600020002,72,35.0,18.1,Merluccius bilinearis (silver hake)
36,196307,6307,1260,3,3,10,196307012600030003,72,164.0,10.4,Merluccius bilinearis (silver hake)
50,196307,6307,1260,3,3,10,196307012600030003,503,7.0,0.0,Loligo pealeii (longfin squid)
56,196307,6307,1260,4,4,10,196307012600040004,72,7.0,3.2,Merluccius bilinearis (silver hake)


### Clean SVSTA

In [17]:
sv_sta_df

Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,STATYPE,HAUL,GEARCOND,...,SURFTEMP,SURFSALIN,BOTTEMP,BOTSALIN,FULD,NO_DETAIL_SVSPP,BOTSPEED,WATCH_CHIEF_COMMENTS,STATION_COMMENTS,HABITAT_COMMENTS
0,196307,6307,1260,1,1,10,196307012600010001,1.0,2.0,3.0,...,11.00,,10.10,,,,0.0,,,
1,196307,6307,1260,2,2,10,196307012600020002,1.0,3.0,5.0,...,9.90,,9.90,,,,0.0,,,
2,196307,6307,1260,3,3,10,196307012600030003,1.0,2.0,3.0,...,9.90,,8.10,,,,0.0,,,
3,196307,6307,1260,4,4,10,196307012600040004,1.0,3.0,5.0,...,10.20,,10.10,,,,0.0,,,
4,196307,6307,1260,5,5,10,196307012600050005,1.0,2.0,3.0,...,10.00,,9.40,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21818,202404,202404,03610,3,808,10,202404036100030808,1.0,1.0,1.0,...,11.47,32.323,9.74,32.696,,,3.0,,,
21819,202404,202404,01260,2,810,10,202404012600020810,1.0,1.0,1.0,...,11.44,32.405,8.08,33.121,,,2.9,,,
21820,202404,202404,01260,4,812,10,202404012600040812,1.0,4.0,9.0,...,11.41,32.239,11.18,32.333,,,2.8,,,
21821,202404,202404,01260,6,814,10,202404012600060814,1.0,1.0,1.0,...,12.00,32.005,10.40,32.383,,,2.9,,,


In [18]:
# Drop the specified columns from the sv_sta_df DataFrame
columns_to_drop = [
    'TYPE_CODE', 'TYPE_COMMENT', 'OPERATION_CODE', 'OPERATION_COMMENT',
    'GEAR_CODE', 'GEAR_COMMENT', 'ACQUISITION_CODE', 'ACQUISITION_COMMENT',
    'OTHGEAR', 'WATCH_CHIEF_COMMENTS', 'STATION_COMMENTS', 'HABITAT_COMMENTS'
]
sv_sta_df = sv_sta_df.drop(columns=columns_to_drop)

# Display the head of the modified DataFrame
display(sv_sta_df.head())

Unnamed: 0,CRUISE6,CRUISE,STRATUM,TOW,STATION,STATUS_CODE,ID,STATYPE,HAUL,GEARCOND,...,TRASHBIO,TRASHSUB,XBT,SURFTEMP,SURFSALIN,BOTTEMP,BOTSALIN,FULD,NO_DETAIL_SVSPP,BOTSPEED
0,196307,6307,1260,1,1,10,196307012600010001,1.0,2.0,3.0,...,,,3.0,11.0,,10.1,,,,0.0
1,196307,6307,1260,2,2,10,196307012600020002,1.0,3.0,5.0,...,,,3.0,9.9,,9.9,,,,0.0
2,196307,6307,1260,3,3,10,196307012600030003,1.0,2.0,3.0,...,,,3.0,9.9,,8.1,,,,0.0
3,196307,6307,1260,4,4,10,196307012600040004,1.0,3.0,5.0,...,,,3.0,10.2,,10.1,,,,0.0
4,196307,6307,1260,5,5,10,196307012600050005,1.0,2.0,3.0,...,,,3.0,10.0,,9.4,,,,0.0


We will likely drop more columns from SVSTA as we discover which columns have the most entropy