In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score

In [14]:
df = pd.read_csv('Final_fisheries_dataset-1.csv')
df.head()

Unnamed: 0,country,year,biomass_relative_to,SST,Trawling_Hours,country_encoded
0,Atlantic halibut Gulf of Maine / Georges Bank,1800,19795920,27.11,5638.58,100
1,Atlantic halibut Gulf of Maine / Georges Bank,1801,19795920,25.6,4840.09,100
2,Atlantic halibut Gulf of Maine / Georges Bank,1802,19795920,25.2,4709.12,100
3,Atlantic halibut Gulf of Maine / Georges Bank,1803,19795920,26.06,5335.08,100
4,Atlantic halibut Gulf of Maine / Georges Bank,1804,19775510,25.14,4890.73,100


Data Validation

In [17]:
# Dataset Overview
print("Dataset Shape:", df.shape)
df.info()

Dataset Shape: (54235, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54235 entries, 0 to 54234
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country              54235 non-null  object 
 1   year                 54235 non-null  int64  
 2   biomass_relative_to  54235 non-null  int64  
 3   SST                  54235 non-null  float64
 4   Trawling_Hours       54235 non-null  float64
 5   country_encoded      54235 non-null  int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 2.5+ MB


In [19]:
# Missing Values Check
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
country                0
year                   0
biomass_relative_to    0
SST                    0
Trawling_Hours         0
country_encoded        0
dtype: int64


In [21]:
# Duplicate Check
print("Duplicate Rows:", df.duplicated().sum())

Duplicate Rows: 0


In [65]:
#checkin the columns
print(df.columns.tolist())

['country', 'year', 'biomass_relative_to', 'sst', 'trawling_hours', 'country_encoded']


In [35]:
df["year"]

0        1800
1        1801
2        1802
3        1803
4        1804
         ... 
54230    2018
54231    2018
54232    2018
54233    2018
54234    2018
Name: year, Length: 54235, dtype: int64

In [37]:
# Clean column names
df.columns = df.columns.str.strip().str.lower()

print("Columns:", df.columns)

# Convert year to numeric
df["year"] = pd.to_numeric(df["year"], errors="coerce")

# Sort
df = df.sort_values("year").reset_index(drop=True)

# Check sorting
if df["year"].is_monotonic_increasing:
    print("Dataset is properly sorted by year")
else:
    print("⚠ Dataset is not sorted by year")

Columns: Index(['country', 'year', 'biomass_relative_to', 'sst', 'trawling_hours',
       'country_encoded'],
      dtype='object')
Dataset is properly sorted by year


In [41]:
print(df.columns.tolist())

['country', 'year', 'biomass_relative_to', 'sst', 'trawling_hours', 'country_encoded']


In [45]:
df.columns = df.columns.str.strip().str.lower()
print(df.columns)

Index(['country', 'year', 'biomass_relative_to', 'sst', 'trawling_hours',
       'country_encoded'],
      dtype='object')


In [57]:
df.columns = df.columns.str.strip().str.lower()

print("Available columns:")
for col in df.columns:
    print("-", col)

Available columns:
- country
- year
- biomass_relative_to
- sst
- trawling_hours
- country_encoded


In [59]:
df.columns = df.columns.str.strip().str.lower()

if "production" in df.columns:
    production_col = "production"
elif "total" in df.columns:
    production_col = "total"
else:
    print("⚠ Could not automatically detect production column.")
    production_col = None

if production_col:
    if (df[production_col] < 0).any():
        print("⚠ Negative production values detected")
    else:
        print("No negative production values found")

⚠ Could not automatically detect production column.


In [31]:
# Production Statistics
print(df["Production"].describe())

count    1.590600e+04
mean     4.274953e+06
std      1.733783e+07
min      0.000000e+00
25%      3.700000e+03
50%      5.607351e+04
75%      8.534914e+05
max      2.169865e+08
Name: Production, dtype: float64


In [63]:
# Clean column names
df.columns = df.columns.str.strip().str.lower()

# Show numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("Numeric Columns:", numeric_cols)

# Choose the correct production column manually after seeing the print
production_col = numeric_cols[-1]  # usually last numeric column is total production

print("Using column:", production_col)

# IQR Outlier Detection
Q1 = df[production_col].quantile(0.25)
Q3 = df[production_col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[production_col] < lower) | (df[production_col] > upper)]

print("Number of Outliers:", len(outliers))

Numeric Columns: Index(['year', 'biomass_relative_to', 'sst', 'trawling_hours',
       'country_encoded'],
      dtype='object')
Using column: country_encoded
Number of Outliers: 0
