In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score

In [10]:
df = pd.read_csv("fisheries_data_ml_ready.csv")
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Production,Production_Lag1,Production_Lag2,Production_Lag3,YoY_Change,Rolling_Mean_3Y
0,Afghanistan,AFG,Total fisheries production (metric tons),ER.FSH.PROD.MT,1963,300.0,300.0,300.0,200.0,0.0,300.0
1,Afghanistan,AFG,Total fisheries production (metric tons),ER.FSH.PROD.MT,1964,300.0,300.0,300.0,300.0,0.0,300.0
2,Afghanistan,AFG,Total fisheries production (metric tons),ER.FSH.PROD.MT,1965,300.0,300.0,300.0,300.0,0.0,300.0
3,Afghanistan,AFG,Total fisheries production (metric tons),ER.FSH.PROD.MT,1966,300.0,300.0,300.0,300.0,0.0,300.0
4,Afghanistan,AFG,Total fisheries production (metric tons),ER.FSH.PROD.MT,1967,400.0,300.0,300.0,300.0,0.333333,333.333333


Data Validation

In [21]:
# Dataset Overview
print("Dataset Shape:", df.shape)
df.info()

Dataset Shape: (15906, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15906 entries, 0 to 15905
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country Name     15906 non-null  object 
 1   Country Code     15906 non-null  object 
 2   Indicator Name   15906 non-null  object 
 3   Indicator Code   15906 non-null  object 
 4   Year             15906 non-null  int64  
 5   Production       15906 non-null  float64
 6   Production_Lag1  15906 non-null  float64
 7   Production_Lag2  15906 non-null  float64
 8   Production_Lag3  15906 non-null  float64
 9   YoY_Change       14745 non-null  float64
 10  Rolling_Mean_3Y  15906 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.3+ MB


In [23]:
# Missing Values Check
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
Country Name          0
Country Code          0
Indicator Name        0
Indicator Code        0
Year                  0
Production            0
Production_Lag1       0
Production_Lag2       0
Production_Lag3       0
YoY_Change         1161
Rolling_Mean_3Y       0
dtype: int64


In [25]:
# Duplicate Check
print("Duplicate Rows:", df.duplicated().sum())

Duplicate Rows: 0


In [27]:
# Year Sorting Check (Time-Series Important)
if not df["Year"].is_monotonic_increasing:
    print("⚠ Dataset is not sorted by Year")
else:
    print("Dataset is properly sorted by Year")

⚠ Dataset is not sorted by Year


In [29]:
# Negative Production Check
if (df["Production"] < 0).any():
    print("⚠ Negative Production values detected")
else:
    print("No negative production values found")

No negative production values found


In [31]:
# Production Statistics
print(df["Production"].describe())

count    1.590600e+04
mean     4.274953e+06
std      1.733783e+07
min      0.000000e+00
25%      3.700000e+03
50%      5.607351e+04
75%      8.534914e+05
max      2.169865e+08
Name: Production, dtype: float64


In [33]:
# Outlier Detection (IQR Method)
Q1 = df["Production"].quantile(0.25)
Q3 = df["Production"].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df["Production"] < lower) | (df["Production"] > upper)]

print("Number of Outliers:", len(outliers))

Number of Outliers: 2838
