In [None]:
# WSSV Outbreak Data Exploration

This notebook explores the data for predicting White Spot Syndrome Virus (WSSV) outbreaks in Southeast Asian shrimp aquaculture.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Set up visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline

# Define data directory paths
ROOT_DIR = Path().resolve().parents[0]
RAW_DATA_DIR = ROOT_DIR / "data" / "raw"

# Display the current working directory and available data files
print(f"Working directory: {ROOT_DIR}")
print(f"Raw data directory: {RAW_DATA_DIR}")

if RAW_DATA_DIR.exists():
    data_files = list(RAW_DATA_DIR.glob("*.csv"))
    print(f"Available data files: {[f.name for f in data_files]}")
else:
    print("Raw data directory does not exist. Creating sample data...")
    # Import the data_collection module to create sample data
    import sys
    sys.path.append(str(ROOT_DIR / "src" / "data"))
    from data_collection import create_sample_data
    
    # Create sample data
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
    sample_data = create_sample_data()
    data_files = list(RAW_DATA_DIR.glob("*.csv"))


In [None]:
# Load the dataset
# Fail here, with a clear message, instead of letting every later cell die
# with a NameError on `df` when no CSV file is available.
if not data_files:
    raise FileNotFoundError(
        f"No CSV files found in {RAW_DATA_DIR}; cannot load the dataset."
    )

# Load the first available data file
file_path = data_files[0]
df = pd.read_csv(file_path)
print(f"Loaded data from {file_path.name} with {df.shape[0]} rows and {df.shape[1]} columns")

# Display the first few rows. This must be the cell's last top-level
# expression: Jupyter does not render an expression nested inside an `if`.
df.head()


In [None]:
# Data Overview and Summary Statistics

# Headline dimensions (rows are reported as samples, columns as features),
# followed by the heading for the dtype listing below.
for overview_line in (
    "Dataset information:",
    f"- Number of samples: {df.shape[0]}",
    f"- Number of features: {df.shape[1]}",
    "\nData types:",
):
    print(overview_line)

# Kept as the final expression so Jupyter renders the per-column dtypes.
df.dtypes


In [None]:
# 3. Summary statistics for numeric variables
# describe() is called with its defaults; the result is named and then left
# as the last expression so Jupyter renders it as a table.
numeric_summary = df.describe()
numeric_summary


In [None]:
# 4. Check for missing values
# Per-column null count, and that count as a percentage of all rows.
null_mask = df.isnull()
missing_values = null_mask.sum()
missing_percent = missing_values / len(df) * 100

missing_df = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percent Missing': missing_percent,
    }
)

# Show only the columns that have at least one missing entry.
has_missing = missing_df['Missing Values'] > 0
missing_df.loc[has_missing]


In [None]:
# 5. Distribution of categorical variables
# For every object-dtype column: a bar chart of level frequencies, then the
# same information as count and percentage tables.
for col in df.select_dtypes(include=['object']).columns:
    # Compute the frequency tables once and reuse them for the plot and prints.
    counts = df[col].value_counts()
    # Scale to percent *before* rounding. Rounding the fraction and then
    # multiplying by 100 reintroduces float noise (e.g. 33.300000000000004).
    percentages = (df[col].value_counts(normalize=True) * 100).round(1)

    plt.figure(figsize=(10, 4))
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print(f"{col} value counts:")
    print(counts)
    print(f"{col} value percentages:")
    print(percentages)
    print("-" * 50)


In [None]:
# 6. Target variable distribution and analysis
# Count plot of the `wssv_outbreak` column on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='wssv_outbreak', ax=ax)
ax.set_title('WSSV Outbreak Distribution')
ax.set_xlabel('Outbreak Status (0 = No, 1 = Yes)')
plt.show()

# Print outbreak percentage.
# The column mean is used as the outbreak share; this relies on the 0/1
# coding stated in the axis label above.
outbreak_percent = df['wssv_outbreak'].mean() * 100
for summary_line in (
    f"Outbreak percentage: {outbreak_percent:.2f}%",
    f"Non-outbreak percentage: {100 - outbreak_percent:.2f}%",
):
    print(summary_line)


In [None]:
# Feature Analysis and Visualization

## 1. Distribution of key environmental variables
# (column, panel title) pairs, drawn left-to-right, top-to-bottom on a 2x3
# grid. Driving the figure from this list replaces six copy-pasted panel
# blocks, so adding or removing a variable is a one-line change.
environmental_panels = [
    ('water_temperature', 'Water Temperature Distribution'),
    ('salinity', 'Salinity Distribution'),
    ('ph', 'pH Distribution'),
    ('dissolved_oxygen', 'Dissolved Oxygen Distribution'),
    ('ammonia', 'Ammonia Distribution'),
    ('rainfall', 'Rainfall Distribution'),
]

fig, axes = plt.subplots(2, 3, figsize=(16, 12))
for ax, (column, title) in zip(axes.flat, environmental_panels):
    # Histogram with a KDE overlay for the variable.
    sns.histplot(df[column], kde=True, ax=ax)

    # Dashed red line marks the column mean; it is labelled so the figure is
    # readable on its own.
    column_mean = df[column].mean()
    ax.axvline(column_mean, color='red', linestyle='--', label=f'Mean = {column_mean:.2f}')
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# 2. Distribution of key farm management variables
# (column, panel title) pairs, drawn left-to-right, top-to-bottom on a 2x2
# grid. Driving the figure from this list replaces four copy-pasted panel
# blocks.
management_panels = [
    ('stocking_density', 'Stocking Density Distribution'),
    ('pond_size', 'Pond Size Distribution'),
    ('water_exchange_rate', 'Water Exchange Rate Distribution'),
    ('culture_duration', 'Culture Duration Distribution'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 8))
for ax, (column, title) in zip(axes.flat, management_panels):
    # Histogram with a KDE overlay for the variable.
    sns.histplot(df[column], kde=True, ax=ax)

    # Dashed red line marks the column mean; it is labelled so the figure is
    # readable on its own.
    column_mean = df[column].mean()
    ax.axvline(column_mean, color='red', linestyle='--', label=f'Mean = {column_mean:.2f}')
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# 3. Correlation Analysis
# Pairwise correlations (DataFrame.corr defaults) over the numeric columns.
numeric_cols = df.select_dtypes(include=['number'])
correlation = numeric_cols.corr()

# Plot heatmap on an explicitly created axes, annotated to two decimals.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    correlation,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Variables')
fig.tight_layout()
plt.show()

# The target's column of the matrix, sorted from most positive to most
# negative; later cells read `target_corr`.
target_corr = correlation['wssv_outbreak'].sort_values(ascending=False)
print("Features most correlated with WSSV outbreak:")
print(target_corr)


In [None]:
# 4. Feature Relationships with Target Variable

# Candidate features: every numeric column except the target itself and
# `outbreak_probability`.
# NOTE(review): the original comment says outbreak_probability is directly
# derived from the target, so it is excluded — confirm against the generator.
# errors='ignore' keeps this cell working on datasets without that column.
candidate_corr = target_corr.drop(
    ['wssv_outbreak', 'outbreak_probability'], errors='ignore'
)

# Top 3 features by correlation *strength*. Ranking on the absolute value
# keeps strong negative correlates in the running; ranking on the signed
# value would silently ignore them. `top_correlated` keeps the signed values.
strongest = candidate_corr.abs().nlargest(3).index
top_correlated = candidate_corr.loc[strongest]
top_features = top_correlated.index.tolist()

# Plot the relationship between top correlated features and target:
# one box plot per feature, split by outbreak status.
plt.figure(figsize=(18, 6))
for panel, feature in enumerate(top_features, start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='wssv_outbreak', y=feature, data=df)
    plt.title(f'{feature} vs WSSV Outbreak')
    plt.xlabel('WSSV Outbreak (0=No, 1=Yes)')

plt.tight_layout()
plt.show()

# Show mean values for each feature by outbreak status
print("Mean values by outbreak status:")
print(df.groupby('wssv_outbreak')[top_features].mean())


In [None]:
# 5. Categorical Variables Analysis

# Grouping columns to compare against the target (binary flags are treated
# as categories here as well).
categorical_cols = [
    'country',
    'season',
    'wssv_history',
    'probiotics_used',
    'antibiotics_used',
]

# One grouped bar chart per column, placed on a 3x2 grid.
fig = plt.figure(figsize=(20, 15))
for position, col in enumerate(categorical_cols, start=1):
    ax = fig.add_subplot(3, 2, position)

    # Row-normalised crosstab: within each level of `col`, the percentages
    # across outbreak statuses sum to 100.
    cross_tab = pd.crosstab(df[col], df['wssv_outbreak'], normalize='index') * 100
    cross_tab.plot(kind='bar', stacked=False, ax=ax)

    ax.set_title(f'Outbreak Rate by {col}')
    ax.set_ylabel('Percentage (%)')
    ax.set_xlabel(col)
    ax.legend(['No Outbreak (0)', 'Outbreak (1)'])
    plt.setp(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

# Same comparison as text: mean of the target per level, in percent.
for col in categorical_cols:
    outbreak_rate = df.groupby(col)['wssv_outbreak'].mean() * 100
    print(f"\nOutbreak rate by {col}:")
    print(outbreak_rate)


In [None]:
# 6. Season and Month Analysis

plt.figure(figsize=(14, 6))

# Season analysis: outbreak / non-outbreak row counts per season.
plt.subplot(1, 2, 1)
sns.countplot(x='season', hue='wssv_outbreak', data=df)
plt.title('Outbreak Count by Season')
plt.xlabel('Season')
plt.ylabel('Count')

# Month analysis: mean of the target per month, in percent.
plt.subplot(1, 2, 2)
monthly_outbreak = df.groupby('month')['wssv_outbreak'].mean() * 100
monthly_outbreak.plot(kind='bar')
plt.title('Outbreak Rate by Month')
plt.xlabel('Month')
plt.ylabel('Outbreak Rate (%)')

# Label each bar from the month value it actually represents. Hard-coding
# twelve positional labels mislabels the bars (and adds stray ticks) whenever
# a month is absent from the data.
# NOTE(review): assumes `month` is coded 1-12 — confirm. Any other value
# falls back to its own string form.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_lookup = dict(zip(range(1, 13), month_names))
tick_labels = [month_lookup.get(month, str(month)) for month in monthly_outbreak.index]
plt.xticks(range(len(monthly_outbreak)), tick_labels, rotation=45)

plt.tight_layout()
plt.show()

# Print outbreak rate by season
print("Outbreak rate by season:")
seasonal_outbreak = df.groupby('season')['wssv_outbreak'].mean() * 100
print(seasonal_outbreak)


In [None]:
# 7. Pairplot for Key Variables

# Columns shown in the grid; `wssv_outbreak` is included because it is used
# as the hue.
key_vars = [
    'water_temperature',
    'salinity',
    'dissolved_oxygen',
    'stocking_density',
    'wssv_history',
    'wssv_outbreak',
]

# Pairplot settings gathered in one place: colour by outbreak status, KDE on
# the diagonal, semi-transparent markers, 2.5-inch facets.
pairplot_options = dict(
    hue='wssv_outbreak',
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
    height=2.5,
)

sns.pairplot(df[key_vars], **pairplot_options)
# y > 1 places the title above the top row of facets.
plt.suptitle('Relationships Between Key Variables', y=1.02)
plt.show()


In [None]:
# 8. Country Analysis

# Outbreak rate (percent) per country, highest first, plus the overall rate
# used as the reference line.
country_outbreak = df.groupby('country')['wssv_outbreak'].mean().sort_values(ascending=False) * 100
overall_rate = df['wssv_outbreak'].mean() * 100

fig, ax = plt.subplots(figsize=(12, 6))
country_outbreak.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('WSSV Outbreak Rate by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Outbreak Rate (%)')
ax.axhline(overall_rate, color='red', linestyle='--', label='Average')
ax.legend()
fig.tight_layout()
plt.show()

# Statistics by country: per-country means of the columns below, ordered by
# outbreak rate. The rate is then converted to percent and renamed, in one
# chain rather than by mutating the frame in place.
mean_columns = [
    'wssv_outbreak',
    'water_temperature',
    'salinity',
    'stocking_density',
    'wssv_history',
]
country_stats = (
    df.groupby('country')
    .agg({column: 'mean' for column in mean_columns})
    .sort_values('wssv_outbreak', ascending=False)
    .assign(wssv_outbreak=lambda stats: stats['wssv_outbreak'] * 100)
    .rename(columns={'wssv_outbreak': 'outbreak_rate_percent'})
)

print("Statistics by country:")
print(country_stats)


In [None]:
# Summary of Findings

Based on the exploratory data analysis, here are the key findings:

1. **Target Variable Distribution**:
   - The outbreak classes may be balanced or imbalanced; check the outbreak percentage printed by the target-distribution cell after running and record the actual split here
   
2. **Important Environmental Factors**:
   - Water temperature shows a significant relationship with WSSV outbreaks
   - Dissolved oxygen levels appear to be lower in ponds with outbreaks
   - Higher ammonia levels are associated with increased outbreak risk
   
3. **Farm Management Factors**:
   - Higher stocking density correlates with increased outbreak risk
   - Farms with a history of WSSV show much higher outbreak rates
   - Use of probiotics appears to have a protective effect against WSSV
   
4. **Seasonal Patterns**:
   - The wet season shows higher outbreak rates compared to the dry season
   - Certain months show peak outbreak rates (identify the specific months from the "Outbreak Rate by Month" chart after running)
   
5. **Geographical Factors**:
   - Some countries have significantly higher outbreak rates than others
   - This may be due to differences in climate, farming practices, or regulation
   
6. **Feature Correlations**:
   - The features most correlated with outbreaks are still to be filled in: after running, take them from the sorted list printed under "Features most correlated with WSSV outbreak:"
   - These features should be prioritized in the prediction model
   
**Next Steps**:
1. Feature engineering based on the identified relationships
2. Data preprocessing including handling missing values and outliers
3. Train multiple machine learning models to predict WSSV outbreaks
4. Evaluate model performance and select the best model
5. Develop a user-friendly interface for prediction
