In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline


In [7]:
# Define function to check for missing values
def check_missing_values(df, name):
  missing_counts = df.isnull().sum()
  missing_percentages = missing_counts / len(df) * 100
  print(f"\n** {name} Data Completeness Report:")
  print(f"Total Rows: {len(df)}")
  print("-" * 40)
  print(f"Column | Missing Values (Count) | Missing Values (%)")
  print("-" * 40)
  for col, count in missing_counts.items():
    percent = missing_percentages[col]
    print(f"{col:<20} | {count:<20} | {percent:.2f}%")

# Load data from CSV files
farms = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/farms.csv")
ponds = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/ponds.csv")
cycles = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv")
feeds = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/feeds.csv")
fasting = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/fasting.csv")
harvests = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv")
sampling = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/samplings.csv")
measurement = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/measurements.csv")
feed_tray = pd.read_csv("C:/Users/ahmad.wicaksana/Downloads/Data/feed_tray.csv")

# Check missing values in each DataFrame
check_missing_values(farms, "Farms")
check_missing_values(ponds, "Ponds")
check_missing_values(cycles, "Cycles")
check_missing_values(feeds, "Feeds")
check_missing_values(fasting, "Fasting")
check_missing_values(harvests, "Harvests")
check_missing_values(sampling, "Sampling")
check_missing_values(measurement, "Measurement")
check_missing_values(feed_tray, "Feed Tray")

print("\n** Overall Data Completeness Assessment:")


** Farms Data Completeness Report:
Total Rows: 551
----------------------------------------
Column | Missing Values (Count) | Missing Values (%)
----------------------------------------
id                   | 0                    | 0.00%
province             | 72                   | 13.07%
regency              | 93                   | 16.88%
timezone             | 0                    | 0.00%

** Ponds Data Completeness Report:
Total Rows: 338
----------------------------------------
Column | Missing Values (Count) | Missing Values (%)
----------------------------------------
id                   | 0                    | 0.00%
farm_id              | 0                    | 0.00%
length               | 23                   | 6.80%
width                | 29                   | 8.58%
deep                 | 92                   | 27.22%
created_at           | 0                    | 0.00%
updated_at           | 0                    | 0.00%
record_id            | 0                    | 0.00%

Calculate survival rate

In [8]:
# Define file paths
cycles_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv'
harvests_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv'
sampling_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/samplings.csv'

cycles_df = pd.read_csv(cycles_file)
harvests_df = pd.read_csv(harvests_file)

cycles.dropna(inplace=True)
harvests_df.dropna(inplace=True)


# Initialize a list to store the survival rates
survival_rates = []

for cycle_id in cycles_df['id']:
    # Filter data for the current cycle
    cycle_df = cycles_df[cycles_df['id'] == cycle_id]
    harvest_df = harvests_df[harvests_df['cycle_id'] == cycle_id]

    # Calculate total number of shrimp post-larvae (PL) stocked
    total_pl_stocked = cycle_df['total_seed'].sum()

    # Check if there are any rows in harvest_df
    if not harvest_df.empty:
        # Calculate total number of shrimp tails harvested
        total_weight_harvested = harvest_df['weight'].sum()  # Total weight of harvested shrimp in kilograms
        shrimp_per_kg = harvest_df['size'].iloc[0]  # Assuming 'size' remains constant for all rows in the cycle
        total_tails_harvested = total_weight_harvested * shrimp_per_kg

        # Calculate survival rate if total_pl_stocked is not zero
        if total_pl_stocked != 0:
            survival_rate = min((total_tails_harvested / total_pl_stocked) * 100, 100)  # Cap survival rate at 100%
            survival_rates.append(survival_rate)
            print(f"Survival Rate for cycle {cycle_id}: {survival_rate:.2f}%")
        else:
            print(f"No shrimp post-larvae (PL) stocked in cycle {cycle_id}. Unable to calculate survival rate.")
    else:
        print(f"No harvest data available for cycle {cycle_id}. Unable to calculate survival rate.")

Survival Rate for cycle 18876: 46.35%
Survival Rate for cycle 22118: 74.25%
Survival Rate for cycle 24088: 59.39%
No harvest data available for cycle 17743. Unable to calculate survival rate.
Survival Rate for cycle 17125: 8.43%
Survival Rate for cycle 28070: 100.00%
Survival Rate for cycle 26058: 89.92%
Survival Rate for cycle 26058: 89.92%
No harvest data available for cycle 24702. Unable to calculate survival rate.
No harvest data available for cycle 24007. Unable to calculate survival rate.
No harvest data available for cycle 25693. Unable to calculate survival rate.
Survival Rate for cycle 9921: 32.06%
Survival Rate for cycle 12324: 49.69%
Survival Rate for cycle 18732: 16.29%
Survival Rate for cycle 24480: 56.42%
Survival Rate for cycle 20530: 100.00%
Survival Rate for cycle 24542: 74.65%
Survival Rate for cycle 24886: 100.00%
Survival Rate for cycle 23341: 100.00%
Survival Rate for cycle 21662: 100.00%
Survival Rate for cycle 11854: 34.12%
Survival Rate for cycle 26165: 75.86%
S

Calculate ADG

In [9]:
# Load necessary data from CSV files
cycles_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv'
harvests_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv'
sampling_file = 'C:/Users/ahmad.wicaksana/Downloads/Data/samplings.csv'

cycles_df = pd.read_csv(cycles_file)
harvests_df = pd.read_csv(harvests_file)
sampling_df = pd.read_csv(sampling_file)

# Calculate survival rate, ABW, and ADG for each cycle
for cycle_id in cycles_df['id']:
    # Filter data for the current cycle
    cycle_df = cycles_df[cycles_df['id'] == cycle_id]
    harvest_df = harvests_df[harvests_df['cycle_id'] == cycle_id]
    sampling_df_cycle = sampling_df[sampling_df['cycle_id'] == cycle_id]

    # Calculate ABW if sampling data is available
    if not sampling_df_cycle.empty:
        total_weight_sampled = sampling_df_cycle['average_weight'].sum()  # Total weight of sampled shrimp in grams
        num_sampled = len(sampling_df_cycle)  # Number of shrimp sampled
        abw = total_weight_sampled / num_sampled
        #print(f"Average Body Weight (ABW) for cycle {cycle_id}: {abw:.2f} grams")

        # Calculate ADG if there are at least two sampling points
        if len(sampling_df_cycle) >= 2:
            # Sort sampling data by sampled_at date
            sampling_df_cycle = sampling_df_cycle.sort_values(by='sampled_at')

            # Calculate time interval between consecutive samplings in days
            sampling_df_cycle['sampled_at'] = pd.to_datetime(sampling_df_cycle['sampled_at'])  # Convert sampled_at to datetime
            sampling_df_cycle['time_interval'] = sampling_df_cycle['sampled_at'].diff().dt.days

            # Calculate ABW difference between consecutive samplings
            sampling_df_cycle['abw_diff'] = sampling_df_cycle['average_weight'].diff()

            # Calculate ADG
            adg = sampling_df_cycle['abw_diff'].mean() / sampling_df_cycle['time_interval'].mean()
            print(f"Average Daily Gain (ADG) for cycle {cycle_id}: {adg:.2f} grams/day")
        else:
            print(f"Insufficient sampling data available for calculating ADG in cycle {cycle_id}.")
    else:
        print(f"No sampling data available for cycle {cycle_id}. Unable to calculate ABW and ADG.")


Average Daily Gain (ADG) for cycle 18876: 0.15 grams/day
Insufficient sampling data available for calculating ADG in cycle 22118.
Average Daily Gain (ADG) for cycle 24088: 0.30 grams/day
Average Daily Gain (ADG) for cycle 17743: 0.27 grams/day
Average Daily Gain (ADG) for cycle 17125: 0.17 grams/day
Average Daily Gain (ADG) for cycle 28070: 0.25 grams/day
Average Daily Gain (ADG) for cycle 26058: 0.26 grams/day
Average Daily Gain (ADG) for cycle 26058: 0.26 grams/day
No sampling data available for cycle 24702. Unable to calculate ABW and ADG.
No sampling data available for cycle 24007. Unable to calculate ABW and ADG.
No sampling data available for cycle 25693. Unable to calculate ABW and ADG.
Average Daily Gain (ADG) for cycle 9921: 0.16 grams/day
Average Daily Gain (ADG) for cycle 12324: 0.11 grams/day
Average Daily Gain (ADG) for cycle 18732: 0.22 grams/day
Average Daily Gain (ADG) for cycle 24480: 0.10 grams/day
Average Daily Gain (ADG) for cycle 20530: 0.20 grams/day
Average Daily

  sampling_df_cycle['sampled_at'] = pd.to_datetime(sampling_df_cycle['sampled_at'])  # Convert sampled_at to datetime


Average Daily Gain (ADG) for cycle 9094: 0.23 grams/day
Average Daily Gain (ADG) for cycle 4241: 0.31 grams/day
Average Daily Gain (ADG) for cycle 12860: 0.37 grams/day
Average Daily Gain (ADG) for cycle 26552: 0.04 grams/day
Insufficient sampling data available for calculating ADG in cycle 20269.
Average Daily Gain (ADG) for cycle 15682: 0.15 grams/day
Average Daily Gain (ADG) for cycle 22355: 0.33 grams/day
Average Daily Gain (ADG) for cycle 23297: 0.40 grams/day
Average Daily Gain (ADG) for cycle 19369: 0.31 grams/day
Average Daily Gain (ADG) for cycle 19691: 0.15 grams/day
Average Daily Gain (ADG) for cycle 16424: 0.29 grams/day
Average Daily Gain (ADG) for cycle 10241: 0.25 grams/day
Average Daily Gain (ADG) for cycle 22534: 0.25 grams/day
Average Daily Gain (ADG) for cycle 26476: 0.21 grams/day
Average Daily Gain (ADG) for cycle 25375: 0.08 grams/day
No sampling data available for cycle 17456. Unable to calculate ABW and ADG.
Average Daily Gain (ADG) for cycle 27696: 0.13 grams/d

  adg = sampling_df_cycle['abw_diff'].mean() / sampling_df_cycle['time_interval'].mean()


Average Daily Gain (ADG) for cycle 22266: 0.19 grams/day
Average Daily Gain (ADG) for cycle 11042: 0.28 grams/day
Average Daily Gain (ADG) for cycle 23733: 0.25 grams/day
Average Daily Gain (ADG) for cycle 15515: 0.43 grams/day
Insufficient sampling data available for calculating ADG in cycle 17945.
Average Daily Gain (ADG) for cycle 4044: 0.25 grams/day
Insufficient sampling data available for calculating ADG in cycle 18759.
Average Daily Gain (ADG) for cycle 12716: 0.19 grams/day
Average Daily Gain (ADG) for cycle 16349: 0.28 grams/day
No sampling data available for cycle 18887. Unable to calculate ABW and ADG.
No sampling data available for cycle 10017. Unable to calculate ABW and ADG.
Average Daily Gain (ADG) for cycle 24948: 0.09 grams/day
No sampling data available for cycle 19373. Unable to calculate ABW and ADG.
Average Daily Gain (ADG) for cycle 16384: 0.07 grams/day
Average Daily Gain (ADG) for cycle 19067: 0.14 grams/day
Average Daily Gain (ADG) for cycle 17194: 0.28 grams/d

  adg = sampling_df_cycle['abw_diff'].mean() / sampling_df_cycle['time_interval'].mean()


Average Daily Gain (ADG) for cycle 23330: 0.19 grams/day
Average Daily Gain (ADG) for cycle 19872: 0.28 grams/day
Average Daily Gain (ADG) for cycle 24054: 0.20 grams/day
Average Daily Gain (ADG) for cycle 24155: 0.31 grams/day
Average Daily Gain (ADG) for cycle 20672: 0.18 grams/day
Average Daily Gain (ADG) for cycle 21663: 0.12 grams/day
Average Daily Gain (ADG) for cycle 11340: 0.26 grams/day
Average Daily Gain (ADG) for cycle 19306: 0.20 grams/day
Average Daily Gain (ADG) for cycle 19518: 0.31 grams/day
Average Daily Gain (ADG) for cycle 17437: 0.28 grams/day
Average Daily Gain (ADG) for cycle 4238: 0.37 grams/day
Average Daily Gain (ADG) for cycle 18130: 0.23 grams/day
Average Daily Gain (ADG) for cycle 23109: 0.16 grams/day
Average Daily Gain (ADG) for cycle 24570: 0.15 grams/day
Average Daily Gain (ADG) for cycle 19088: 0.10 grams/day
Insufficient sampling data available for calculating ADG in cycle 15490.
No sampling data available for cycle 20149. Unable to calculate ABW and A

  adg = sampling_df_cycle['abw_diff'].mean() / sampling_df_cycle['time_interval'].mean()
  adg = sampling_df_cycle['abw_diff'].mean() / sampling_df_cycle['time_interval'].mean()


Average Daily Gain (ADG) for cycle 17063: 0.30 grams/day
Average Daily Gain (ADG) for cycle 18538: 0.22 grams/day
Average Daily Gain (ADG) for cycle 21661: 0.16 grams/day
Average Daily Gain (ADG) for cycle 28843: 0.15 grams/day
Insufficient sampling data available for calculating ADG in cycle 21360.
Average Daily Gain (ADG) for cycle 10201: 0.27 grams/day
Average Daily Gain (ADG) for cycle 15809: 0.27 grams/day
Average Daily Gain (ADG) for cycle 18027: 0.20 grams/day
Average Daily Gain (ADG) for cycle 23709: 0.27 grams/day
Insufficient sampling data available for calculating ADG in cycle 29264.
Average Daily Gain (ADG) for cycle 12861: 0.29 grams/day
Average Daily Gain (ADG) for cycle 24695: 0.21 grams/day
Average Daily Gain (ADG) for cycle 18834: 0.09 grams/day
Average Daily Gain (ADG) for cycle 17879: 0.22 grams/day
Average Daily Gain (ADG) for cycle 5435: 0.36 grams/day
No sampling data available for cycle 10169. Unable to calculate ABW and ADG.
Average Daily Gain (ADG) for cycle 15

Survival Rate prediction

In [10]:
# Load data
cycles_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv')
feeds_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/feeds.csv')
harvest_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv')
fasting_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/fasting.csv')

# Merge relevant data using 'id' column from 'cycles_df'
merged_df = pd.merge(cycles_df, harvests_df, left_on='id', right_on='cycle_id')
merged_df = pd.merge(merged_df, feeds_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')
merged_df = pd.merge(merged_df, fasting_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')


# Feature Engineering
merged_df['cultivation_duration'] = (pd.to_datetime(merged_df['finished_at']) - pd.to_datetime(merged_df['started_at'])).dt.days
merged_df['total_feed_given'] = merged_df['quantity']
merged_df['total_fasting_days'] = merged_df['fasting']

# Fill missing values
merged_df['total_feed_given'].fillna(0, inplace=True)
merged_df['total_fasting_days'].fillna(0, inplace=True)

# Calculate Survival Rate
merged_df['survival_rate'] = (merged_df['size'] / merged_df['total_seed']) * 100

# Select features for modeling
features = ['total_seed', 'total_feed_given', 'total_fasting_days', 'cultivation_duration']
X = merged_df[features]
y = merged_df['survival_rate']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline for preprocessing and modeling
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('regressor', RandomForestRegressor(random_state=42))  # RandomForestRegressor model
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['total_feed_given'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['total_fasting_days'].fillna(0, inplace=True)


Mean Squared Error: 0.20996865095130934


ABW prediction

In [11]:
# Load data
sampling_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/samplings.csv')
cycles_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv')
feeds_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/feeds.csv')
measurements_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/measurements.csv')

# Merge relevant dataframes
merged_df = pd.merge(sampling_df, cycles_df, left_on='cycle_id', right_on='id', how='left')
merged_df = pd.merge(merged_df, feeds_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')
merged_df = pd.merge(merged_df, measurements_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')



# Create a composite water quality index
merged_df['water_quality_index'] = (merged_df['morning_temperature'] +
                                    merged_df['evening_temperature'] +
                                    merged_df['morning_do'] +
                                    merged_df['evening_do'] +
                                    merged_df['morning_salinity'] +
                                    merged_df['evening_salinity']) / 6.0


# Interaction term between feed quantity and water temperature
merged_df['feed_temp_interaction'] = merged_df['quantity'] * merged_df['morning_temperature']

# Duration of cultivation cycle
merged_df['duration_days'] = (pd.to_datetime(merged_df['finished_at']) - pd.to_datetime(merged_df['started_at'])).dt.days

# Previous cycle performance (average weight of shrimp in the previous cycle)
merged_df['prev_cycle_avg_weight'] = merged_df.groupby('pond_id_y')['average_weight'].shift(1)


# Define features and target variable
features = ['water_quality_index',
            'feed_temp_interaction', 'duration_days', 'prev_cycle_avg_weight']
target = 'average_weight'

X = merged_df[features]
y = merged_df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform simple imputation
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize and fit the model
model = LinearRegression()
model.fit(X_train_imputed, y_train)

# Predict
y_pred = model.predict(X_test_imputed)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 12.59986108911099


Biomass prediction

In [12]:
# Load data from CSV files
farms_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/farms.csv')
ponds_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/ponds.csv')
cycles_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/cycles.csv')
feeds_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/feeds.csv')
harvests_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv')
sampling_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/samplings.csv')
measurements_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/measurements.csv')

# Merge relevant dataframes
merged_df = pd.merge(cycles_df, ponds_df, on='id', suffixes=('_cycle', '_pond'))
merged_df = pd.merge(merged_df, harvests_df.groupby('cycle_id').sum().reset_index(), left_on='id', right_on='cycle_id', how='left')
merged_df = pd.merge(merged_df, feeds_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')
merged_df = pd.merge(merged_df, sampling_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')
merged_df = pd.merge(merged_df, measurements_df.groupby('cycle_id').sum().reset_index(), on='cycle_id', how='left')


# Feature engineering
merged_df['duration_days'] = (pd.to_datetime(merged_df['finished_at']) - pd.to_datetime(merged_df['started_at'])).dt.days
merged_df['total_feed_kg'] = merged_df['quantity'].fillna(0)  # Fill NaN values with 0 for feed quantity
merged_df['average_weight_kg'] = merged_df['average_weight'] / 1000  # Convert average weight from grams to kilograms

# Selecting relevant features
features = ['duration_days', 'total_feed_kg', 'morning_temperature', 'evening_temperature', 'morning_do', 'evening_do',
            'morning_salinity', 'evening_salinity', 'morning_pH', 'evening_pH', 'area']

X = merged_df[features]
y = merged_df['average_weight_kg']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_imputed, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Evaluate model on test set
y_pred = best_rf_model.predict(X_test_imputed)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 5.540332660000013e-05


Revenue prediction

In [13]:
# Load data
harvests_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/harvests.csv')
measurements_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/measurements.csv')
feeds_df = pd.read_csv('C:/Users/ahmad.wicaksana/Downloads/Data/feeds.csv')

# Merge relevant dataframes
merged_df = pd.merge(feeds_df, cycles_df, left_on='cycle_id', right_on='id', how='left')
merged_df = pd.merge(merged_df, harvests_df.groupby('cycle_id').sum().reset_index(), left_on='id', right_on='cycle_id', how='left')

# Feature Engineering
merged_df['cycle_duration'] = (pd.to_datetime(merged_df['finished_at']) - pd.to_datetime(merged_df['started_at'])).dt.days
merged_df['shrimp_growth_rate'] = merged_df['size'] / merged_df['cycle_duration']
merged_df['feeding_rate'] = merged_df['quantity'] / merged_df['cycle_duration']

# You can continue feature engineering based on the other suggestions

# Selecting relevant features and target variable
features = ['cycle_duration', 'shrimp_growth_rate', 'feeding_rate', 'area', 'size', 'weight']
target = 'selling_price'
imputer = SimpleImputer(strategy='mean')
merged_df[features] = imputer.fit_transform(merged_df[features])
merged_df['selling_price'] = imputer.fit_transform(merged_df[['selling_price']])

X = merged_df[features]
y = merged_df[target]  # Assuming 'revenue' is the target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train a regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict revenue
y_pred = model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)



Mean Squared Error: 176309805954.4623


In [14]:
import joblib

# Save the trained model to a file
joblib.dump(pipeline, 'survival_rate_model.pkl')
joblib.dump(model, 'average_weight_prediction_model.pkl')
joblib.dump(best_rf_model, 'biomass.pkl')
joblib.dump(model, 'revenue.pkl')

['revenue.pkl']