## Importing the necessary libraries

In [134]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Import the feature scalers (StandardScaler and RobustScaler)
from sklearn.preprocessing import StandardScaler, RobustScaler

# Evaluation metrics and the model to be applied
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
# export the model
import pickle

In [119]:
# Load the IoTPond3 dataset from its CSV file into a DataFrame
csv_path = 'IoTPond3.csv'
df = pd.read_csv(csv_path)

In [120]:
# Preview the first five rows to sanity-check the parsed columns and values
df.head()

Unnamed: 0,created_at,entry_id,Temperature(C),Turbidity(NTU),Dissolved Oxygen(g/ml),PH,Ammonia(g/ml),Nitrate(g/ml),Population,Fish_Length(cm),Fish_Weight(g)
0,2021-06-19 00:00:04 CET,1941,23.75,80,27.736,7.04911,5.15546,114,50,6.74,3.2
1,2021-06-19 00:00:26 CET,1942,23.75,80,4.195,7.0945,4.53072,114,50,6.74,3.2
2,2021-06-19 00:02:03 CET,1945,23.75,80,10.31,7.07635,5.21473,113,50,6.74,3.2
3,2021-06-19 00:02:26 CET,1946,23.75,81,1.196,7.07181,5.41747,100,50,6.74,3.2
4,2021-06-19 00:03:31 CET,1948,23.75,80,2.338,7.08996,5.45899,112,50,6.74,3.2


In [121]:
# Retrieve the dimensions of the DataFrame 'df' as (rows, columns)
df.shape

(169185, 11)

In [122]:
# List the column labels via the .columns attribute. The labels include their
# units in parentheses and are referenced verbatim by later cells
# (e.g. 'Ammonia(g/ml)', 'Fish_Weight(g)').
column_names = df.columns
print(column_names)

Index(['created_at', 'entry_id', 'Temperature(C)', 'Turbidity(NTU)',
       'Dissolved Oxygen(g/ml)', 'PH', 'Ammonia(g/ml)', 'Nitrate(g/ml)',
       'Population', 'Fish_Length(cm)', 'Fish_Weight(g)'],
      dtype='object')


In [123]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [124]:
# Count the missing values left in each column and print the per-column totals
missing_values = df.isna().sum()
print("Missing Values:", missing_values, sep="\n")

Missing Values:
created_at                0
entry_id                  0
Temperature(C)            0
Turbidity(NTU)            0
Dissolved Oxygen(g/ml)    0
PH                        0
Ammonia(g/ml)             0
Nitrate(g/ml)             0
Population                0
Fish_Length(cm)           0
Fish_Weight(g)            0
dtype: int64


In [125]:
# Drop exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [126]:
# Remove any rows that still contain missing values.
# Row-wise (axis=0) on purpose: axis=1 would drop entire *columns* that contain
# a NaN, silently removing features that later cells reference by name
# (e.g. 'Ammonia(g/ml)', 'Fish_Weight(g)').
df = df.dropna(axis=0)

In [127]:
# Dimensions after removing rows with missing values and duplicate rows
df.shape

(169078, 11)

In [128]:
# Collect the columns holding at least one +inf / -inf entry, then report each one
infinite_markers = [np.inf, -np.inf]
columns_with_inf = [
    name for name in df.columns
    if df[name].isin(infinite_markers).any()
]
for name in columns_with_inf:
    print(f"Column '{name}' contains infinite values.")

Column 'Ammonia(g/ml)' contains infinite values.


In [129]:
# Drop the timestamp and entry-id columns, then display the resulting frame.
# Re-running this cell on the already-reduced frame raises KeyError because the
# columns are gone.
df = df.drop(columns=['created_at', 'entry_id'])
df

Unnamed: 0,Temperature(C),Turbidity(NTU),Dissolved Oxygen(g/ml),PH,Ammonia(g/ml),Nitrate(g/ml),Population,Fish_Length(cm),Fish_Weight(g)
0,23.7500,80,27.736,7.04911,5.15546,114,50,6.74,3.20
1,23.7500,80,4.195,7.09450,4.53072,114,50,6.74,3.20
2,23.7500,80,10.310,7.07635,5.21473,113,50,6.74,3.20
3,23.7500,81,1.196,7.07181,5.41747,100,50,6.74,3.20
4,23.7500,80,2.338,7.08996,5.45899,112,50,6.74,3.20
...,...,...,...,...,...,...,...,...,...
169180,25.6875,100,5.080,1.52004,0.00000,302,50,33.00,294.92
169181,25.6600,100,5.080,1.43379,0.00000,285,50,33.00,294.92
169182,25.3750,100,5.080,1.28853,0.00000,299,50,33.00,294.92
169183,25.4375,100,5.080,1.14327,0.00000,282,50,33.00,294.92


In [130]:
# 'Ammonia(g/ml)' is the column flagged above as containing infinite values.
# Substitute 0 for both +inf and -inf so the column holds only finite numbers.
# NOTE(review): 0 also appears as a regular reading in this column, so replaced
# entries cannot be told apart from measured zeros afterwards - confirm this is intended.
ammonia_col = 'Ammonia(g/ml)'
df[ammonia_col] = df[ammonia_col].replace([np.inf, -np.inf], 0)

In [131]:
# Re-inspect the first rows after dropping the id/timestamp columns and replacing infinities
df.head()

Unnamed: 0,Temperature(C),Turbidity(NTU),Dissolved Oxygen(g/ml),PH,Ammonia(g/ml),Nitrate(g/ml),Population,Fish_Length(cm),Fish_Weight(g)
0,23.75,80,27.736,7.04911,5.15546,114,50,6.74,3.2
1,23.75,80,4.195,7.0945,4.53072,114,50,6.74,3.2
2,23.75,80,10.31,7.07635,5.21473,113,50,6.74,3.2
3,23.75,81,1.196,7.07181,5.41747,100,50,6.74,3.2
4,23.75,80,2.338,7.08996,5.45899,112,50,6.74,3.2


In [132]:
# Separate the regression target from the predictor columns
target_col = "Fish_Weight(g)"
y = df[target_col]                 # target variable
X = df.drop(columns=[target_col])  # features: every remaining column

In [135]:
# Fit a RobustScaler on the feature matrix X and produce the scaled copy in one step.
# NOTE(review): X_scaled is not consumed by the train/test split cell that follows
# (it splits the unscaled X), and the scaler is fitted on all rows before any
# split - confirm whether scaling is meant to feed the model at all.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

In [136]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# The split is performed on the raw (unscaled) feature matrix X.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random Forest Regression Model Performance Result

In [137]:
# Fit a 100-tree random forest on the training split, then predict the held-out rows
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

# Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# MAPE is computed by hand: mean of |error / true value|, expressed in percent.
# It divides by y_test, so it is only meaningful when y_test contains no zeros.
relative_error = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_error)) * 100

# Emit the four metric lines; the trailing '\n' on the last one leaves a blank line after the report
report_lines = [
    f'Mean Squared Error (MSE): {mse}',
    f'Mean Absolute Percentage Error (MAPE): {mape}%',
    f'R-squared (R2) Score: {r2}',
    f'Root Mean Squared Error (RMSE): {rmse}\n',
]
print('\n'.join(report_lines))

Mean Squared Error (MSE): 0.00031113800449491315
Mean Absolute Percentage Error (MAPE): 0.00010942690283709643%
R-squared (R2) Score: 0.9999998499398451
Root Mean Squared Error (RMSE): 0.017639104413062278



In [138]:
# Persist the fitted random forest to disk so it can be reloaded for inference later.
model_path = 'rf_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(random_forest_model, model_file)

# Build the message from the path that was actually written so the reported
# file name cannot drift from the real one.
print(f'Random forest model saved to {model_path}')

Meta-model saved to meta_model.pkl


In [85]:
# Load the saved model back from disk (only unpickle files from a trusted source)
# with open('rf_model.pkl', 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

# new_data = pd.DataFrame(...)  # must contain the same feature columns as X
# predictions = loaded_model.predict(new_data)