In [5]:
import pytest
import pandas as pd
from data_loader import DataLoader
from data_cleaner import DataCleaner
from feature_selection import FeatureSelector
from feature_engineering import FeatureEngineer
from data_preprocessor import DataPreprocessor
# Lift the column display limit so wide frames are not elided when shown.
pd.set_option('display.max_columns', None)


# Load data
data_loader = DataLoader()
data = data_loader.load_data()

# Show BEFORE
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Run the pipeline stages in order; each stage consumes the previous
# stage's output.

# Clean data
data_cleaner = DataCleaner()
data_cleaned = data_cleaner.clean_data(data)

# Feature selection
feature_selector = FeatureSelector()
data_selected = feature_selector.select_features(data_cleaned)

# Feature engineering
feature_engineer = FeatureEngineer()
data_engineered = feature_engineer.engineer_features(data_selected)

# Preprocess data
preprocessor = DataPreprocessor()
data_preprocessed = preprocessor.preprocess(data_engineered)

# Show AFTER
# Kept as the final bare expression so a notebook cell renders the preview.
data_preprocessed.head()

[32m2025-03-14 09:16:54.591[0m | [1mINFO    [0m | [36mdata_loader[0m:[36mload_data[0m:[36m34[0m - [1mLoading data from /Users/theopenguino/Downloads/aiip5-Er-Qi-Yang-227J/data/agri.db[0m


Loading data from /Users/theopenguino/Downloads/aiip5-Er-Qi-Yang-227J/data/agri.db


[32m2025-03-14 09:16:54.971[0m | [1mINFO    [0m | [36mdata_loader[0m:[36mload_data[0m:[36m58[0m - [1mData loaded successfully with shape: (57489, 15)[0m
[32m2025-03-14 09:16:54.972[0m | [34m[1mDEBUG   [0m | [36mdata_loader[0m:[36mload_data[0m:[36m59[0m - [34m[1mColumns in data: ['System Location Code', 'Previous Cycle Plant Type', 'Plant Type', 'Plant Stage', 'Temperature Sensor (°C)', 'Humidity Sensor (%)', 'Light Intensity Sensor (lux)', 'CO2 Sensor (ppm)', 'EC Sensor (dS/m)', 'O2 Sensor (ppm)', 'Nutrient N Sensor (ppm)', 'Nutrient P Sensor (ppm)', 'Nutrient K Sensor (ppm)', 'pH Sensor', 'Water Level Sensor (mm)'][0m
[32m2025-03-14 09:16:55.035[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36mclean_data[0m:[36m36[0m - [1mStarting data cleaning process[0m
[32m2025-03-14 09:16:55.043[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36mclean_nutrient_sensors[0m:[36m65[0m - [1mCleaning nutrient sensor columns[0m
[32m2025-03-14 09:16:55.044[0m | 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57489 entries, 0 to 57488
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   System Location Code          57489 non-null  object 
 1   Previous Cycle Plant Type     57489 non-null  object 
 2   Plant Type                    57489 non-null  object 
 3   Plant Stage                   57489 non-null  object 
 4   Temperature Sensor (°C)       48800 non-null  float64
 5   Humidity Sensor (%)           18622 non-null  float64
 6   Light Intensity Sensor (lux)  53211 non-null  float64
 7   CO2 Sensor (ppm)              57489 non-null  int64  
 8   EC Sensor (dS/m)              57489 non-null  float64
 9   O2 Sensor (ppm)               57489 non-null  int64  
 10  Nutrient N Sensor (ppm)       47515 non-null  object 
 11  Nutrient P Sensor (ppm)       51791 non-null  object 
 12  Nutrient K Sensor (ppm)       53788 non-null  object 
 13  p

[32m2025-03-14 09:16:55.270[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36m_handle_duplicates[0m:[36m129[0m - [1mFound 7489 duplicate rows[0m
[32m2025-03-14 09:16:55.331[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36m_handle_duplicates[0m:[36m135[0m - [1mRemoved duplicates. Keeping 'first' occurrences.[0m
[32m2025-03-14 09:16:55.333[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36m_handle_missing_values[0m:[36m143[0m - [1mHandling missing values[0m
[32m2025-03-14 09:16:55.352[0m | [1mINFO    [0m | [36mdata_cleaner[0m:[36m_handle_missing_values[0m:[36m157[0m - [1mMissing values before imputation:
Temperature Sensor (°C)          7584
Humidity Sensor (%)             33786
Light Intensity Sensor (lux)     3724
Nutrient N Sensor (ppm)          8672
Nutrient P Sensor (ppm)          4958
Nutrient K Sensor (ppm)          3219
Water Level Sensor (mm)          7514
dtype: int64[0m
[32m2025-03-14 09:16:55.396[0m | [1mINFO    [0m | [36mdata_cleaner

data/processed/processed_data.csv


[32m2025-03-14 09:16:56.645[0m | [1mINFO    [0m | [36mdata_preprocessor[0m:[36mpreprocess[0m:[36m61[0m - [1mPreprocessed data saved to /Users/theopenguino/Downloads/aiip5-Er-Qi-Yang-227J/data/processed/processed_data.csv[0m


Unnamed: 0,System Location Code_1,System Location Code_2,System Location Code_3,System Location Code_4,System Location Code_5,System Location Code_6,System Location Code_7,Previous Cycle Plant Type_1,Previous Cycle Plant Type_2,Previous Cycle Plant Type_3,Previous Cycle Plant Type_4,Plant Type_1,Plant Type_2,Plant Type_3,Plant Type_4,Plant Stage,Temperature Sensor (°C),Humidity Sensor (%),Light Intensity Sensor (lux),CO2 Sensor (ppm),EC Sensor (dS/m),O2 Sensor (ppm),Nutrient N Sensor (ppm),Nutrient P Sensor (ppm),Nutrient K Sensor (ppm),pH Sensor,Water Level Sensor (mm),nutrient_balance_index,environmental_stress_index,water_quality_score
0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,maturity,23.34,-0.056522,0.20278,-0.847537,0.47156,-2.249338,0.076917,0.96511,-0.287121,0.195733,0.444162,142.0,297.4,12.023449
1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,vegetative,24.16,-1.743223,0.127032,-0.841741,1.218336,-0.506341,0.076917,0.213315,-0.048783,1.194343,0.974884,143.0,294.46,13.21
2,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,maturity,23.84,-0.056522,0.832067,0.943482,-0.997903,1.236656,1.691884,1.990286,1.262078,0.94469,0.043681,199.666667,381.7,11.203333
3,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,vegetative,22.61,-0.056522,1.350647,1.5231,0.808814,-0.506341,0.004605,0.213315,1.211005,1.443995,0.043681,166.666667,418.854,11.52
4,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,seedling,22.88,-0.056522,-1.603508,-1.57206,2.735979,-1.377839,-2.333481,-2.110418,-0.848919,-1.302182,0.416162,82.666667,210.508,12.086667
