In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [14]:

# Read the historical air-quality records from disk
data = pd.read_csv('updated_aqi_dataset_with_plants_filled.csv')

# Features: the six pollutant columns used as model inputs
pollutant_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'O3', 'SO2']
X = data[pollutant_columns]

# Target: each row's ', '-separated plant string becomes a list of plant names
y = data['Recommended_Plants'].map(lambda plants: plants.split(', '))

In [23]:
# Summarise the loaded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25583 entries, 0 to 25582
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   City                25583 non-null  object 
 1   Date                25583 non-null  object 
 2   PM2.5               25583 non-null  float64
 3   PM10                25583 non-null  float64
 4   NO                  25583 non-null  float64
 5   NO2                 25583 non-null  float64
 6   NOx                 25583 non-null  float64
 7   NH3                 25583 non-null  float64
 8   CO                  25583 non-null  float64
 9   SO2                 25583 non-null  float64
 10  O3                  25583 non-null  float64
 11  Benzene             25583 non-null  float64
 12  Toluene             25583 non-null  float64
 13  Xylene              25583 non-null  float64
 14  AQI                 25583 non-null  float64
 15  AQI_Bucket          25583 non-null  object 
 16  Reco

In [None]:
# Turn the per-row plant lists into a binary indicator matrix (one column per plant)
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [16]:
# Train Random Forest Classifier
# A random forest is stochastic (bootstrap sampling and feature sub-sampling), so
# without a seed the fitted model and every metric derived from it change on each
# run. Seed it with the same value used for the train/test split so that
# Restart & Run All reproduces the reported numbers.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import classification_report

In [18]:
# Predict the label-indicator matrix for the test rows
y_pred = model.predict(X_test)

# Generate classification report
# zero_division=0 reports the same value the default ("warn") falls back to, but
# without emitting an ill-defined-metric warning when a class or sample has no
# predicted labels.
report = classification_report(
    y_test,
    y_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

   Aloe Vera       0.77      0.77      0.77      1434
  Areca Palm       0.87      0.86      0.87      3021
 Boston Fern       0.77      0.64      0.70        92
       Ficus       0.82      0.81      0.81      2175
 Money Plant       0.62      0.26      0.37       275
        Neem       0.76      0.38      0.50       241
  Peace Lily       0.99      0.95      0.97       900
      Pothos       0.77      0.64      0.70        92
Rubber Plant       0.54      0.13      0.21        54
 Snake Plant       0.62      0.26      0.37       275
Spider Plant       0.85      0.86      0.86      2280
       Tulsi       0.76      0.38      0.50       241

   micro avg       0.84      0.79      0.81     11080
   macro avg       0.76      0.58      0.64     11080
weighted avg       0.84      0.79      0.80     11080
 samples avg       0.77      0.77      0.77     11080



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Function to predict recommended plants based on new input data
def predict_recommended_plants(input_data):
    """Predict the recommended plants for one set of pollutant readings.

    Relies on the module-level ``model`` (fitted classifier) and ``mlb``
    (fitted label binarizer).

    Parameters
    ----------
    input_data : dict
        Mapping of feature name to value. Key order does not matter. When the
        model exposes ``feature_names_in_``, keys the model was not fitted on
        are ignored.

    Returns
    -------
    list of tuple
        The result of ``mlb.inverse_transform``: one tuple of plant names per
        input row (a single row here).

    Raises
    ------
    ValueError
        If a feature the model was fitted on is missing from ``input_data``.
    """
    # Wrap the mapping in a list so it becomes a single-row DataFrame
    input_df = pd.DataFrame([input_data])

    # The DataFrame inherits the dict's key order, which may not match the
    # order used at fit time. Align the columns to the fitted feature names
    # so a reordered payload cannot mis-assign values or be rejected.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            raise ValueError(f"input_data is missing required features: {missing}")
        input_df = input_df[expected]

    # Make predictions and decode the indicator rows back to plant names
    predictions = model.predict(input_df)
    return mlb.inverse_transform(predictions)

In [20]:
# Sample pollutant readings, keyed by the feature names the model was trained on
json_input = {
    'PM2.5': 159, 'PM10': 84,
    'NO': 6.2, 'NO2': 11.2,
    'O3': 55.6, 'SO2': 17.3,
}

# Run the sample through the trained model and show the decoded plant names
recommended_plants = predict_recommended_plants(json_input)
print("Recommended Plants:", recommended_plants)

Recommended Plants: [('Areca Palm', 'Peace Lily', 'Spider Plant')]


In [21]:
import pickle

In [25]:
# Save the trained classifier.
# The context manager closes the file even if pickle.dump raises, so no handle
# is leaked and no half-open file is left behind.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

# Save the MultiLabelBinarizer (needed to decode predictions back to plant names)
with open("mlb.pkl", "wb") as mlb_out:
    pickle.dump(mlb, mlb_out)

# NOTE: unpickling can execute arbitrary code; only load these files from a trusted source.