In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [13]:
df = pd.read_csv('preprocessed_dataa.csv')

In [14]:
df.dropna(inplace=True)

In [15]:
X = df.drop('PM2.5', axis=1)
y = df['PM2.5']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [18]:
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

In [19]:
rf.fit(X_train, y_train)


RandomForestRegressor(max_depth=10, random_state=42)

In [20]:
y_pred = rf.predict(X_test)

In [21]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.15873892159810896


In [22]:
rf = RandomForestRegressor(n_estimators=600, max_depth=10, random_state=42)

In [23]:
rf.fit(X_train, y_train)


RandomForestRegressor(max_depth=10, n_estimators=600, random_state=42)

In [24]:
y_pred = rf.predict(X_test)

In [25]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.1584061000886133


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [27]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [28]:
rf = RandomForestRegressor()

In [29]:
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

In [30]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   random_state=42, verbose=2)

In [31]:
print(rf_random.best_params_)

{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}


In [32]:
rf_model = RandomForestRegressor(n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features='log2', max_depth=20, bootstrap=True)

In [33]:
rf_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=20, max_features='log2', min_samples_leaf=2,
                      min_samples_split=5, n_estimators=300)

In [34]:
y_pred = rf_random.best_estimator_.predict(X_test)

In [35]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.15722181578368674
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   2.9s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   4.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.0s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.8s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=   7.7s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   4.

In [36]:
from sklearn.metrics import r2_score


In [37]:
y_test_pred = rf_random.predict(X_test)

In [38]:
r2 = r2_score(y_test, y_test_pred)

In [39]:
print("R-squared score:", r2)

R-squared score: 0.8474984224077591


In [41]:
from sklearn.metrics import classification_report

In [42]:
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

ValueError: continuous is not supported

In [45]:
from sklearn.metrics import r2_score

In [46]:
r2_score(y_test, y_pred)

0.8487027579803625

In [81]:
x_test_dt = [[10000,611110,100000222.1,0.1111114,1.2]]

In [82]:
df = pd.DataFrame(x_test_dt)

In [83]:
pred = rf_model.predict(df)



In [84]:
print (pred)

[0.91698819]
