In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import mlflow
import mlflow.sklearn
import os

In [2]:
dataset = pd.read_csv("dataset/Zone4_2023_labelled.csv")

In [3]:
numerical_cols = dataset.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('urban_farming')

In [4]:
categorical_cols = dataset.select_dtypes(include=['object']).columns.tolist()

In [5]:
for col in categorical_cols:
    mode_value = dataset[col].mode()[0]
    dataset[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(mode_value, inplace=True)


In [6]:
for col in numerical_cols:
    median_value = dataset[col].median()
    dataset[col].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [7]:
dataset[numerical_cols] = dataset[numerical_cols].apply(lambda x: x.clip(lower=x.quantile(0.05), upper=x.quantile(0.95)))

In [8]:
log_columns = ['GHI (kWh/m2)', 'NDBI', 'NDVI', 'BU', 'slope', 'surface_roughness', 'classes',
               'CO_column_number_density_x', 'NO2_column_number_density_x', 'O3_column_number_density_x', 
               'SO2_column_number_density_x', 'CO_column_number_density_y', 'NO2_column_number_density_y', 
               'O3_column_number_density_y', 'SO2_column_number_density_y', 'ST_B10', 'NDWI', 
               'soil_moisture', 'SAVI']

In [9]:
for col in log_columns:
    dataset[col] = dataset[col].apply(lambda x: np.log(x) if x > 0 else x)

In [10]:
drop_columns = ['CH4_column_volume_mixing_ratio_dry_air_x', 'tropospheric_HCHO_column_number_density_x', 
                'CH4_column_volume_mixing_ratio_dry_air_y', 'tropospheric_HCHO_column_number_density_y', 
                'cluster']
dataset.drop(columns=drop_columns, inplace=True)

In [11]:
for col in categorical_cols:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

In [12]:
X = dataset.drop(columns=['urban_farming'])
y = dataset['urban_farming']
RANDOM_SEED = 6

In [13]:
print("Y value counts :", y.value_counts())

Y value counts : urban_farming
0    60751
1     2603
Name: count, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_SEED, stratify=y)

In [15]:
print("Y_Train value counts", y_train.value_counts())

Y_Train value counts urban_farming
0    42525
1     1822
Name: count, dtype: int64


In [16]:
rf = RandomForestClassifier(random_state=RANDOM_SEED)
param_grid_forest = {
    'n_estimators': [200, 400, 700],
    'max_depth': [10, 20, 30],
    'criterion': ["gini", "entropy"],
    'max_leaf_nodes': [50, 100]
}

In [18]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

grid_forest = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_forest,
    cv=cv,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
model_forest = grid_forest.fit(X_train, y_train)

In [19]:
lr = LogisticRegression(random_state=RANDOM_SEED)
param_grid_log = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

In [20]:
grid_log = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_log,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
model_log = grid_log.fit(X_train, y_train)



In [21]:
dt = DecisionTreeClassifier(random_state=RANDOM_SEED)

param_grid_tree = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    'criterion': ["gini", "entropy"],
}

grid_tree = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_tree,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
model_tree = grid_tree.fit(X_train, y_train)

In [24]:
def eval_metrics(actual, pred):
    accuracy = metrics.accuracy_score(actual, pred)
    f1 = metrics.f1_score(actual, pred, pos_label=1)
    fpr, tpr, _ = metrics.roc_curve(actual, pred)
    auc = metrics.auc(fpr, tpr)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='blue', label='ROC curve area = %0.2f' % auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.xlabel('False Positive Rate', size=14)
    plt.ylabel('True Positive Rate', size=14)
    plt.legend(loc='lower right')
    os.makedirs("plot", exist_ok=True)
    plt.savefig("plot/ROC_curve.png")
    plt.close()
    return accuracy, f1, auc

In [30]:
def model_metrics(model, X, y, name):
    pred = model.predict(X)
    accuracy, f1, auc = eval_metrics(y, pred)
    print("\n")
    print("Model --> ", model, name)
    print("Mean CV score", model.best_score_)
    print("Accuracy", accuracy)
    print("f1-score", f1)
    print("AUC", auc)

In [31]:
model_metrics(model_tree, X_test, y_test, "DecisionTreeClassifier")
model_metrics(model_log, X_test, y_test, "LogisticRegression")
model_metrics(model_forest, X_test, y_test, "RandomForestClassifier")



Model -->  GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=6), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 9, 11, 13]},
             scoring='accuracy') DecisionTreeClassifier
Mean CV score 0.9902586501784778
Accuracy 0.989004051139054
f1-score 0.866111467008328
AUC 0.9299254220694417


Model -->  GridSearchCV(cv=5, estimator=LogisticRegression(random_state=6), n_jobs=-1,
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             scoring='accuracy') LogisticRegression
Mean CV score 0.9712946390039259
Accuracy 0.9698005997790288
f1-score 0.5980392156862745
AUC 0.7673321434547851


Model -->  GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=6, shuffle=True),
             estimator=RandomForestClassifier(random_state=6), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
              

# My Analysis

Here is the ROC curve:

![ROC Curve](plot/ROC_curve.png)