In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import mlflow
import mlflow.sklearn
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the input table from a CSV path that is relative to the working directory.
dataset = pd.read_csv("dataset/Merged_2014.csv")

In [3]:
# Preview the first rows to check the available columns and sample values.
dataset.head()

Unnamed: 0,Latitude,Longitude,Zone,NDVI,landuse,LST,NDBI,NDWI,Roughness,SAVI,Slope,SMI,solar_radiation,Suitable_Areas
0,45.47236,9.202701,zone4,0.412452,grass,121.681648,-0.198779,-0.405585,0,0.61856,0.0,0.172691,471.5671,0
1,45.47236,9.202971,zone4,0.413815,grass,121.681648,-0.200118,-0.406559,0,0.620604,0.0,0.174177,471.546965,0
2,45.47236,9.20324,zone4,0.415186,grass,121.681648,-0.201389,-0.407536,0,0.622661,0.0,0.175687,471.528761,0
3,45.47236,9.20351,zone4,0.416562,grass,121.681648,-0.202568,-0.408515,0,0.624726,0.0,0.177219,471.512966,0
4,45.47236,9.203779,zone4,0.417939,grass,121.681648,-0.203623,-0.40949,0,0.626792,0.0,0.178773,471.500123,0


In [4]:
# Names of the int64/float64 columns, with the 'Suitable_Areas' column taken out
# so that later numeric preprocessing does not touch it.
numerical_cols = list(dataset.select_dtypes(include=['int64', 'float64']).columns)
numerical_cols.remove('Suitable_Areas')

In [5]:
# Names of the object-dtype columns; these are imputed and label-encoded later.
categorical_cols = list(dataset.select_dtypes(include=['object']).columns)

In [6]:
# Fill missing categorical values with each column's most frequent value.
for col in categorical_cols:
    modes = dataset[col].mode()
    if modes.empty:
        # The column is entirely missing, so there is no mode to impute with.
        continue
    mode_value = modes.iloc[0]
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the intermediate `dataset[col]` object: that chained in-place call is
    # deprecated and does not update `dataset` under pandas Copy-on-Write.
    dataset[col] = dataset[col].fillna(mode_value)

In [7]:
# Fill missing numeric values with each column's median.
for col in numerical_cols:
    median_value = dataset[col].median()
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the intermediate `dataset[col]` object: that chained in-place call is
    # deprecated and does not update `dataset` under pandas Copy-on-Write.
    dataset[col] = dataset[col].fillna(median_value)

In [8]:
# Clip every numeric column to its own 5th-95th percentile range.
# NOTE(review): the quantiles are taken over the full dataset, i.e. before the
# train/test split performed further down — confirm this is intended.
for col in numerical_cols:
    lower_bound = dataset[col].quantile(0.05)
    upper_bound = dataset[col].quantile(0.95)
    dataset[col] = dataset[col].clip(lower=lower_bound, upper=upper_bound)

In [9]:
# Columns that receive the conditional log transform in the next cell.
log_columns = [
    "NDVI",
    "LST",
    "NDBI",
    "NDWI",
    "Roughness",
    "SAVI",
    "Slope",
    "SMI",
    "solar_radiation",
]

In [10]:
def _log_if_positive(value):
    """Return the natural log of ``value`` when it is > 0, else ``value`` itself."""
    if value > 0:
        return np.log(value)
    return value


# Apply the element-wise transform to each listed column. Values that are not
# strictly positive (including NaN, for which `> 0` is False) pass through
# unchanged.
# NOTE(review): transformed positive values and untouched non-positive values
# end up on the same axis, so the mapping is not monotonic — confirm intended.
for col in log_columns:
    dataset[col] = dataset[col].map(_log_if_positive)

In [11]:
# Replace each categorical column with integer codes from its own encoder.
# A new LabelEncoder is created per column, so `le` holds only the encoder of
# the last column once the loop has finished.
for col in categorical_cols:
    le = LabelEncoder().fit(dataset[col])
    dataset[col] = le.transform(dataset[col])

In [12]:
# Seed shared by the split, the cross-validation splitter and the estimators.
RANDOM_SEED = 6

# Target vector and feature matrix (every column except the target).
y = dataset['Suitable_Areas']
X = dataset.drop('Suitable_Areas', axis=1)

In [13]:
# Show how many rows fall in each target class.
class_counts = y.value_counts()
print("Y value counts :", class_counts)

Y value counts : Suitable_Areas
0    80436
1    37158
Name: count, dtype: int64


In [14]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_SEED,
)

In [15]:
# Show the class balance of the training part after the stratified split.
train_class_counts = y_train.value_counts()
print("Y_Train value counts", train_class_counts)

Y_Train value counts Suitable_Areas
0    56305
1    26010
Name: count, dtype: int64


In [16]:
# Hyper-parameter grid searched for the random forest (3 * 3 * 2 * 2 = 36 combinations).
param_grid_forest = dict(
    n_estimators=[200, 400, 700],
    max_depth=[10, 20, 30],
    criterion=["gini", "entropy"],
    max_leaf_nodes=[50, 100],
)

# Base estimator handed to the grid search; seeded for repeatable forests.
rf = RandomForestClassifier(random_state=RANDOM_SEED)

In [17]:
# Shuffled, seeded 5-fold stratified splitter used for cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Exhaustive search over `param_grid_forest`, scored by accuracy on all cores.
forest_search_options = dict(cv=cv, n_jobs=-1, scoring='accuracy', verbose=0)
grid_forest = GridSearchCV(estimator=rf, param_grid=param_grid_forest, **forest_search_options)

# fit() returns the search object itself, so both names refer to the same
# fitted GridSearchCV instance.
grid_forest.fit(X_train, y_train)
model_forest = grid_forest

In [18]:
# Hyper-parameter grid for logistic regression: five values of C combined with
# L1 and L2 penalties, all using the 'liblinear' solver.
param_grid_log = dict(
    C=[100, 10, 1.0, 0.1, 0.01],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Base estimator handed to the grid search.
lr = LogisticRegression(random_state=RANDOM_SEED)

In [None]:
# Grid search for logistic regression, scored by accuracy.
# Reuse the shuffled, seeded `cv` splitter defined for the random-forest search
# instead of `cv=5` (which builds unshuffled folds), so that every model's
# mean CV score is computed on the same folds and can be compared directly.
grid_log = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_log,
    cv=cv,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
# fit() returns the fitted search object itself.
model_log = grid_log.fit(X_train, y_train)

In [None]:
# Base decision tree handed to the grid search; seeded for repeatable fits.
dt = DecisionTreeClassifier(random_state=RANDOM_SEED)

# Hyper-parameter grid: six tree depths combined with two split criteria.
param_grid_tree = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    'criterion': ["gini", "entropy"],
}

# Reuse the shuffled, seeded `cv` splitter defined for the random-forest search
# instead of `cv=5` (which builds unshuffled folds), so that every model's
# mean CV score is computed on the same folds and can be compared directly.
grid_tree = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_tree,
    cv=cv,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
# fit() returns the fitted search object itself.
model_tree = grid_tree.fit(X_train, y_train)

In [None]:
def eval_metrics(actual, pred, scores=None, plot_path="plot/ROC_curve.png"):
    """Compute accuracy, F1 and ROC AUC, and save the ROC curve as an image.

    Parameters
    ----------
    actual : array-like
        True labels; class ``1`` is treated as the positive class.
    pred : array-like
        Predicted class labels, used for accuracy and F1.
    scores : array-like, optional
        Continuous scores for the positive class (for example a predicted
        probability). When given, the ROC curve and AUC are computed from
        them. When omitted, ``pred`` is used instead; hard labels offer only
        one decision threshold, so the resulting curve is a single operating
        point joined to the corners rather than a full ROC curve.
    plot_path : str, optional
        File the ROC plot is written to. Each call overwrites this file, so
        pass distinct paths to keep the curves of several models.

    Returns
    -------
    tuple
        ``(accuracy, f1, auc)``.
    """
    accuracy = metrics.accuracy_score(actual, pred)
    f1 = metrics.f1_score(actual, pred, pos_label=1)

    roc_input = pred if scores is None else scores
    fpr, tpr, _ = metrics.roc_curve(actual, roc_input)
    auc = metrics.auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.plot(fpr, tpr, color='blue', label='ROC curve area = %0.2f' % auc)
        # Diagonal reference line (TPR == FPR).
        ax.plot([0, 1], [0, 1], 'r--')
        ax.set_xlim([-0.1, 1.1])
        ax.set_ylim([-0.1, 1.1])
        ax.set_xlabel('False Positive Rate', size=14)
        ax.set_ylabel('True Positive Rate', size=14)
        ax.legend(loc='lower right')

        plot_dir = os.path.dirname(plot_path)
        if plot_dir:
            os.makedirs(plot_dir, exist_ok=True)
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)
    return accuracy, f1, auc


def _positive_class_scores(model, X):
    """Return continuous scores for class ``1``, or None if the model has none."""
    if hasattr(model, "predict_proba"):
        positive_col = list(model.classes_).index(1)
        return model.predict_proba(X)[:, positive_col]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return None


def model_metrics(model, X, y, name):
    """Evaluate a fitted search object on ``X``/``y`` and print its metrics.

    Parameters
    ----------
    model : fitted search object
        Must provide ``predict`` and ``best_score_``; ``predict_proba`` or
        ``decision_function`` is used for the ROC curve when available.
    X : array-like
        Feature matrix to evaluate on.
    y : array-like
        True labels for ``X``.
    name : str
        Label printed next to the model.

    Notes
    -----
    The ROC plot is saved to the default path of ``eval_metrics``, so a later
    call replaces the image written by an earlier one.
    """
    pred = model.predict(X)
    # Use positive-class scores for the ROC curve; hard labels alone would
    # reduce it to a single operating point.
    scores = _positive_class_scores(model, X)
    accuracy, f1, auc = eval_metrics(y, pred, scores=scores)
    print("\n")
    print("Model --> ", model, name)
    print("Mean CV score", model.best_score_)
    print("Accuracy", accuracy)
    print("f1-score", f1)
    print("AUC", auc)

In [None]:
# Evaluate every fitted search on the held-out test set, in this fixed order.
fitted_searches = [
    (model_tree, "DecisionTreeClassifier"),
    (model_log, "LogisticRegression"),
    (model_forest, "RandomForestClassifier"),
]
for fitted_model, label in fitted_searches:
    model_metrics(fitted_model, X_test, y_test, label)

# My Analysis

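Here is the ROC curve on the test set. Every evaluation call saves to the same file, so the image below shows the last model evaluated (the random forest):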

![ROC Curve](plot/ROC_curve.png)