In [1]:
import pandas as pd

In [5]:
!pip install pyarrow
!pip install fastparquet

Collecting pyarrow
  Downloading pyarrow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 5.6 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-9.0.0
Collecting fastparquet
  Downloading fastparquet-0.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.2 MB/s eta 0:00:01
Collecting cramjam>=2.3.0
  Downloading cramjam-2.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 110.5 MB/s eta 0:00:01
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.5.0 fastparquet-0.8.3


# Note: perennial crops are set aside

# Custom modeling: CV or nested cross-validation

In [2]:
%matplotlib inline

import json
import os
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from odc.io.cgroups import get_cpu_quota
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, balanced_accuracy_score, f1_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    cross_validate,
)
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Number of CPUs this kernel may use, taken from the cgroup CPU quota.
# NOTE(review): `get_cpu_quota()` is understood to return None when no quota is set
# (or on error) — confirm against the installed odc version. In that case fall back
# to the host CPU count. The result is clamped to >= 1 so a fractional quota
# (e.g. 0.4) cannot round down to zero workers.
cpu_quota = get_cpu_quota()
ncpus = max(1, round(cpu_quota)) if cpu_quota else (os.cpu_count() or 1)
print("ncpus = " + str(ncpus))

ncpus = 4


##### base experiment

In [18]:
# Load the model input table and sub-sample it per `field` value:
#   * 20 % of the rows of each of the fields 0, 1, 2 and 5
#   * 60 % of the rows of each of the fields 4, 6 and 8
# NOTE(review): the two groups presumably separate frequent from rare classes — confirm.
model_input = pd.read_parquet('./data/gm_1.parquet')

# Fixed seed so that the same rows are drawn on every "Restart & Run All".
sample_seed = 13

model_input_sampled = (
    model_input[model_input['field'].isin([1, 0, 2, 5])]
    .groupby('field')
    .sample(frac=0.2, random_state=sample_seed)
)
# Despite its name this subset is also sampled, only at a higher fraction.
model_input_non_sampled = (
    model_input[model_input['field'].isin([8, 6, 4])]
    .groupby('field')
    .sample(frac=0.6, random_state=sample_seed)
)

# Combine both subsets once and reuse the result for reporting and for modelling.
model_input_sampled = pd.concat(
    [model_input_sampled, model_input_non_sampled], ignore_index=True
)
print(model_input_sampled['field'].value_counts())
print(model_input_sampled.shape)

1.0    22
0.0    19
2.0     8
5.0     7
8.0     5
4.0     4
6.0     4
Name: field, dtype: int64
(69, 37)


In [19]:
# Feature matrix: every column except the label column `field`.
X = model_input_sampled.drop(columns='field').to_numpy()
# Label vector: the `field` column as a 1-D array.
y = model_input_sampled['field'].to_numpy()

In [20]:
# Candidate algorithms as (name, estimator, parameter grid) tuples.
# Grid keys carry the "model__" prefix because each estimator is tuned as the
# "model" step of a Pipeline.
models = []


# --- Random forest ---------------------------------------------------------
model_name = "RandomForest"

rf_param_grid = dict(
    model__class_weight=["balanced", None],
    model__max_features=["sqrt", "log2", None],
    model__n_estimators=[200, 300, 400],
    model__criterion=["gini", "entropy"],
)

models += [(model_name, RandomForestClassifier(n_jobs=1), rf_param_grid)]


# --- AdaBoost over decision trees of varying depth -------------------------
model_name = "AdaBoostClassifier"

# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
# keep the key in sync with the installed version.
ab_param_grid = dict(
    model__base_estimator=[
        DecisionTreeClassifier(max_depth=depth) for depth in (1, 3, 10)
    ],
    model__n_estimators=[10, 100, 1000],
    model__learning_rate=[0.01, 0.1, 1],
)

models += [(model_name, AdaBoostClassifier(), ab_param_grid)]

In [22]:
# Nested cross-validation.
# For every candidate in `models`, the data is split into stratified outer folds.
# On each outer training set a grid search (stratified inner folds, macro F1) picks
# the hyper-parameters, and the refitted best estimator is scored on the held-out
# outer test fold.
#
# Outputs:
#   results[name][f"split_{i}"]             -> {"best_estimator", "f1_macro_score"}
#   outer_cv_test_pairs[name][f"split_{i}"] -> (y_test, y_pred)
#   pipelines[name]                         -> the Pipeline that was tuned
results = {}
outer_cv_test_pairs = {}
pipelines = {}

# Only run a single trial for each algorithm, so set a single seed to use for selecting folds
cv_seed = 13
# Seed applied to every estimator that exposes `random_state`
model_seed = 32

# Set number of splits to do
inner_cv_splits = 3
outer_cv_splits = 3

# CPUs held back from the inner grid search. The outer loop below is a plain
# sequential Python loop, so this value only reserves CPUs.
n_jobs_outer = 3
# Number of jobs to pass to the inner cross validation loop. Clamped to at least one
# worker: with ncpus <= n_jobs_outer the difference would be 0 (rejected by joblib)
# or negative (interpreted as "all CPUs but k").
n_jobs_inner = max(1, ncpus - n_jobs_outer)

for name, model, p_grid in models:
    print(f"Running {name}")

    # Make model fitting reproducible; skip estimators without a `random_state` parameter.
    if "random_state" in model.get_params():
        model.set_params(random_state=model_seed)

    # Create the pipeline method to leverage
    pipeline = Pipeline(
            steps=[
                ("model", model),
            ]
        )

    pipelines[name] = pipeline

    # Create the outer_cv for each model so that the same data is fitted
    outer_cv = StratifiedKFold(
        n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed
    )

    # Create dictionary to store testing arrays for each model
    model_cv_test_pairs = {}
    model_best_estimators = {}

    # Loop over the outer split
    for outer_split_number, (train_index, test_index) in enumerate(
        outer_cv.split(X, y)
    ):
        print(f"running Outer Split {outer_split_number}")

        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # Create inner cv for each outer cv
        inner_cv = StratifiedKFold(
            n_splits=inner_cv_splits, shuffle=True, random_state=cv_seed
        )

        # Create grid search
        clf = GridSearchCV(
            pipeline,
            param_grid=p_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=n_jobs_inner,
        )

        print("    fitting inner CV loop")
        # Fit to training data
        clf.fit(X_train, y_train)

        # Calculate prediction with the estimator refitted on the whole outer training set
        best_model = clf.best_estimator_
        print("performing prediction")
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        test_f1_macro = f1_score(y_test, y_pred, average="macro")

        # Store the results
        model_best_estimators[f"split_{outer_split_number}"] = {
            "best_estimator": best_model,
            "f1_macro_score": test_f1_macro,
        }

        # Store the true and predicted arrays
        model_cv_test_pairs[f"split_{outer_split_number}"] = (y_test, y_pred)

    # Capture results out
    outer_cv_test_pairs[name] = model_cv_test_pairs
    results[name] = model_best_estimators

Running RandomForest
running Outer Split 0
    fitting inner CV loop




performing prediction
running Outer Split 1
    fitting inner CV loop
performing prediction
running Outer Split 2
    fitting inner CV loop




performing prediction
Running AdaBoostClassifier
running Outer Split 0
    fitting inner CV loop




performing prediction
running Outer Split 1
    fitting inner CV loop
performing prediction
running Outer Split 2
    fitting inner CV loop




performing prediction


In [None]:
# Select the final hyper-parameters for the preferred model — the first entry of
# `models`, i.e. the random forest — by grid-searching on all of X, y with the
# outer cross-validation folds.
prefered_model = models[0]
outer_cv = StratifiedKFold(n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed)

metric = "f1_macro"
name, model, p_grid = prefered_model

# Instantiate a GridSearchCV using the outer cross-validation folds.
# The splitter itself is passed instead of `outer_cv.split(X, y)`: that call returns
# a one-shot generator, which is exhausted after the first fit. The folds are the
# same, because the splitter has a fixed seed and receives the same X, y.
clf = GridSearchCV(
    pipelines[name],
    p_grid,
    scoring=metric,
    verbose=1,
    cv=outer_cv,
    n_jobs=ncpus,
)

# Fit the gridsearch on outer cross-validation folds
clf.fit(X, y)

print("The most accurate combination of tested parameters is: ")
pprint(clf.best_params_)
print("\n")
print("The " + metric + " score using these parameters is: ")
print(round(clf.best_score_, 2))

In [None]:
# Work out which feature columns the final model is trained on.

# Feature names in the same order as the columns of X (every column except `field`).
columns_to_use = model_input_sampled.drop(columns="field").columns.tolist()

# Correlated-feature removal applies only if the tuned pipeline actually contains
# such a step; read this from the pipeline rather than from a separate flag.
best_steps = clf.best_estimator_.named_steps
remove_correlated_features = "drop_corr_features" in best_steps

if remove_correlated_features:
    # NOTE(review): assumes the step exposes the dropped column names as `to_drop` — confirm.
    removed_cols = best_steps["drop_corr_features"].to_drop
    remaining_cols = [col for col in columns_to_use if col not in removed_cols]
else:
    remaining_cols = columns_to_use

remaining_cols

In [None]:
# Refit the tuned model on all data and export it together with its feature list.

# Apply the correlated-feature filter only when the tuned pipeline contains that
# step; otherwise the model is trained on X unchanged.
best_pipeline = clf.best_estimator_
if "drop_corr_features" in best_pipeline.named_steps:
    X_transformed = best_pipeline["drop_corr_features"].transform(X)
else:
    X_transformed = X

new_model = best_pipeline["model"]
new_model.fit(X_transformed, y)

# Create results directory if it doesn't exist
os.makedirs("results", exist_ok=True)

# NOTE(review): `experiment_name` is not defined in any cell visible here; it must be
# set (e.g. in a configuration cell) before this cell runs.
# Export the final model for use in following notebooks
dump(new_model, f"results/{experiment_name}_{name}.joblib")

# Export the columns to use in the final model
with open(
    f"results/{experiment_name}_{name}_features.json", "w", encoding="utf-8"
) as f:
    json.dump({"features": remaining_cols}, f, ensure_ascii=False, indent=4)