In [1]:
# Attach Google Drive to this Colab runtime; its contents appear under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install pycaret[full]
!pip install markupsafe==2.0.1

In [2]:
# NOTE(review): jinja2 is not referenced in the visible cells — presumably an import
# smoke test after the markupsafe pin; confirm before removing.
import jinja2
import pandas as pd
from pycaret.utils import enable_colab
# Switch pycaret into Colab mode (the call reports "Colab mode enabled.").
enable_colab()

Colab mode enabled.


In [None]:
!pip install numba==0.53
!pip install tornado==4.5

In [29]:
import joblib
import pickle

In [4]:
# NOTE(review): three wildcard imports in a row. The pycaret task modules presumably
# expose overlapping names (setup, create_model, ...), in which case the last import
# wins. The Classification section re-imports pycaret.classification right before
# calling setup().
from pycaret.classification import *
from pycaret.clustering import *
from pycaret.anomaly import *

# Data

In [15]:
import pyarrow.parquet as pq  # `pq` is not referenced in this cell; kept for any later cell that uses it

# Seed for the per-class subsampling below so that Restart & Run All draws the same rows.
# Same value as the session_id passed to setup().
SAMPLE_SEED = 123

model_input = pd.read_parquet('gm_1.parquet')

# Classes 0, 1, 2 and 5 are subsampled at 20% per class, classes 4, 6 and 8 at 60%
# (presumably to even out the class counts — confirm). Rows with any other `field`
# value are not carried forward.
model_input_sampled = (
    model_input[model_input['field'].isin([1, 0, 2, 5])]
    .groupby('field')
    .sample(frac=0.2, random_state=SAMPLE_SEED)
)
model_input_non_sampled = (
    model_input[model_input['field'].isin([8, 6, 4])]
    .groupby('field')
    .sample(frac=0.6, random_state=SAMPLE_SEED)
)

# Build the combined frame once and reuse it for the class summary and the model input.
model_input_combined = pd.concat(
    [model_input_sampled, model_input_non_sampled], ignore_index=True
)
print(model_input_combined['field'].value_counts())

# Drop the coordinate columns before the frame is handed to setup().
model_input_sampled = model_input_combined.drop(['x_coord', 'y_coord'], axis=1)

1.0    8653
0.0    7460
8.0    5719
6.0    4694
4.0    4633
2.0    3347
5.0    2849
Name: field, dtype: int64


# Classification

In [16]:
# Re-import the classification API so that setup() below resolves to pycaret.classification.
from pycaret.classification import *

# Initialise the experiment on the sampled frame: `field` is the label column and
# session_id is the seed handed to pycaret.
exp_mclf00 = setup(
    data=model_input_sampled,
    target='field',
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,field
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(37355, 35)"
5,Missing Values,False
6,Numeric Features,34
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='field',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_strate...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('cluste

In [17]:
# Column names of the feature matrix pycaret holds after setup().
pycaret_X = get_config('X')
pycaret_X.columns.tolist()

INFO:logs:Initializing get_config()
INFO:logs:get_config(variable=X)
INFO:logs:Global variable: X returned as        blue_S1_n  green_S1_n  red_S1_n  nir_S1_n  swir_1_S1_n  swir_2_S1_n  \
0         0.0503      0.0630    0.0838    0.1231       0.2490       0.2143   
1         0.0784      0.1238    0.1949    0.3132       0.4078       0.3454   
2         0.0731      0.0983    0.1230    0.2410       0.2391       0.1786   
3         0.0755      0.1008    0.1266    0.2085       0.2937       0.2362   
4         0.0864      0.1130    0.1371    0.2585       0.2774       0.2082   
...          ...         ...       ...       ...          ...          ...   
37350     0.1196      0.1739    0.2434    0.3547       0.4021       0.3467   
37351     0.0323      0.0584    0.0902    0.1696       0.2182       0.1794   
37352     0.1025      0.1524    0.1438    0.1685       0.2028       0.1816   
37353     0.0821      0.1303    0.1627    0.3181       0.3016       0.2458   
37354     0.0560      0.0940    

['blue_S1_n',
 'green_S1_n',
 'red_S1_n',
 'nir_S1_n',
 'swir_1_S1_n',
 'swir_2_S1_n',
 'red_edge_1_S1_n',
 'red_edge_2_S1_n',
 'red_edge_3_S1_n',
 'sdev_S1_n',
 'bcdev_S1_n',
 'edev_S1_n',
 'NDVI_S1_n',
 'SAVI_S1_n',
 'NDMI_S1_n',
 'blue_S2_n',
 'green_S2_n',
 'red_S2_n',
 'nir_S2_n',
 'swir_1_S2_n',
 'swir_2_S2_n',
 'red_edge_1_S2_n',
 'sdev_S2_n',
 'bcdev_S2_n',
 'edev_S2_n',
 'NDVI_S2_n',
 'NDMI_S2_n']

In [18]:
# Feature columns reported by get_config('X'), kept as an explicit list and grouped
# by their S1 / S2 suffix. The order matches the get_config output.
_features_s1 = [
    'blue_S1_n', 'green_S1_n', 'red_S1_n', 'nir_S1_n',
    'swir_1_S1_n', 'swir_2_S1_n',
    'red_edge_1_S1_n', 'red_edge_2_S1_n', 'red_edge_3_S1_n',
    'sdev_S1_n', 'bcdev_S1_n', 'edev_S1_n',
    'NDVI_S1_n', 'SAVI_S1_n', 'NDMI_S1_n',
]
_features_s2 = [
    'blue_S2_n', 'green_S2_n', 'red_S2_n', 'nir_S2_n',
    'swir_1_S2_n', 'swir_2_S2_n',
    'red_edge_1_S2_n',
    'sdev_S2_n', 'bcdev_S2_n', 'edev_S2_n',
    'NDVI_S2_n', 'NDMI_S2_n',
]
features_pycaret = _features_s1 + _features_s2

In [19]:
# Every column of the sampled frame except the target. Dropping `field` by name
# avoids relying on it being the first column.
model_input_sampled.columns.drop('field').tolist()

['blue_S1_n',
 'green_S1_n',
 'red_S1_n',
 'nir_S1_n',
 'swir_1_S1_n',
 'swir_2_S1_n',
 'red_edge_1_S1_n',
 'red_edge_2_S1_n',
 'red_edge_3_S1_n',
 'sdev_S1_n',
 'bcdev_S1_n',
 'edev_S1_n',
 'NDVI_S1_n',
 'LAI_S1_n',
 'EVI_S1_n',
 'SAVI_S1_n',
 'NDMI_S1_n',
 'blue_S2_n',
 'green_S2_n',
 'red_S2_n',
 'nir_S2_n',
 'swir_1_S2_n',
 'swir_2_S2_n',
 'red_edge_1_S2_n',
 'red_edge_2_S2_n',
 'red_edge_3_S2_n',
 'sdev_S2_n',
 'bcdev_S2_n',
 'edev_S2_n',
 'NDVI_S2_n',
 'LAI_S2_n',
 'EVI_S2_n',
 'SAVI_S2_n',
 'NDMI_S2_n']

In [20]:
# Columns of the sampled frame that are absent from features_pycaret. The target is
# excluded by name rather than by assuming it is the first column.
[item for item in model_input_sampled.columns.drop('field') if item not in features_pycaret]

['LAI_S1_n',
 'EVI_S1_n',
 'red_edge_2_S2_n',
 'red_edge_3_S2_n',
 'LAI_S2_n',
 'EVI_S2_n',
 'SAVI_S2_n']

In [22]:
# Compare only the random forest ('rf') and decision tree ('dt') candidates and keep
# the estimator compare_models returns.
candidate_models = ['rf', 'dt']
best = compare_models(include=candidate_models)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9297,0.9942,0.9277,0.9297,0.9294,0.9159,0.916,10.675
dt,Decision Tree Classifier,0.8269,0.8958,0.8219,0.8273,0.8269,0.7931,0.7932,0.884


INFO:logs:create_model_container: 2
INFO:logs:master_model_container: 2
INFO:logs:display_container: 3
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


In [23]:
# Show the selected estimator and its hyperparameters.
print(best)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


In [28]:
# Open pycaret's interactive evaluation view for the selected estimator
# (it renders as a widget with a "Plot Type" selector).
evaluate_model(best)

INFO:logs:Initializing evaluate_model()
INFO:logs:evaluate_model(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False), fold=None, fit_kwargs=None, plot_kwargs=None, feature_name=None, groups=None, use_train_data=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [30]:
# Persist the selected estimator to disk. `best` prints as a bare
# RandomForestClassifier, so the file holds the estimator only.
model_path = 'rf_gm_1.joblib'
joblib.dump(best, model_path)

['rf_gm_1.joblib']

In [39]:
# Reload the persisted estimator and run it over the sampled frame as a round-trip check.
# NOTE: joblib.load unpickles the file, so only load artifacts this notebook produced.
# NOTE(review): model_input_sampled is the frame that was given to setup(), so these
# predictions include rows the model was trained on — this is not a held-out evaluation.
rf_gm_1 = joblib.load('rf_gm_1.joblib')
# Pass the NumPy array directly; converting every row to nested Python lists first is unnecessary.
rf_gm_1.predict(model_input_sampled[features_pycaret].to_numpy())

array([0., 0., 0., ..., 8., 8., 8.], dtype=float32)