# MACHINE LEARNING: CLASSIFICATION - MANAGING THE QUALITY METRIC OF GLOBAL ECOLOGICAL FOOTPRINT

In [20]:
#import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report

In [3]:
# load dataset
df = pd.read_csv("Data_for_UCI_named.csv")
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
df.shape

(10000, 14)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [6]:
df.duplicated().sum()

0

In [7]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [9]:
# dropping "stab" column
df.drop("stab", axis=1, inplace=True)

In [10]:
# data split
X = df.drop("stabf", axis=1)
y = df["stabf"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Using ExtraTreesClassifier

In [11]:
extra_tree = make_pipeline(StandardScaler(),
                     ExtraTreesClassifier())
extra_tree.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('extratreesclassifier', ExtraTreesClassifier())])

In [12]:
extra_pred = extra_tree.predict(X_test)

In [45]:
accuracy_score(y_test, extra_pred)

0.929

In [13]:
feat_imp = extra_tree.named_steps['extratreesclassifier'].feature_importances_
feat_imp

array([0.11750447, 0.11821824, 0.11411551, 0.11539105, 0.03902638,
       0.04030931, 0.04086228, 0.03955691, 0.08867098, 0.09470185,
       0.0967531 , 0.09488993])

In [14]:
feat = pd.DataFrame(data = feat_imp, index=X.columns)
feat

Unnamed: 0,0
tau1,0.117504
tau2,0.118218
tau3,0.114116
tau4,0.115391
p1,0.039026
p2,0.040309
p3,0.040862
p4,0.039557
g1,0.088671
g2,0.094702


In [None]:
sorted (zip(feat_imp, X), reverse = True)

In [91]:
print("most important: ", feat.max())
print("\n")
print("least important: ", feat.min())

most important:  0    0.118048
dtype: float64


least important:  0    0.040059
dtype: float64


### Using Random Forest Classifier

In [94]:
random = make_pipeline(StandardScaler(),
                      RandomForestClassifier(random_state=1))
random.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=1))])

In [97]:
round(accuracy_score(y_test, random.predict(X_test)), 4)

0.929

### Using ExtraTreesClassifier

In [100]:
tree = make_pipeline(StandardScaler(),
                     ExtraTreesClassifier(random_state=1))

In [113]:
params_grid = {'extratreesclassifier__n_estimators': [50, 100, 300, 500, 1000],
              'extratreesclassifier__min_samples_leaf': [2, 5, 7],
              'extratreesclassifier__min_samples_split': [2, 4, 6, 8],
              'extratreesclassifier__max_features': ['auto', 'sqrt', 'log2', None]}

In [114]:
random_search = RandomizedSearchCV(estimator = tree,
                                   param_distributions= params_grid,
                                   random_state=1,
                                   cv = 5,
                                   n_iter=10,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   verbose=1)

In [115]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('extratreesclassifier',
                                              ExtraTreesClassifier(random_state=1))]),
                   n_jobs=-1,
                   param_distributions={'extratreesclassifier__max_features': ['auto',
                                                                               'sqrt',
                                                                               'log2',
                                                                               None],
                                        'extratreesclassifier__min_samples_leaf': [2,
                                                                                   5,
                                                                                   7],
                                        'extratreescl

In [116]:
#checking for the best parameter for the model
random_search.best_params_

{'extratreesclassifier__n_estimators': 1000,
 'extratreesclassifier__min_samples_split': 6,
 'extratreesclassifier__min_samples_leaf': 2,
 'extratreesclassifier__max_features': None}

### F1 score

In [119]:
Precision = (255/ (255+1380)) 
Recall =  (255 /(255+45)) 
F1_Score = 2 * (Precision*Recall)/(Precision + Recall)
print(round(F1_Score,4))

0.2636


### Using XGBOOSTER

In [32]:
xgb = make_pipeline(StandardScaler(),
                   XGBClassifier(random_state=1, eval_metric='mlogloss'))
xgb.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='mlogloss', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=8, num_parallel_tree=1, predictor='auto',
                               random_state=1, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                    

In [41]:
round(accuracy_score(y_test, xgb.predict(X_test)), 4)

0.9455

### ExtraTreesClassifier model with RandomizedSearchCV

In [43]:
#experimenting with this generated parameter to test the model's performance
best_tree = make_pipeline(StandardScaler(),
                          ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, 
                                 min_samples_leaf=8, max_features=None))
best_tree.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('extratreesclassifier',
                 ExtraTreesClassifier(max_features=None, min_samples_leaf=8,
                                      n_estimators=1000))])

In [44]:
accuracy_score(y_test, best_tree.predict(X_test))

0.9265

### Using lgbm classifier

In [46]:
lgb = make_pipeline(StandardScaler(),
                    lgbm.LGBMClassifier(random_state=1))
lgb.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmclassifier', LGBMClassifier(random_state=1))])

In [47]:
round(accuracy_score(y_test, lgb.predict(X_test)), 4)

0.9395