## Machine Learning: Classification - Managing the Quality Metric of Global Ecological Footprint
By: Volker Felvic Katche Tachin

20th December 2022

Note:
- Some answers may differ from the ones chosen when the quiz was taken, because these new answers were worked out after failing some questions on the quiz.
- Also, many scores turned out to differ from those in the quiz answers, even after multiple iterations.


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, accuracy_score, roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.datasets import load_iris
import xgboost as xgb
import lightgbm as lgb

In [5]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('Data_for_UCI_named.csv')

In [6]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [7]:
# Preview the first two rows to get a feel for the values in each column.
df.head(n=2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable


In [8]:
# Remove the 'stab' and 'p1' columns so they are not used as features.
# NOTE(review): 'stab' looks like the numeric counterpart of the 'stabf' label
# (keeping it would leak the target) — confirm; the reason for dropping 'p1'
# is not stated anywhere in this notebook.
df = df.drop(columns=['stab', 'p1'])

In [9]:
# Split the frame by position: every column but the last is a feature,
# and the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Confirm the row/column counts of the training and test feature matrices.
(X_train.shape, X_test.shape)

((8000, 11), (2000, 11))

### LogisticRegression

In [60]:
# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [62]:
# Predict class labels for the held-out test rows.
preds = logreg.predict(X_test)

In [63]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

0.803

In [64]:
# Inspect the fitted coefficients (one value per feature column of X_train).
logreg.coef_

array([[ 0.31267515,  0.32804247,  0.32165156,  0.32963245,  0.12654021,
         0.03772428, -0.10682741,  2.68734051,  2.97942905,  3.08423991,
         2.88004386]])

### RandomForestClassifier

In [112]:
# Random forest baseline. A random forest is built with randomness (bootstrap
# samples and feature sub-sampling), so an unseeded model gives a slightly
# different accuracy on every run. The seed is fixed — to the same value used
# for the train/test split — so the reported score is repeatable.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train, y_train)
accuracy_score(y_test, rfc.predict(X_test))

0.921

### XGBClassifier

In [113]:
# Encode the string class labels as integers for XGBClassifier.
# The encoder is fitted on the full, untouched target `y`, and the train/test
# labels are re-derived from `y` through each split's row index. This keeps the
# cell idempotent: re-running it never feeds already-encoded integers back into
# the encoder, so `le.classes_` always holds the original class names.
le = LabelEncoder()
le.fit(y)
y_train = le.transform(y.loc[X_train.index])
y_test = le.transform(y.loc[X_test.index])

In [114]:
# Gradient-boosted trees from XGBoost with default hyper-parameters,
# trained on the training split and scored on the test split.
xg = xgb.XGBClassifier()
xg.fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=xg.predict(X_test))

0.943

### LGBMClassifier

In [12]:
# LightGBM classifier with default hyper-parameters, fitted on the training split.
model = lgb.LGBMClassifier()
model.fit(X=X_train, y=y_train)

In [13]:
# Test-set accuracy of the LightGBM model.
accuracy_score(y_true=y_test, y_pred=model.predict(X_test))

0.9395

### ExtraTreesClassifier

In [None]:
ext = ExtraTreesClassifier()
ext.