In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset from a CSV located relative to the working directory.
DATA_PATH = 'Data_for_UCI_named.csv'
uci_data = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows of the freshly loaded frame.
uci_data.head(5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
# Remove the numeric 'stab' column; only the categorical 'stabf' column is
# used as the target further down.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError.
uci_data = uci_data.drop(columns='stab', errors='ignore')

In [5]:
# Confirm 'stab' is gone by previewing the first five rows again.
uci_data.head(5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [6]:
# Target is the 'stabf' label column; every other column is a feature.
y = uci_data['stabf']
x = uci_data.drop(columns=['stabf'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test statistics leak into training.
scaler = StandardScaler()

# Keep the original row index on the scaled frames. Without it they get a
# fresh 0..n-1 RangeIndex while y_train / y_test keep the shuffled source
# index, so any index-aligned pandas operation between them (concat, boolean
# masks, comparisons) would silently pair the wrong rows.
scaled_train = pd.DataFrame(
    scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index
)
scaled_test = pd.DataFrame(
    scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)

In [8]:
# Baseline: random forest with default hyper-parameters (seeded), scored on
# the held-out split; accuracy is shown rounded to 4 decimals.
RFC = RandomForestClassifier(random_state=1).fit(scaled_train, y_train)
pred_RFC = RFC.predict(scaled_test)
rfc_accuracy = accuracy_score(y_test, pred_RFC)
np.round(rfc_accuracy, 4)

0.929

In [9]:
# Recent XGBoost releases refuse string class labels and expect integers
# 0..n_classes-1, so encode the target for fitting and translate predictions
# back to the original labels before scoring. Integer labels are also accepted
# by older releases, so this works on both.
xgb_label_encoder = LabelEncoder().fit(y_train)

XGB = XGBClassifier(random_state=1, learning_rate=0.1, max_depth=3)
XGB.fit(scaled_train, xgb_label_encoder.transform(y_train))

# pred_XGB holds the original string labels, like the other models' predictions.
pred_XGB = xgb_label_encoder.inverse_transform(XGB.predict(scaled_test))
np.round(accuracy_score(y_test, pred_XGB), 4)

0.9195

In [10]:
# Baseline: LightGBM with default hyper-parameters (seeded), scored on the
# held-out split; accuracy is shown rounded to 4 decimals.
LGBM = LGBMClassifier(random_state=1).fit(scaled_train, y_train)
pred_LGBM = LGBM.predict(scaled_test)
lgbm_accuracy = accuracy_score(y_test, pred_LGBM)
np.round(lgbm_accuracy, 4)

0.9375

In [11]:
# Candidate values for the ExtraTrees randomized search.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where any
# candidate using it fails to fit. NOTE: removing it shrinks the grid, so the
# candidates sampled for a given random_state may differ from earlier runs.
max_features = ['sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features,
}

# 10 sampled candidates x 5-fold CV = 50 fits, ranked by accuracy.
ETC = ExtraTreesClassifier(random_state=1)
RCV = RandomizedSearchCV(
    estimator=ETC,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,  # negative values are rejected by recent scikit-learn validation
    random_state=1,
)
RCV.fit(scaled_train, y_train)
RCV.best_params_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.9min finished


{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [12]:
# Reference point: ExtraTrees with default hyper-parameters (seeded), scored
# on the held-out split; accuracy is shown rounded to 4 decimals.
ETC_before = ExtraTreesClassifier(random_state=1).fit(scaled_train, y_train)
pred_ETC_before = ETC_before.predict(scaled_test)
etc_before_accuracy = accuracy_score(y_test, pred_ETC_before)
np.round(etc_before_accuracy, 4)

0.928

In [13]:
# ExtraTrees refitted with the hyper-parameters chosen by the randomized
# search. They are read from RCV.best_params_ rather than retyped by hand so
# this cell cannot drift from what the search actually selected.
ETC_after = ExtraTreesClassifier(random_state=1, **RCV.best_params_)
ETC_after.fit(scaled_train, y_train)
pred_ETC_after = ETC_after.predict(scaled_test)
np.round(accuracy_score(y_test, pred_ETC_after), 4)

0.927

In [18]:
# Pair each importance score with its feature name. The names come from
# scaled_train, the exact frame ETC_after was fitted on, so the order is
# guaranteed to match feature_importances_ regardless of how uci_data has
# been modified or how its columns are ordered.
cols = list(scaled_train.columns)
feat_importance = pd.DataFrame({
    'Column_name': cols,
    'Importance_score': ETC_after.feature_importances_,
})
feat_importance

Unnamed: 0,Column_name,Importance_score
0,tau1,0.13724
1,tau2,0.140508
2,tau3,0.13468
3,tau4,0.135417
4,p1,0.003683
5,p2,0.005337
6,p3,0.005429
7,p4,0.004962
8,g1,0.102562
9,g2,0.107578


In [19]:
# Rank features from most to least important (original row labels are kept).
feat_importance = feat_importance.sort_values(by='Importance_score', ascending=False)
feat_importance

Unnamed: 0,Column_name,Importance_score
1,tau2,0.140508
0,tau1,0.13724
3,tau4,0.135417
2,tau3,0.13468
10,g3,0.113063
11,g4,0.109541
9,g2,0.107578
8,g1,0.102562
6,p3,0.005429
5,p2,0.005337
