In [3]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
np.random.seed(23)
pd.set_option('display.max_columns', None)

In [4]:
mobile_modelling = pd.read_csv('../Data/Data_modelling/mobile_modelling.csv')

X = mobile_modelling.iloc[:, 0:-1]
y = mobile_modelling.iloc[:, -1]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

mobile_df = pd.concat([X_train, y_train], axis=1)

# more than 0.02
columns_to_drop_1 = ['touch_screen', 'blue', 'dual_sim', 'four_g', 'wifi', 'three_g']
X_train_r = X_train.drop(columns=columns_to_drop_1)
X_val_r = X_val.drop(columns=columns_to_drop_1)


# 1. Stacking

In [67]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# used models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

model1 = DecisionTreeClassifier(random_state=1)
model2 = RandomForestClassifier()
model3 = SVC(random_state=1, max_iter=1000, probability=True)
model33 = SVC(random_state=1, max_iter=1000)
model4 = KNeighborsClassifier()
estimators=[('DecisionTree', model1), ('RandomForest', model2), ('SVC', model33)]
estimators1=[('RandomForest', model2), ('KNN', model4), ('SVC', model3)]
estimators2=[('DecisionTree', model1), ('RandomForest', model2),('SVC', model3), ('KNN', model4)]

### Voting Classifiers

Taking into consideration our first modelling and the results we obtained we plan to take into consideration 4 simple models that performed well on our data:
- SVC
- Decision Tree
- Random Forest Classifier
- KNN 

We will use soft voting as well as hard voting. We will also try to find the best weight combinations for chosen models combination since we think that focusing on the model with the best performance may be beneficial. We will check the results for data without removed columns and with removed columns that have feature importance greater than 0,02.

#### Hard voting

Let's see how it works for 3 models

In [24]:
model_hard = VotingClassifier(estimators=estimators, voting='hard')
model_hard.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_hardr = VotingClassifier(estimators=estimators, voting='hard')
model_hardr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.8945578231292517


Now for 4 models

In [25]:
model_hard = VotingClassifier(estimators=estimators2, voting='hard')
model_hard.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_hardr = VotingClassifier(estimators=estimators2, voting='hard')
model_hardr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9149659863945578
accuracy for dataset without chosen columns:  0.9013605442176871


#### Soft voting

In [45]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft')
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9285714285714286
accuracy for dataset without chosen columns:  0.9285714285714286


In [46]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft')
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9115646258503401
accuracy for dataset without chosen columns:  0.9319727891156463


##### Weights

In [47]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft', weights=[0.2, 0.2, 0.6])
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9319727891156463
accuracy for dataset without chosen columns:  0.9251700680272109


In [49]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft', weights=[0.1, 0.1, 0.80])
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9421768707482994
accuracy for dataset without chosen columns:  0.9251700680272109


In [50]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft', weights=[0.1, 0.3, 0.3, 0.3])
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators2, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9285714285714286
accuracy for dataset without chosen columns:  0.9081632653061225


In [51]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft', weights=[0.1, 0.2, 0.5, 0.4])
model_soft.fit(X_train,y_train)
y_hat = model_soft.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators2, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_softr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9251700680272109
accuracy for dataset without chosen columns:  0.9115646258503401


While testing different combinations of weights for soft voting we can observe the best result for 

estimators1=[('RandomForest', model2),('KNN', model4), ('SVC', model3)]

with weights=[0.1, 0.1, 0.80].

### Stacking

In [69]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [70]:
clf = StackingClassifier(estimators=estimators1, final_estimator=LogisticRegression())
clf.fit(X_train, y_train).score(X_test, y_test)

0.9365079365079365

In [72]:
clf = StackingClassifier(estimators=estimators, final_estimator=KNeighborsClassifier())
clf.fit(X_train, y_train).score(X_test, y_test)

0.9126984126984127

In [73]:
clf = StackingClassifier(estimators=estimators2, final_estimator=KNeighborsClassifier())
clf.fit(X_train, y_train).score(X_test, y_test)

0.9444444444444444

In [74]:
clf = StackingClassifier(estimators=estimators2, final_estimator=RandomForestClassifier())
clf.fit(X_train, y_train).score(X_test, y_test)

0.9523809523809523

In [75]:
clf = StackingClassifier(estimators=estimators2, final_estimator=SVC())
clf.fit(X_train, y_train).score(X_test, y_test)

0.9444444444444444

# 2. Parameters tuning

### GridSearchCV

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [14]:
weights=['uniform', 'distance']
n_neighbors=[1, 3, 5]
param_grid = dict(weights=weights, n_neighbors=n_neighbors)

knn= KNeighborsClassifier()
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv = 10, scoring=['accuracy', 'roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
grid_result = grid.fit(X_train, y_train)
print("KNN - Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

KNN - Best: 0.911224 using {'n_neighbors': 5, 'weights': 'distance'}


In [16]:
kernel=['linear', 'rbf']
C=[0.5, 1, 5, 20]
gamma=[0.5, 1, 5, 10]
param_grid = dict(kernel=kernel, C=C, gamma=gamma)

svm= SVC(probability=True)
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv = 5, scoring=['accuracy', 'roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
grid_result = grid.fit(X_train, y_train)
print("SVM - Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

SVM - Best: 0.970408 using {'C': 1, 'gamma': 0.5, 'kernel': 'linear'}


In [57]:
kernel=['linear', 'rbf']
C=[0.5, 1, 5, 20]
gamma=[0.5, 1, 5, 10]
param_grid = dict(kernel=kernel, C=C, gamma=gamma)

svm= SVC(probability=True)
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv = 5, scoring=['accuracy', 'roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
grid_result = grid.fit(X_train_r, y_train)
print("SVM - not all columns - Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

SVM - not all columns - Best: 0.965306 using {'C': 5, 'gamma': 0.5, 'kernel': 'linear'}


In [61]:
from xgboost import XGBClassifier

max_depth = [3, 4, 5, 7]
n_estimators = [50, 75, 100, 150, 200]
learning_rate = [0.01, 0.05, 0.1, 0.15, 0.2]
param_grid = dict(max_depth=max_depth, n_estimators = n_estimators, learning_rate=learning_rate)

xgb_model = XGBClassifier()
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_result = grid.fit(X_train, y_train)
print("XGBoost - Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


XGBoost - Best: 0.883673 using {'learning_rate': 0.15, 'max_depth': 7, 'n_estimators': 200}


### RandomSearchCV

In [41]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [52]:
weights=['uniform', 'distance']
n_neighbors=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_grid = dict(weights=weights, n_neighbors=n_neighbors)

knn= KNeighborsClassifier()
random = RandomizedSearchCV(estimator=knn, param_distributions=param_grid, cv = 10, scoring=['accuracy','roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
random_result = random.fit(X_train, y_train)
print("KNN - Best: %f using %s" % (random_result.best_score_, random_result.best_params_))

KNN - Best: 0.914286 using {'weights': 'distance', 'n_neighbors': 9}


In [53]:
kernel=['linear', 'rbf']
C=uniform(0.1, 10)
gamma=uniform(0.01, 1.0)
param_grid = dict(kernel=kernel, C=C, gamma=gamma)

svm= SVC(probability=True)
random = RandomizedSearchCV(estimator=svm, param_distributions=param_grid, cv = 10, scoring=['accuracy','roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
random_result = random.fit(X_train, y_train)
print("SVM - Best: %f using %s" % (random_result.best_score_, random_result.best_params_))

SVM - Best: 0.966327 using {'C': 3.8029793634382836, 'gamma': 0.7561077817534814, 'kernel': 'linear'}


to do :
- szukanie hiperparametrow dla knn (done) i random tree (to be done)
- stacking
- autoML - TPOT & H20

# 3. AutoML

### TPOT

In [None]:
#!pip install tpot

In [7]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=5, population_size=30, verbosity=2)
tpot.fit(X_train, y_train)

print(tpot.score(X_val, y_val))

#tpot.export('tpot_best_pipeline.py')

Optimization Progress:   0%|          | 0/180 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9071428571428571

Generation 2 - Current best internal CV score: 0.9071428571428571

Generation 3 - Current best internal CV score: 0.9071428571428571

Generation 4 - Current best internal CV score: 0.9071428571428571

Generation 5 - Current best internal CV score: 0.9081632653061223

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=13, p=2, weights=distance)
0.9285714285714286


### H2O

In [10]:
#!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.1-py2.py3-none-any.whl (265.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.6/265.6 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Installing collected packages: h2o
Successfully installed h2o-3.46.0.1


In [None]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

mobile_df_train = pd.concat([X_train, y_train], axis=1)
mobile_df_val =  pd.concat([X_val, y_val], axis=1)
mobile_df_test = pd.concat([X_test, y_test], axis=1)
train_h2o = h2o.H2OFrame(mobile_df_train)
val_h2o = h2o.H2OFrame(mobile_df_val)
test_h2o = h2o.H2OFrame(mobile_df_test)
train_h2o['price_range'] = train_h2o['price_range'].asfactor()

aml = H2OAutoML(max_models=15, seed=1, sort_metric = 'accuracy')
aml.train(x=X_train.columns.tolist(), y='price_range', training_frame=train_h2o, validation_frame=val_h2o)

lb = aml.leaderboard
lb.head(rows=lb.nrows) 

perf = aml.leader.model_performance(test_data=test_h2o)
m = h2o.get_model("StackedEnsemble_BestOfFamily_1_AutoML_1_20240419_130852")


In [None]:
m = h2o.get_model("StackedEnsemble_BestOfFamily_1_AutoML_1_20240419_130852")

#h2o.shutdown()

In [42]:
print(lb)

model_id                                                   accuracy    mean_per_class_error    logloss      rmse        mse
XRT_1_AutoML_1_20240419_130852                             0.817347               0.182653    0.581563  0.442793  0.196065
DRF_1_AutoML_1_20240419_130852                             0.818367               0.181633    0.572689  0.437783  0.191654
GBM_grid_1_AutoML_1_20240419_130852_model_1                0.837755               0.162245    0.434699  0.36921   0.136316
DeepLearning_1_AutoML_1_20240419_130852                    0.857143               0.142857    0.343905  0.324455  0.105271
GBM_4_AutoML_1_20240419_130852                             0.859184               0.140816    0.366417  0.330534  0.109253
XGBoost_2_AutoML_1_20240419_130852                         0.860204               0.139796    0.339111  0.32338   0.104575
XGBoost_1_AutoML_1_20240419_130852                         0.864286               0.135714    0.353055  0.329554  0.108606
GBM_2_AutoML_1_

# 4. XAI

In [None]:
#!pip install lime
#!pip install pdpbox

In [None]:
import lime
import lime.lime_tabular
#from pdpbox import pdp
import pandas as pd
import numpy as np
from sklearn .inspection import PartialDependenceDisplay

svm = SVC(gamma=0.5, C=1, kernel='linear', probability=True)
svm.fit(X_train, y_train)

explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   mode='classification',
                                                   feature_names=X_train.columns.tolist(),
                                                   class_names=y_train.unique(),
                                                   discretize_continuous=True)

# Przygotowanie pojedynczej obserwacji do analizy
observation = X_train.iloc[3]

# Wyjaśnienie predykcji modelu SVC
predictions = svm.predict_proba([observation])
classifier_fn = lambda x: predictions
explanation = explainer.explain_instance(data_row=observation, predict_fn=svm.predict_proba, num_features=20)

# Wyświetlenie wyjaśnienia LIME
explanation.show_in_notebook()


In [None]:
target_class = 0
feature_index = [0] 
display = PartialDependenceDisplay.from_estimator(svm, X_train, features=feature_index, grid_resolution=50, target=target_class)
display.plot()
plt.show()

In [122]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import shap

svm = SVC(gamma=0.5, C=1, kernel='linear', probability=True)
svm.fit(X_train, y_train)

# Użyj biblioteki SHAP do obliczenia Partial Dependence Shapley Values
explainer = shap.Explainer(svm, X_train)
shap_values = explainer(X_train)

# Wygeneruj wykresy Partial Dependence Plots
shap.plots.partial_dependence(shap_values, features=['RM', 'LSTAT'])
plt.show()


ImportError: Numba needs NumPy 1.22 or greater. Got NumPy 1.21.