In [46]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
np.random.seed(23)
pd.set_option('display.max_columns', None)

In [47]:
mobile_modelling = pd.read_csv('../Data/Data_modelling/mobile_modelling.csv')

X = mobile_modelling.iloc[:, 0:-1]
y = mobile_modelling.iloc[:, -1]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

mobile_df = pd.concat([X_train, y_train], axis=1)

# more than 0.02
columns_to_drop_1 = ['touch_screen', 'blue', 'dual_sim', 'four_g', 'wifi', 'three_g']
X_train_r = X_train.drop(columns=columns_to_drop_1)
X_val_r = X_val.drop(columns=columns_to_drop_1)


# 1. Stacking

Taking into consideration our first modelling and the results we obtained we plan to take into consideration 3 simple models that performed well on our data:
- SVC (for hard voting) / KNN (for soft voting)
- Decision Tree
- Random Forest Classifier

We will use soft voting as well as hard voting. We will also try to find the best weight combinations for chosen models combination since we think that focusing on the model with the best performance may be beneficial. We will check the results for data without removed columns and with removed columns that have feature importance greater than 0,02.

In [48]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# used models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

model1 = DecisionTreeClassifier(random_state=1)
model2 = RandomForestClassifier()
model3 = SVC(random_state=1, max_iter=1000, probability=True)
model33 = SVC(random_state=1, max_iter=1000)
model4 = KNeighborsClassifier()
estimators=[('DecisionTree', model1), ('RandomForest', model2), ('SVC', model33)]
estimators1=[('RandomForest', model2), ('KNN', model4), ('SVC', model3)]
estimators2=[('DecisionTree', model1), ('RandomForest', model2),('SVC', model3), ('KNN', model4)]

#### Hard voting

Let's see how it works for 3 models

In [49]:
model_hard = VotingClassifier(estimators=estimators, voting='hard')
model_hard.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_hardr = VotingClassifier(estimators=estimators, voting='hard')
model_hardr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9115646258503401
accuracy for dataset without chosen columns:  0.9013605442176871


Now for 4 models

In [50]:
model_hard = VotingClassifier(estimators=estimators2, voting='hard')
model_hard.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_hardr = VotingClassifier(estimators=estimators2, voting='hard')
model_hardr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


#### Soft voting

In [51]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft')
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


In [58]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft')
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


##### Weights

In [55]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft', weights=[0.2, 0.2, 0.6])
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


In [56]:
model_soft = VotingClassifier(estimators=estimators1, voting='soft', weights=[0.1, 0.1, 0.80])
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators1, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


In [59]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft', weights=[0.1, 0.3, 0.3, 0.3])
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators2, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


In [60]:
model_soft = VotingClassifier(estimators=estimators2, voting='soft', weights=[0.1, 0.2, 0.5, 0.4])
model_soft.fit(X_train,y_train)
y_hat = model_hard.predict(X_val)
print('accuracy for whole dataset: ', accuracy_score(y_val, y_hat))

model_softr = VotingClassifier(estimators=estimators2, voting='soft')
model_softr.fit(X_train_r,y_train)
y_hat = model_hardr.predict(X_val_r)
print('accuracy for dataset without chosen columns: ', accuracy_score(y_val, y_hat))

accuracy for whole dataset:  0.9047619047619048
accuracy for dataset without chosen columns:  0.9081632653061225


While testing different combinations of weights for soft voting we always observed the same accuracy which is very surprising anda bit counterintuitive.

# 2. Parameters tuning

### GridSearchCV

In [99]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

kernel=['linear', 'rbf']
C=[0.5, 1, 5, 20, 100]
gamma=[0.5, 1, 5, 10, 50]
param_grid = dict(kernel=kernel, C=C, gamma=gamma)

In [100]:
svm= SVC(probability=True)
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv = 3, scoring=['accuracy', 'roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.965307 using {'C': 0.5, 'gamma': 0.5, 'kernel': 'linear'}


### RandomSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

kernel=['linear', 'rbf']
C=uniform(0.1, 10)
gamma=uniform(0.01, 1.0)
param_grid = dict(kernel=kernel, C=C, gamma=gamma)

svm= SVC()
random = RandomizedSearchCV(estimator=svm, param_distributions=param_grid, cv = 5, scoring=['accuracy','roc_auc_ovo_weighted', 'f1_weighted'], n_jobs=-1, refit='accuracy')
random_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# 3. Crossvalidation

In [None]:
import sklearn
from sklearn.model_selection import cross_val_score

svm = SVC()

# w teorii przy grid/random searchu juz sie robi crossvalidation
# mozna wsm dopytac jak to wyglada z ta crossvalidacja

to do :
- szukanie hiperparametrow dla knn i random tree ?
- czy crossvalidation potrzebne jak przy hiperparametrach jest - napisac do Tomaszewskiej
- stacking - dodac z regresja liniowa i naive bayes
-  autoML - TPOT & H20