# Create an ensemble by using a soft voting classifier

In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
import config.ConnectionConfig as cc

In [2]:
# Load the previously trained base models from disk.
# NOTE(security): joblib.load unpickles arbitrary Python objects, so these
# artifacts must come from a trusted source.
lr_model, rf_model, svm_model, nn_model = (
    joblib.load(artifact)
    for artifact in (
        "../models/tennis_lr_model.pkl",
        "../models/tennis_rf_model.pkl",
        "../models/tennis_svm_model.pkl",
        "../models/tennis_nn_model.pkl",
    )
)
# Feature scaler stored next to the neural-network model.
scaler = joblib.load("../models/tennis_nn_scaler.pkl")

In [3]:
# Project helper from config.ConnectionConfig; its implementation is not
# visible in this notebook — presumably prepares environment variables for
# Spark (TODO confirm).
cc.setupEnvironment()
# Obtain the Spark session used below to read the CSV; the string is the
# name passed to the helper (presumably the Spark application name).
spark = cc.startLocalCluster("Tennis Predictions")
# Last expression of the cell, so the notebook renders the returned session.
spark.getActiveSession()

In [4]:
# Read the training CSV through Spark (first row is the header, column types
# are inferred) and materialise it as a pandas DataFrame on the driver.
data = (
    spark.read
    .csv('../data/final_train_df_spark.csv', header=True, inferSchema=True)
    .toPandas()
)
# Display the frame in the notebook.
data

In [5]:
# Remove the columns that are not fed to the models.
data = data.drop(
    columns=[
        'Surface_Carpet',
        'Surface_Clay',
        'Surface_Grass',
        'Surface_Hard',
        'Total time1',
        'Total time2',
        'Ranking at that time1',
        'Ranking at that time2',
        'round value1',
        'round value2',
    ]
)

In [6]:
# Every column except the last is a feature; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]
# Display both shapes as a sanity check.
(X.shape, y.shape)

In [7]:
# Apply the pre-fitted scaler (transform only, no re-fit) and wrap the
# resulting array in a DataFrame so the feature names are kept.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
# Display the scaled features in the notebook.
X_scaled

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
# Display the four shapes as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [9]:
# Soft-voting ensemble: the predicted class is the argmax of the summed
# class probabilities, so every member must implement predict_proba.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', lr_model),
        # ('rf', rf_model),  # random forest currently left out of the vote
        ('nn', nn_model),
        ('svm', svm_model),
    ],
    voting='soft',
)

In [10]:
# Train the ensemble. VotingClassifier.fit fits *clones* of the estimators it
# was given, so each member is re-trained on X_train here; the objects passed
# to the constructor are not modified.
voting_clf.fit(X_train, y_train)

In [11]:
# Score the soft-voting ensemble on the held-out split.
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of the soft voting classifier:", accuracy)

In [12]:
# Meta-learner for the stacking ensemble, configured with the result of an
# earlier hyper-parameter search:
# Best params: {'mlp__validation_fraction': 0.2, 'mlp__solver': 'sgd', 'mlp__random_state': 42, 'mlp__max_iter': 300, 'mlp__learning_rate_init': 0.001, 'mlp__learning_rate': 'invscaling', 'mlp__hidden_layer_sizes': (100, 50, 100), 'mlp__epsilon': 1e-08, 'mlp__alpha': 0.01, 'mlp__activation': 'identity'}
final_estimator = MLPClassifier(
    hidden_layer_sizes=(100, 50, 100),
    activation='identity',
    validation_fraction=0.2,  # only takes effect when early_stopping=True
    solver='sgd',
    random_state=42,
    max_iter=300,
    learning_rate_init=0.001,
    learning_rate='invscaling',
    epsilon=1e-08,  # only used by the 'adam' solver; kept to mirror the search result
    alpha=0.01,
)
# Deliberately NOT fitted here: StackingClassifier clones `final_estimator`
# and trains the clone on the base estimators' predictions, so fitting this
# instance on the raw features would be discarded work.
# Display the configured (unfitted) estimator in the notebook.
final_estimator

In [13]:
# Stack the MLP meta-learner on top of the voting ensemble, which is the
# only base estimator.
stacking_clf = StackingClassifier(
    estimators=[('voting', voting_clf)],
    final_estimator=final_estimator,
)

In [None]:
# Train the stacking ensemble. StackingClassifier.fit works on clones: the
# base estimator is fitted on the full training data, and the final estimator
# is fitted on the base estimator's cross-validated predictions.
stacking_clf.fit(X_train, y_train)

In [None]:
# Score the stacking ensemble on the held-out split (rebinds y_pred and
# accuracy).
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of the stacking classifier:", accuracy)

In [None]:
# Save the fitted stacking classifier to disk so it can be reloaded with
# joblib.load. joblib.dump returns the list of file names it wrote, which the
# notebook displays as the cell output.
joblib.dump(stacking_clf, "../models/tennis_stacking_model.pkl")