# Predicting Tennis Matches with Random Forest Classifier

In [24]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import config.ConnectionConfig as cc

In [25]:
# Load the previously pickled feature scaler (judging by the file name it was
# saved together with the neural-network model — verify).
# NOTE(review): a later cell calls scaler.fit_transform(...), which re-fits
# this object and discards the statistics stored in the pickle — confirm that
# re-fitting, rather than reusing the saved fit, is intended.
scaler_path = '../models/tennis_nn_scaler.pkl'
scaler = joblib.load(scaler_path)

In [26]:
# Prepare the environment and start a local Spark cluster through the
# project's ConnectionConfig helpers (defined outside this notebook;
# presumably setupEnvironment configures what Spark needs — TODO confirm).
cc.setupEnvironment()
# "Tennis Predictions" is the only argument — presumably the Spark application name.
spark = cc.startLocalCluster("Tennis Predictions")
# Bare expression: the notebook shows the active session as the cell output
# (assumes `spark` is a SparkSession — verify against startLocalCluster).
spark.getActiveSession()

In [27]:
# Read the training CSV through Spark (first row is the header, column types
# are inferred), then collect it into a pandas DataFrame for scikit-learn.
spark_df = spark.read.csv(
    '../data/final_train_df_spark.csv',
    header=True,
    inferSchema=True,
)
data = spark_df.toPandas()
# Bare expression: render the loaded frame as the cell output.
data

In [28]:
#data = data.drop(['round value1', 'round value2', 'Break Points Faced1', 'Break Points Faced2', 'Second Serve Points Won1', 'Second Serve Points Won2', 'Dominance Ratio1', 'Dominance Ratio2', 'Ace Ratio1', 'Ace Ratio2'], axis=1)

In [29]:
# Positional split: every column except the last is a feature; the last
# column is the target the classifier is later fitted against.
label_pos = data.shape[1] - 1
X, y = data.iloc[:, :label_pos], data.iloc[:, label_pos]
# Cell output: shapes of the feature matrix and the target vector.
X.shape, y.shape

In [30]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, random_state=42, test_size=0.1)
X_train, X_test, y_train, y_test = split_parts
# Cell output: shapes in the order (X_train, X_test, y_train, y_test).
tuple(part.shape for part in split_parts)

In [31]:
# Scale the features: fit the scaler on the training split only and reuse
# those fitted statistics for the test split, so nothing from the test rows
# influences the transformation.
# NOTE(review): `scaler` was loaded from disk; fit_transform re-fits it here
# and overwrites the persisted fit — confirm that this is intended.
#
# The scaler returns plain arrays by default, so the DataFrames are rebuilt
# with the original column names AND the original row index. Without the
# index the features would get a fresh 0..n-1 index while y_train / y_test
# keep the shuffled one from the split, and any index-aligned pandas
# operation (concat, boolean masks, joins) would silently misalign them.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [32]:
# Search space for hyperparameter tuning. It is only consumed by the
# RandomizedSearchCV snippet at the bottom of this cell, which is disabled.
param_dist = {
    'n_estimators': [80, 100, 120, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [20, 30, 40],
    'min_samples_split': [3, 5, 7],
    'min_samples_leaf': [1, 2, 3],
    'bootstrap': [True],
}

# Train the forest with fixed hyperparameters. Every value lies inside
# param_dist — presumably the outcome of an earlier search run; verify.
# random_state pins the bootstrap sampling and per-split feature sampling so
# the fitted model and the reported accuracy are reproducible, matching the
# seed already used for the train/test split.
rf = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=30,
    bootstrap=True,
    random_state=42,
)
rf.fit(X_train, y_train)
# To re-run the search, uncomment the lines below. The search object must be
# fitted before best_params_ / best_score_ / best_estimator_ exist.
#random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
#random_search.fit(X_train, y_train)
#print(f'Best params: {random_search.best_params_}')
#print(f'Best score: {random_search.best_score_}')

In [33]:
# When the randomized search in the previous cell is enabled, switch to its
# best estimator by uncommenting the next line.
#rf = random_search.best_estimator_
# Bare expression: the notebook renders the estimator as the cell output.
rf

In [34]:
from sklearn.metrics import accuracy_score

# Evaluate on the held-out split: predict a label for every test row, then
# report the fraction of predictions that match the true labels.
y_pred = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [35]:
#joblib.dump(rf, '../models/tennis_rf_model.pkl')

### Save this model as a zip file because it is too large for GitHub