# Tennis match prediction with Logistic Regression

In [1]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import config.ConnectionConfig as cc

In [2]:
# Load a previously saved scaler object (presumably the one stored for the
# neural-network model, judging by the file name — TODO confirm).
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# only load artifacts from a trusted source.
# NOTE(review): this object is re-fitted on the training split further down, so
# only its class/hyper-parameters are reused, not its stored statistics.
scaler = joblib.load('../models/tennis_nn_scaler.pkl')

In [3]:
# Set up the environment and obtain a Spark session through the project's
# ConnectionConfig helpers (their internals are not visible in this notebook).
cc.setupEnvironment()
spark = cc.startLocalCluster("Tennis Predictions")
# Last expression of the cell: shown as output to confirm a session is active.
spark.getActiveSession()

<pyspark.sql.session.SparkSession object at 0x000001D268021A60>


In [4]:
# Read the training CSV through Spark (first row is the header, column types
# are inferred) and collect the result into a pandas DataFrame in one step.
data = (
    spark.read
    .csv('../data/final_train_df_spark.csv', header=True, inferSchema=True)
    .toPandas()
)

In [5]:
# Remove the round indicators and a set of serve/return statistics, each in its
# "1" and "2" variant. The 'Surface' column is NOT dropped and stays in the frame.
columns_to_drop = [
    'round value1', 'round value2',
    'Break Points Faced1', 'Break Points Faced2',
    'Second Serve Points Won1', 'Second Serve Points Won2',
    'Dominance Ratio1', 'Dominance Ratio2',
    'Ace Ratio1', 'Ace Ratio2',
]
data = data.drop(columns=columns_to_drop)
data

Unnamed: 0,player_id1,age1,Ranking at that time1,Opponent Ranking at that time1,Double Fault Ratio1,First Serve Percentage1,First Serve Points Won1,Break Points Won1,Sets Won1,Sets Lost1,...,Opponent Ranking at that time2,Double Fault Ratio2,First Serve Percentage2,First Serve Points Won2,Break Points Won2,Sets Won2,Sets Lost2,Total time2,Surface,Result
0,50.0,11153,183.579075,173.094891,2.938200,61.517762,71.794161,3.802920,12.323601,12.218978,...,294.226190,3.715476,64.491667,71.958333,3.357143,12.583333,11.595238,115.136905,1,W
1,48.0,9242,138.016432,213.997653,4.911972,65.302817,65.993897,4.492958,11.715962,11.265258,...,283.840637,3.609960,58.554183,67.578088,4.147410,11.334661,11.007968,105.844622,1,L
2,268.0,11499,227.502273,250.547727,4.174318,63.618636,72.530000,3.654545,12.015909,11.718182,...,262.626016,2.959892,59.181572,71.897290,3.915989,11.696477,11.845528,99.970190,1,W
3,162.0,6951,319.224719,332.752809,6.312360,55.720225,66.247753,4.707865,11.162921,11.359551,...,272.577947,3.705894,64.976426,66.622053,4.309886,11.593156,11.283270,105.395437,2,W
4,388.0,11345,193.812598,237.478740,4.045669,58.822677,75.976850,3.278740,12.127559,12.022047,...,297.838150,4.021965,63.728324,66.829480,4.317919,10.965318,10.820809,104.236994,1,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62698,64.0,9086,119.188596,172.065789,3.547588,65.639474,69.003289,4.144737,12.109649,11.230263,...,223.383178,4.190467,59.079813,69.217009,4.091589,11.585047,11.409346,105.317757,1,W
62699,274.0,9115,223.504259,282.942078,5.813288,61.034923,65.421976,4.563884,11.194208,11.848382,...,158.442759,2.837517,61.341241,69.410483,4.226207,11.944828,12.337931,106.252414,2,L
62700,93.0,11446,51.121981,98.693237,5.101087,59.310507,67.326570,4.570048,12.321256,12.439614,...,106.368932,3.043107,59.361553,76.817087,3.601942,13.500971,12.085437,105.450485,1,W
62701,93.0,10130,51.121981,98.693237,5.101087,59.310507,67.326570,4.570048,12.321256,12.439614,...,166.466302,5.061020,64.122769,73.829508,3.759563,12.936248,12.187614,109.755920,2,W


In [6]:
# The last column is the label; every column before it is used as a feature.
# NOTE(review): this includes whatever identifier/index-like columns are still
# in the frame (e.g. 'Unnamed: 0') — confirm that is intended.
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.shape, y.shape

((62703, 23), (62703,))

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Shapes of the four parts, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((50162, 23), (12541, 23), (50162,), (12541,))

In [8]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to the held-out split so no test statistics leak into training.
# NOTE(review): `scaler` was loaded from disk and is re-fitted here, which
# discards whatever statistics it was saved with — confirm that is intended.
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

# Rebuild DataFrames with the original column names AND the original row index.
# Without `index=`, the frames get a fresh 0..n-1 index while y_train / y_test
# keep the shuffled labels from the split, so features and targets would no
# longer line up by label in any later pandas operation.
X_train = pd.DataFrame(train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(test_scaled, columns=X_test.columns, index=X_test.index)

In [9]:
# Exhaustive hyper-parameter search for the logistic-regression classifier.
# GridSearchCV is imported in the notebook's import cell.
# NOTE(review): X_train was scaled before this search, so every CV fold sees
# scaling statistics computed from the whole training split; move the scaler
# into the pipeline if fold-level isolation matters.

pipe = Pipeline([
    # Both solvers in the grid (liblinear, saga) shuffle the data internally,
    # so a fixed seed is required for the search to be repeatable.
    ('log_reg', LogisticRegression(random_state=42))
])

# Keys use the '<step>__<param>' form so they are routed to the 'log_reg' step.
param_grid = {
    'log_reg__C': [0.1, 0.01, 1, 10, 100, 1000],
    'log_reg__penalty': ['l1', 'l2'],
    'log_reg__solver': ['liblinear', 'saga'],
    'log_reg__max_iter': [100, 1000],
    'log_reg__class_weight': [None, 'balanced']
}

# 5-fold cross-validation over every combination, using all available cores.
# No `scoring` is given, so the estimator's own score method is used.
search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print(f'Best score: {search.best_score_}')
print(f'Best params: {search.best_params_}')

Best score: 0.6268490436901129
Best params: {'log_reg__C': 0.01, 'log_reg__class_weight': None, 'log_reg__max_iter': 1000, 'log_reg__penalty': 'l1', 'log_reg__solver': 'liblinear'}


In [10]:
# Use the best pipeline found by the grid search (available because
# GridSearchCV was created with its default refit behaviour) and predict
# labels for the held-out rows.
model = search.best_estimator_
y_pred = model.predict(X_test)

In [11]:
# Share of held-out rows whose predicted label equals the true label.
# accuracy_score is imported in the notebook's import cell.
accuracy_score(y_test, y_pred)

0.6223586635834463

In [12]:
# Uncomment to persist the fitted model to disk:
# joblib.dump(model, '../models/tennis_lr_model.pkl')