# SVM

In [23]:
import pandas as pd
#pre-process
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#svc
from sklearn.svm import SVC
#evaluate
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# load the data
team_df= pd.read_parquet('parquet_data/team_df_missing_handled.parquet')
player_df= pd.read_parquet('parquet_data/player_df_missing_handled.parquet')

# make a success colon to be predicted
team_df['Success'] = team_df['Result'].apply(
    lambda x: 1 if isinstance(x, str) and x.startswith('W') else (0 if isinstance(x, str) and x.startswith('L') else None))
player_df['Success'] = player_df['Result'].apply(
    lambda x: 1 if isinstance(x, str) and x.startswith('W') else (0 if isinstance(x, str) and x.startswith('L') else None))

## SVM Based on The Team Data

In [25]:
# keep the numeric colons
X_team = team_df[["S", "Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Assists", "PTS"]]
y_team = team_df["Success"]  

scaler = StandardScaler()
X_scaled_team = scaler.fit_transform(X_team)

# apart the data as train and test set
X_train_team, X_test_team, y_train_team, y_test_team = train_test_split(X_scaled_team, y_team, test_size=0.2, random_state=42)

In [26]:
def svm_team(kernel, C, gamma):
    svm_model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)
    svm_model.fit(X_train_team, y_train_team)

    y_pred_svm_team = svm_model.predict(X_test_team)

    print("SVM Accuracy:", accuracy_score(y_test_team, y_pred_svm_team))
    print("SVM Classification Report:")
    print(classification_report(y_test_team, y_pred_svm_team))

#### Low - Defult - High C Values

In [27]:
svm_team(kernel='rbf', C=0.01, gamma='scale')

SVM Accuracy: 0.8567387922500354
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85      3507
           1       0.84      0.89      0.86      3564

    accuracy                           0.86      7071
   macro avg       0.86      0.86      0.86      7071
weighted avg       0.86      0.86      0.86      7071



In [28]:
svm_team(kernel='rbf', C=1.0, gamma='scale')

SVM Accuracy: 0.8725781360486494
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      3507
           1       0.87      0.88      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



In [29]:
svm_team(kernel='rbf', C=100.0, gamma='scale')

SVM Accuracy: 0.8657898458492433
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      3507
           1       0.87      0.87      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



* At a low C value (0.01), the model remains simpler, leading to a bit of underfitting and lower accuracy compared to C=1.0.
* The default C (1.0) gives the best results for this dataset, providing balanced and high accuracy.
* Contrary to expectations, a high C (100.0) has not led to a noticeable improvement in performance. In fact, accuracy has slightly decreased, likely because the model’s more complex boundaries have reduced its generalization ability.

#### Low - Defult - High Gamma Values:

In [30]:
svm_team(kernel='rbf', C=1.0, gamma=0.0001)

SVM Accuracy: 0.8547588742752086
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3507
           1       0.84      0.88      0.86      3564

    accuracy                           0.85      7071
   macro avg       0.86      0.85      0.85      7071
weighted avg       0.86      0.85      0.85      7071



In [31]:
svm_team(kernel='rbf', C=1.0, gamma=0.01)

SVM Accuracy: 0.8717295997737237
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3507
           1       0.87      0.88      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



In [32]:
svm_team(kernel='rbf', C=1.0, gamma=2)

SVM Accuracy: 0.841747984726347
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.84      3507
           1       0.83      0.87      0.85      3564

    accuracy                           0.84      7071
   macro avg       0.84      0.84      0.84      7071
weighted avg       0.84      0.84      0.84      7071



* A very low gamma (0.0001) cannot sufficiently sharpen the model’s discriminative power, resulting in lower accuracy.
* A medium-low gamma (0.01) seems to act as a balance point, producing better results.
* However, a high gamma (2) makes the model overly complex, causing it to lose its ability to generalize.

#### RBF - Linear - Poly - Sigmoid Kernels:

In [33]:
svm_team(kernel='rbf', C=1.0, gamma='scale')

SVM Accuracy: 0.8725781360486494
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      3507
           1       0.87      0.88      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



In [34]:
svm_team(kernel='linear', C=1.0, gamma='scale')

SVM Accuracy: 0.8694668363739216
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3507
           1       0.87      0.87      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



In [35]:
svm_team(kernel='poly', C=1.0, gamma='scale')

SVM Accuracy: 0.869325413661434
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3507
           1       0.87      0.87      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



In [36]:
svm_team(kernel='sigmoid', C=1.0, gamma='scale')

SVM Accuracy: 0.7865931268561731
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78      3507
           1       0.79      0.79      0.79      3564

    accuracy                           0.79      7071
   macro avg       0.79      0.79      0.79      7071
weighted avg       0.79      0.79      0.79      7071



* RBF Kernel, Accuracy: Approximately 87.3%. With the parameters we’ve examined so far, this shows that the RBF kernel provides the best performance on the dataset. The RBF kernel is generally quite versatile and can yield good results in non-linear classification scenarios.

* Linear Kernel, Accuracy: Approximately 86.9%. If your data isn’t easily linearly separable, it may not perform as well as the RBF kernel. Still, it’s not performing too poorly.

* Polynomial Kernel, Accuracy: Approximately %86.9 If the data structure allows for polynomial-based decision boundaries, a polynomial kernel can yield good results. Here, it performs about as well as the linear kernel, but falls slightly behind the RBF kernel. This might be because the polynomial degree is set to the default (degree=3) or because a polynomial approach may not be as suitable for the data as the RBF.

* Sigmoid Kernel: Generally, it is not considered one of the most popular choices for SVM. It seems the model cannot properly interpret the data structure, possibly resulting in an overly simple or overly complex decision boundary. Consequently, overall accuracy and metrics tend to decline.

## SVM Based on The Player Data
we will use the parameters that give the best results above

In [37]:
# keep the numeric colons
X_player = player_df[['S', 'Kills', 'Errors', 'Total Attacks', 'Hit Pct', 'Assists', 'SErr', 'Digs', 'Block Assists', 'PTS']]
y_player = player_df["Success"]  

scaler = StandardScaler()
X_scaled_p = scaler.fit_transform(X_player)

# apart the data as train and test set
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_scaled_p, y_player, test_size=0.2, random_state=42)

In [39]:
svm_model = SVC(kernel='rbf', C=1, gamma =0.01, random_state=42)
svm_model.fit(X_train_p, y_train_p)

y_pred_svm_p = svm_model.predict(X_test_p)

print("SVM Accuracy:", accuracy_score(y_test_p, y_pred_svm_p))
print("SVM Classification Report:")
print(classification_report(y_test_p, y_pred_svm_p))

SVM Accuracy: 0.6448628469404317
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.42      0.51      2682
           1       0.65      0.81      0.72      3479

    accuracy                           0.64      6161
   macro avg       0.64      0.62      0.62      6161
weighted avg       0.64      0.64      0.63      6161



## Conclusion
Result regarding team-based prediction: The dataset is balanced because the values in the support column are very close to each other (the support value represents the number of actual data points belonging to that class that the model uses for evaluation. This value helps you understand the size of the data on which the precision, recall, and f1-score are calculated). Additionally, the evaluation metrics are high, meaning this model is able to learn well from the given data.

Result regarding player-based prediction: The dataset is unbalanced; you can compare the values in the support column. Moreover, the evaluation metrics obtained using the player dataset are significantly lower compared to those obtained using the team dataset.

# Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

## LR Based on The Team Data

In [42]:
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train_team, y_train_team)

y_pred_log_team = log_model.predict(X_test_team)

print("Logistic Regression Accuracy:", accuracy_score(y_test_team, y_pred_log_team))
print("Logistic Regression Classification Report:")
print(classification_report(y_test_team, y_pred_log_team))

Logistic Regression Accuracy: 0.8683354546740206
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3507
           1       0.87      0.87      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



## LR Based on The Player Data

In [45]:
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train_p, y_train_p)

y_pred_log_p = log_model.predict(X_test_p)

print("Logistic Regression Accuracy:", accuracy_score(y_test_p, y_pred_log_p))
print("Logistic Regression Classification Report:")
print(classification_report(y_test_p, y_pred_log_p))

Logistic Regression Accuracy: 0.6333387437104366
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.45      0.52      2682
           1       0.65      0.77      0.70      3479

    accuracy                           0.63      6161
   macro avg       0.63      0.61      0.61      6161
weighted avg       0.63      0.63      0.62      6161

