In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import 5 different classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load data
df = pd.read_csv('player_stats.csv')
features = ['Win', 'K', 'HB', 'D', 'M', 'G', 'B', 'T', 'HO', 'FF', 'FA', 'GA', 'I50', 'AF', 'SC', 'CL', 'CG', 'R50']
simple_features = ["Win", "K", "HB", "D", "M", "G", "B", "T", "HO", "FF", "FA"]
df.columns

Index(['Year', 'Round', 'Team Name', 'Win', 'Player', 'K', 'HB', 'D', 'M', 'G',
       'B', 'T', 'HO', 'FF', 'FA', 'Match ID', 'Votes', 'GA', 'I50', 'AF',
       'SC', 'CL', 'CG', 'R50'],
      dtype='object')

In [3]:
# Split data into training and testing
train = df[df['Year'] <= 2020]
test = df[df['Year'] > 2020]

In [4]:
# create an iterator that includes all the 5 classifiers
classifiers = [
    LogisticRegression(),
    SVC(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    GradientBoostingClassifier()
]

In [5]:
# Train logistic regression model
logreg = LogisticRegression(max_iter=2000)
logreg.fit(train[simple_features], train['Votes'])
print("Training done, now predicting...")
Y_pred = logreg.predict(test[simple_features])
acc_log = round(logreg.score(train[simple_features], train['Votes']) * 100, 2)
print(acc_log)

Training done, now predicting...
93.39


In [44]:
# For each match ID, sort by log odds and take the top 3 and assign them 3, 2, 1 votes respectively
output = []
for match_id in test['Match ID'].unique():
    df = test[test['Match ID'] == match_id]
    df['log_odds'] = logreg.predict_log_proba(df[simple_features])[:, 1]
    df = df.sort_values(by='log_odds', ascending=False)
    df['LogReg Votes'] = [3, 2, 1] + [0] * (len(df) - 3)
    output.append(df)

# Concatenate all the dataframes into one
output = pd.concat(output)

# Calculate difference between votes and logreg votes
output['Difference'] = output['Votes'] - output['LogReg Votes']

# Only keep rows where the player has votes OR LogReg 
output = output[(output['Votes'] > 0 )| (output['LogReg Votes'] > 0)]

# Save output to csv
output.to_csv('logreg.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log_odds'] = logreg.predict_log_proba(df[simple_features])[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log_odds'] = logreg.predict_log_proba(df[simple_features])[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log_odds'] = logreg.predict_log_proba(df[simple_features])[:, 1

In [None]:
# Train SVM model
svc = SVC()
svc.fit(train[simple_features], train['Votes'])
Y_pred = svc.predict(test[simple_features])
acc_svc = round(svc.score(train[simple_features], train['Votes']) * 100, 2)
print(acc_svc)