In [1]:
import pandas as pd
from datetime import datetime, date
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Load the fight records used throughout the notebook. The path is relative,
# so the notebook must be run from the directory that contains `sherdog/`.
data = pd.read_json('sherdog/data/cumulative2.json')

In [3]:
# Baseline: share of rows whose result is 'win', i.e. the accuracy of a
# classifier that always predicts 'win'.
is_win = data['result'] == 'win'
wins = int(is_win.sum())
win_rate = wins/len(data)
print('Dummy accuracy is {:.2f}%'.format(win_rate*100))

Dummy accuracy is 50.00%


In [4]:
# Show the nested `fighter` record of the first row to illustrate the schema
# that `flatten` walks through.
data['fighter'].iloc[0]

{'association': None,
 'birth': None,
 'cumulative': {'loss': {'loss': {'decision': 0.0,
    'knockout': 0.0,
    'submission': 0.0,
    'total': 0.0},
   'win': {'decision': 0.0, 'knockout': 0.0, 'submission': 0.0, 'total': 0.0}},
  'win': {'loss': {'decision': 0.0,
    'knockout': 0.0,
    'submission': 0.0,
    'total': 0.0},
   'win': {'decision': 0.0,
    'knockout': 0.0,
    'submission': 0.0,
    'total': 0.0}}},
 'history': {'fights': 0.0,
  'loss': {'decision': 0.0, 'knockout': 0.0, 'submission': 0.0, 'total': 0.0},
  'positions': 0.0,
  'time': 0.0,
  'win': {'decision': 0.0, 'knockout': 0.0, 'submission': 0.0, 'total': 0.0}},
 'id': '/fighter/Chris-Write-5647',
 'nationality': None,
 'since_last_fight': 0,
 'started': 1009584000000,
 'streak': {'loss': 0.0, 'win': 0.0}}

In [5]:
def get_time_fighting(x):
    """Unimplemented placeholder: ignores ``x`` and returns ``None``.

    NOTE(review): nothing visible in this notebook calls it.
    """
    return None
    

def parse_date(timeinmilis):
    """Convert a Unix timestamp in milliseconds to a naive UTC ``datetime``.

    Parameters
    ----------
    timeinmilis : int or float
        Milliseconds since the Unix epoch. Missing or unusable values
        (``None``, NaN, non-numeric input, out-of-range numbers) are tolerated.

    Returns
    -------
    datetime
        The converted moment, or the placeholder ``datetime(2000, 1, 1)``
        when the input cannot be converted.
    """
    try:
        return datetime.utcfromtimestamp(timeinmilis / 1000)
    except (TypeError, ValueError, OverflowError, OSError):
        # The placeholder must be a `datetime`, not a `date`: callers subtract
        # the result from event timestamps, and `datetime - date` is not a
        # supported operation. Only conversion failures are caught, so
        # unrelated errors (e.g. KeyboardInterrupt) still propagate.
        return datetime(2000, 1, 1)
    

def clean_last_fight(x):
    """Clamp non-positive values to ``0.0``; return anything else untouched.

    Values for which ``x <= 0.0`` is false (positive numbers, and also NaN)
    are returned as-is.
    """
    return 0.0 if x <= 0.0 else x

    
def get_label(event_timestamp, cutoff=date(2015, 1, 1)):
    """Assign an event to the 'train' or 'test' split by its calendar date.

    Parameters
    ----------
    event_timestamp : datetime-like
        Any object exposing a ``.date()`` method (e.g. ``datetime``).
    cutoff : datetime.date, optional
        First day that belongs to the test split. Defaults to 2015-01-01,
        the value that was previously hard-coded.

    Returns
    -------
    str
        ``'test'`` when the event date is on or after ``cutoff``,
        otherwise ``'train'``.
    """
    return 'test' if event_timestamp.date() >= cutoff else 'train'

    
def flatten(data):
    """Build a flat feature table from the nested per-fight records.

    ``data`` must provide a ``result`` column ('win' / 'loss'), a ``date``
    column and the ``fighter`` / ``opponent`` columns, each holding a nested
    dict with the keys ``started``, ``since_last_fight``, ``cumulative``,
    ``history`` and ``streak``.

    Returns a DataFrame indexed like ``data``. Its first column is ``target``
    (1 for 'win', 0 for 'loss'); then, for each side in turn, one column per
    extracted value, named ``<side>_<nested keys joined by underscores>``.
    """
    outcomes = ['loss', 'win']
    methods = ['decision', 'knockout', 'submission', 'total']

    def nested(side, *path):
        """Follow ``path`` into every dict of the ``side`` column."""
        def walk(record):
            for step in path:
                record = record[step]
            return record
        return data[side].apply(walk)

    X = pd.DataFrame()
    X['target'] = data['result'].map({'win': 1, 'loss': 0})

    for side in ['fighter', 'opponent']:
        # Days between the side's first recorded fight and this event,
        # clamped at zero by clean_last_fight.
        debut = data[side].apply(lambda record: parse_date(record['started']))
        elapsed = data['date'] - debut
        X['_'.join([side, 'started'])] = elapsed.apply(
            lambda delta: clean_last_fight(delta.days))

        # Copied through as stored in the record (no float conversion).
        X['_'.join([side, 'since_last_fight'])] = nested(side, 'since_last_fight')

        # Cumulative stats
        for key in outcomes:
            for result in outcomes:
                for method in methods:
                    column = '_'.join([side, 'cumulative', key, result, method])
                    X[column] = nested(side, 'cumulative', key, result, method).apply(float)

        # Historical stats
        for result in outcomes:
            for method in methods:
                column = '_'.join([side, 'history', result, method])
                X[column] = nested(side, 'history', result, method).apply(float)

        # Streak
        for result in outcomes:
            column = '_'.join([side, 'streak', result])
            X[column] = nested(side, 'streak', result).apply(float)

        # Remaining scalar entries of the history record
        for key in ['fights', 'positions', 'time']:
            column = '_'.join([side, 'history', key])
            X[column] = nested(side, 'history', key).apply(float)

    return X


def get_data(data):
    """Split the fights chronologically and build model-ready features.

    Each row is labelled 'train' or 'test' by ``get_label`` applied to its
    ``date``; each split is then flattened with ``flatten``, the ``target``
    column is separated out and missing feature values are filled with 0.
    The size of both splits is printed.

    Side effect: a ``set`` column holding the split label is added to the
    caller's ``data`` frame in place.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, train, test)`` where ``X_*`` are
        the feature frames, ``y_*`` the target series and ``train`` / ``test``
        the corresponding raw rows (including the ``set`` column).
    """
    data['set'] = data['date'].apply(get_label)

    def build_split(label):
        """Return (raw rows, features, target) for one split label."""
        # .copy() hands back an independent frame, so callers can add columns
        # to `train` / `test` without writing into a slice of `data`.
        subset = data[data['set'] == label].copy()
        flat = flatten(subset)
        target = flat['target']
        features = flat.drop(['target'], axis=1).fillna(0)
        return subset, features, target

    train, X_train, y_train = build_split('train')
    test, X_test, y_test = build_split('test')

    # Guard the percentages so an empty input reports 0% instead of raising
    # ZeroDivisionError.
    total = len(data)
    train_share = len(train)/total*100 if total else 0.0
    test_share = len(test)/total*100 if total else 0.0
    print('{:.2f}% ({}) of fights in the training dataset'.format(
        train_share, len(train)))
    print('{:.2f}% ({}) of fights in the testing dataset'.format(
        test_share, len(test)))

    return X_train, X_test, y_train, y_test, train, test

In [6]:
# Build the chronological split: X_* are the flattened feature frames, y_* the
# 0/1 targets, and train/test the raw rows kept for later per-fight reporting.
X_train, X_test, y_train, y_test, train, test = get_data(data)

78.39% (335978) of fights in the training dataset
21.61% (92636) of fights in the testing dataset


In [7]:
# Linear baseline: logistic regression with default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print('LogisticRegression training {:.2f}%'.format(train_accuracy*100))
print('LogisticRegression testing {:.2f}%'.format(test_accuracy*100))

# Rank features by absolute coefficient. The features are not standardised
# in this notebook, so the magnitudes are only a rough guide.
feats = dict(zip(X_train.columns, model.coef_[0]))
importances = pd.DataFrame.from_dict(feats, orient='index')
# from_dict(orient='index') yields a single column labelled 0, so only the
# first entry of this mapping has any effect.
importances = importances.rename(columns={0: 'Importance', 1: 'Feature'})
ascending_order = importances['Importance'].abs().argsort()
importances = importances.iloc[ascending_order].iloc[::-1]
importances.head(5)

LogisticRegression training 68.38%
LogisticRegression testing 67.20%


Unnamed: 0,Importance
fighter_streak_win,0.191498
opponent_streak_win,-0.191497
opponent_history_loss_decision,-0.138914
fighter_history_loss_decision,0.138914
opponent_history_loss_submission,0.10726


In [8]:
# Tree-ensemble baseline. The forest is randomised (bootstrap samples and
# feature sub-sampling), so the seed is fixed to make the reported accuracies
# and importances repeatable on a Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

print('RandomForestClassifier training {:.2f}%'.format(
    accuracy_score(y_train, model.predict(X_train))*100))
print('RandomForestClassifier testing {:.2f}%'.format(
    accuracy_score(y_test, model.predict(X_test))*100))

# Rank features by the forest's impurity-based importance, largest first.
feats = {feature: importance for feature, importance in zip(X_train.columns, model.feature_importances_)}
importances = pd.DataFrame.from_dict(feats, orient='index')
# from_dict(orient='index') yields a single column labelled 0, so only the
# first entry of this mapping has any effect.
importances = importances.rename(columns={0: 'Importance', 1: 'Feature'})
importances = importances.iloc[importances['Importance'].abs().argsort()].iloc[::-1]
importances.head(5)

RandomForestClassifier training 94.03%
RandomForestClassifier testing 75.62%


Unnamed: 0,Importance
fighter_history_time,0.055831
opponent_history_time,0.055344
fighter_cumulative_win_win_total,0.043567
opponent_cumulative_win_win_total,0.038762
opponent_history_positions,0.037947


In [9]:
# Fully connected binary classifier: three ReLU hidden layers (512, 256, 128
# units) with dropout after the first two, and a single sigmoid output unit.
input_dim = X_train.shape[1]
model = Sequential([
    Dense(512, input_dim=input_dim, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
# Train the network for 50 epochs in large mini-batches; 20% of the training
# rows are held out for validation via `validation_split`.
# `batch_size` is reused by the evaluation cell below.
batch_size = 8192
model.fit(X_train.values, y_train, epochs=50, batch_size=batch_size, validation_split=0.2)

Train on 268782 samples, validate on 67196 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0dca4a7c50>

In [18]:
# Score the trained network on the held-out test split. Index 1 of
# `scores` / `metrics_names` is the metric requested at compile time.
scores = model.evaluate(X_test.values, y_test, batch_size=batch_size)
metric_name = model.metrics_names[1]
metric_percent = scores[1]*100
print("\n%s: %.2f%%" % (metric_name, metric_percent))

acc: 79.40%


In [12]:
# Attach the network's predicted win probability and both fighter ids to the
# raw test rows, drop the bulky or nested columns, and export to CSV.
test['predictions'] = model.predict(X_test.values)
test['fighterid'] = [fighter['id'] for fighter in test['fighter']]
test['opponentid'] = [opponent['id'] for opponent in test['opponent']]
dropped_columns = ['fighter', 'opponent', 'set', 'id', 'location', 'details']
results = test.drop(dropped_columns, axis=1)
results.to_csv('results.csv', index=False)

In [13]:
# Test accuracy per promotion: predicted probabilities are rounded to 0/1 and
# compared with the recorded result.
organizations = [
    'Ultimate Fighting Championship',
    'Bellator MMA',
    'Jungle Fight',
    'Konfrontacja Sztuk Walki',
    'Absolute Championship Berkut',
]
for organization in organizations:
    subset = results[results['organization'] == organization]
    actual = subset['result'].map({'win': 1, 'loss': 0})
    predicted = subset['predictions'].apply(lambda probability: round(probability))
    score = accuracy_score(actual, predicted)
    print('{} accuracy score: {:.2f}%'.format(organization, score*100))

Ultimate Fighting Championship accuracy score: 69.55%
Bellator MMA accuracy score: 73.71%
Jungle Fight accuracy score: 75.48%
Konfrontacja Sztuk Walki accuracy score: 69.54%
Absolute Championship Berkut accuracy score: 70.78%


In [14]:
# Test accuracy per calendar year of the event (adds a `year` column to
# `results`).
results['year'] = results['date'].apply(lambda stamp: stamp.year)
years = sorted(results['year'].unique().tolist())
for year in years:
    subset = results[results['year'] == year]
    actual = subset['result'].map({'win': 1, 'loss': 0})
    predicted = subset['predictions'].apply(lambda probability: round(probability))
    score = accuracy_score(actual, predicted)
    print('{} accuracy score: {:.2f}%'.format(year, score*100))

2015 accuracy score: 76.59%
2016 accuracy score: 77.29%
2017 accuracy score: 75.81%
