# Import data
Import sequenced data from a `json` file.

In [1]:
import pandas as pd
from datetime import date
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from pprint import pprint
import random
pd.options.mode.chained_assignment = None
%matplotlib inline

Using TensorFlow backend.


In [2]:
def get_data():
    data_file = 'fightmetric/data/imputed.json'
    frame = pd.read_json(data_file)
    return frame
    

def get_label(event_date):
    event_date = event_date.date()
    if event_date >= date(2015, 1, 1):
        return 'test'
    return 'train'


def train_test_split(data):
    data = data[(data['result'] == 'Win') | (data['result'] == 'Loss')]
    data['set'] = data['date'].apply(get_label)
    train = data.loc[data['set'] == 'train']
    test = data.loc[data['set'] == 'test']
    valid = data.loc[data['set'] == 'valid']
    assert len(data) == len(train) + len(test) + len(valid)
    
    return train, test, valid

def split(data):
    data = data[(data['result'] == 'Win') | (data['result'] == 'Loss')]
    y = data['result'].map({'Win': 1, 'Loss': 0})
    x = data.drop(['result'], axis=1)
    return x, y

# Data exploration

In [3]:
data = get_data()
train, test, valid = train_test_split(data)

In [4]:
data.iloc[0]['fighter']['name']

'Nick Diaz'

In [5]:
sample = data
sample['isgsp'] = sample['fighter'].apply(lambda x: 'St-Pierre' in x['name'])
sample = sample[sample['isgsp'] == True]
sample.drop(['isgsp'], axis=1, inplace=True)
sample.iloc[19].to_dict()

{'date': Timestamp('2009-01-31 00:00:00'),
 'event': 'UFC 94: St-Pierre vs Penn 2',
 'fighter': {'age': 27,
  'bonus': {'fight': 1, 'ko': 1, 'performance': 0, 'submission': 1},
  'current': {'attendance': 14885.0, 'position': 1, 'rounds': [5, 5, 5, 5, 5]},
  'height': 5.11,
  'history': {'attendance': 279335.4419063112,
   'draws': 0,
   'fights': 19,
   'losses': 2,
   'position': 47,
   'time': 83.75,
   'titlefights': 6,
   'wins': 17},
  'name': 'Georges St-Pierre',
  'reach': 76.0,
  'stance': 'Orthodox',
  'stats': {'body': {'avoided': 19.0,
    'landed': 110.0,
    'received': 54.0,
    'thrown': 126.0},
   'clinch': {'avoided': 30.0,
    'landed': 47.0,
    'received': 39.0,
    'thrown': 67.0},
   'distance': {'avoided': 310.0,
    'landed': 283.0,
    'received': 116.0,
    'thrown': 594.0},
   'ground': {'avoided': 19.0,
    'landed': 322.0,
    'received': 27.0,
    'thrown': 465.0},
   'head': {'avoided': 331.0,
    'landed': 465.0,
    'received': 104.0,
    'thrown': 899

In [6]:
keys = ['body', 'clinch', 'distance', 'ground', 'head', 
        'knockouts', 'leg', 'sig. str', 'submissions', 'td', 'total str.']
for key in keys:
    landed = train['fighter'].apply(lambda x: x['stats'][key]['landed'])
    thrown = train['fighter'].apply(lambda x: x['stats'][key]['thrown'])
    print("{}: {},".format(key, (landed.sum() / thrown.sum())))

body: 0.7473003379873279,
clinch: 0.6571087641322761,
distance: 0.3455062095114903,
ground: 0.6262766775935562,
head: 0.3628908924004376,
knockouts: 0.9343414859964508,
leg: 0.7791716658682076,
sig. str: 0.44421437047191575,
submissions: 0.2572297418162304,
td: 0.4768224733013465,
total str.: 0.5933736946615897,


In [7]:
for key in keys:
    avoided = train['fighter'].apply(lambda x: x['stats'][key]['avoided'])
    received = train['fighter'].apply(lambda x: x['stats'][key]['received'])
    total = avoided + received
    print("{}: {},".format(key, (avoided.sum() / total.sum())))

body: 0.2876221212962016,
clinch: 0.3741905657569338,
distance: 0.6827144917612437,
ground: 0.3980045751295038,
head: 0.6875411808215335,
knockouts: 0.3100940975192472,
leg: 0.24724693306549966,
sig. str: 0.6002153958424062,
submissions: 0.8462684763784285,
td: 0.6195488480886924,
total str.: 0.4412454210775378,


# Data preparation

In [8]:
data = get_data()
train, test, valid = train_test_split(data)
datasets = [train, test]
# pprint(data.iloc[random.randint(0, len(data))]['fighter'])

In [9]:
def offense_accuracy(fighter, key):
    landed = fighter['stats'][key]['landed']
    total = fighter['stats'][key]['thrown']
    
    return landed / (total + .1)

    
def defense_accuracy(fighter, key): 
    avoided = fighter['stats'][key]['avoided']
    received = fighter['stats'][key]['received']
    total = avoided + received
    
    return avoided / (total + .1)

    
def accuracy(dataset, key):
    
    def calc(fighter, key):
        offense = offense_accuracy(fighter, key)
        defense = defense_accuracy(fighter, key)
        return offense + defense
        
    
    fighter = dataset['fighter'].apply(lambda x: calc(x, key))
    opponent = dataset['opponent'].apply(lambda x: calc(x, key))
    return fighter - opponent


def probability(positive, both):
    try:
        return positive / both
    except Exception:
        return 0.5

    
def safe_divide(first, second):
    try:
        return first / second
    except Exception:
        return 1.0

In [10]:
for dataset in datasets:
    """
    Historical features
    """
    # Probabilities
    fighter = dataset['fighter'].apply(lambda x: probability(x['history']['wins'], x['history']['fights']))
    opponent = dataset['opponent'].apply(lambda x: probability(x['history']['wins'], x['history']['fights']))
    dataset['Win probability'] = fighter - opponent
    # Streak wins minus losses
    fighter = dataset['fighter'].apply(lambda x: x['streak']['wins'] - x['streak']['losses'])
    opponent = dataset['opponent'].apply(lambda x: x['streak']['wins'] - x['streak']['losses'])
    dataset['Streak'] = fighter - opponent
    # Time spent in the octagon
    fighter = dataset['fighter'].apply(lambda x: x['history']['time'])
    opponent = dataset['opponent'].apply(lambda x: x['history']['time'])
    dataset['Time'] = fighter - opponent
    # Avg position in the fightcard
    fighter = dataset['fighter'].apply(lambda x: x['history']['position'] / 
                                                (x['history']['fights'] + .1))
    opponent = dataset['opponent'].apply(lambda x: x['history']['position'] / 
                                                  (x['history']['fights'] + .1))
    dataset['Avg position'] = fighter - opponent
    # Is first fight
    dataset['Is first'] = dataset['fighter'].apply(lambda x: x['history']['fights'] > 0).astype(int)
    
    
    """
    Physiacal features
    """
    # Difference in age
    fighter = dataset['fighter'].apply(lambda x: x['age'])
    opponent = dataset['opponent'].apply(lambda x: x['age'])
    dataset['Age'] = fighter - opponent
    # Difference in reach
    fighter = dataset['fighter'].apply(lambda x: x['reach'])
    opponent = dataset['opponent'].apply(lambda x: x['reach'])
    dataset['Reach'] = fighter - opponent
    # Knockouts
    fighter = dataset['fighter'].apply(lambda x: x['stats']['knockouts']['landed'])
    opponent = dataset['opponent'].apply(lambda x: x['stats']['knockouts']['landed'])
    dataset['Knockouts'] = fighter - opponent
    # Submissions
    fighter = dataset['fighter'].apply(lambda x: x['stats']['submissions']['landed'])
    opponent = dataset['opponent'].apply(lambda x: x['stats']['submissions']['landed'])
    dataset['Submissions'] = fighter - opponent
    
    
    
    """
    Performance features
    """
    dataset['Striking'] = accuracy(dataset, 'sig. str')
    dataset['Takedowns'] = accuracy(dataset, 'td')

# Feature selection and scaling

In [11]:
from sklearn.preprocessing import Imputer, StandardScaler    

for dataset in datasets:
    for key in ['date', 'event', 'fighter', 'link', 'opponent', 
                'location', 'method', 'referee', 'set']:
        if key in dataset:
            dataset.drop([key], axis=1, inplace=True)
            

train = train.replace([np.inf, -np.inf], np.nan)
xtrain, ytrain = split(train)
imputer = Imputer(strategy='median')
imputer.fit(xtrain)
xtrain = pd.DataFrame(imputer.transform(xtrain), columns=xtrain.columns)
assert len(xtrain) == len(ytrain) == len(train)

test = test.replace([np.inf, -np.inf], np.nan) 
xtest, ytest = split(test)
xtest = pd.DataFrame(imputer.transform(xtest), columns=xtest.columns)
assert len(test) == len(ytest) == len(xtest)


scaler = StandardScaler()
scaler.fit(xtrain)

xtrain = pd.DataFrame(scaler.transform(xtrain), columns=xtrain.columns)
xtest = pd.DataFrame(scaler.transform(xtest), columns=xtest.columns)

In [12]:
for dataset in datasets:
    print('Dataset of {} features with {} rows.'.format(len(list(dataset.columns)),
                                                        len(dataset)))
    
xtrain.tail()

Dataset of 12 features with 9282 rows.
Dataset of 12 features with 2554 rows.


Unnamed: 0,Win probability,Streak,Time,Avg position,Is first,Age,Reach,Knockouts,Submissions,Striking,Takedowns
9277,-0.62738,-0.405441,-0.707055,-1.211207,-1.939072,-0.19927,-0.769245,-1.544806,0.0,-1.836196,-1.349384
9278,-1.463886,0.0,0.108441,-0.526321,0.515711,0.0,-1.153867,-0.386201,0.0,-0.605722,0.563001
9279,-2.927772,-1.621763,0.031608,0.282416,0.515711,0.597809,-2.692357,0.0,-0.435536,-0.669989,-0.607997
9280,-0.292777,-0.810881,-0.727479,-2.442069,-1.939072,-0.398539,0.0,0.0,-0.871072,-1.604566,-0.956377
9281,-1.25476,0.0,-0.897678,0.774565,0.515711,-0.597809,0.0,-0.386201,0.0,-0.387814,-1.303593


# Model selection

In [13]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(xtrain, ytrain)

print('LogisticRegression training {:.2f}%'.format(accuracy_score(ytrain, model.predict(xtrain))*100))
print('LogisticRegression testing {:.2f}%'.format(accuracy_score(ytest, model.predict(xtest))*100))

LogisticRegression training 65.03%
LogisticRegression testing 60.45%


In [14]:
model = SVC()
model.fit(xtrain, ytrain)

print('SVC training {:.2f}%'.format(accuracy_score(ytrain, model.predict(xtrain))*100))
print('SVC testing {:.2f}%'.format(accuracy_score(ytest, model.predict(xtest))*100))

SVC training 67.23%
SVC testing 60.06%


In [15]:
model = RandomForestClassifier()
model.fit(xtrain, ytrain)

print('RandomForestClassifier training {:.2f}%'.format(accuracy_score(ytrain, model.predict(xtrain))*100))
print('RandomForestClassifier testing {:.2f}%'.format(accuracy_score(ytest, model.predict(xtest))*100))

RandomForestClassifier training 95.77%
RandomForestClassifier testing 56.62%


In [16]:
input_dim = xtrain.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
batch_size = 128
model.fit(xtrain.values, ytrain, epochs=50, batch_size=batch_size, validation_split=0.2)

Train on 7425 samples, validate on 1857 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250

<keras.callbacks.History at 0x7f98dc2bd780>

In [18]:
scores = model.evaluate(xtest.values, ytest, batch_size=batch_size)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

 128/2554 [>.............................] - ETA: 0s
acc: 57.91%


In [19]:
data.iloc[0]['fighter']

{'age': 19,
 'bonus': {'fight': 0, 'ko': 0, 'performance': 0, 'submission': 0},
 'current': {'attendance': 20840.5157894737,
  'position': 2,
  'rounds': [5, 5, 5, 5, 5]},
 'height': 6.1,
 'history': {'attendance': 0.0,
  'draws': 0,
  'fights': 0,
  'losses': 0,
  'position': 0,
  'time': 0.0,
  'titlefights': 0,
  'wins': 0},
 'name': 'Nick Diaz',
 'reach': 76.0,
 'stance': 'Southpaw',
 'stats': {'body': {'avoided': 0.0,
   'landed': 0.0,
   'received': 0.0,
   'thrown': 0.0},
  'clinch': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'distance': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'ground': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'head': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'knockouts': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'leg': {'avoided': 0.0, 'landed': 0.0, 'received': 0.0, 'thrown': 0.0},
  'sig. str': {'avoided': 0.0, 'landed': 0.0, 'receive

In [26]:
def flatten(raw):
    data = raw.copy(deep=True)
    for fighter in ['fighter', 'opponent']:
        for a in data.iloc[0][fighter].keys():
            if a in ['history', 'current', 'bonus', 'streak']:
                for b in data.iloc[0][fighter][a]:
                    if b not in ['rounds']:
                        key = '{} {} {}'.format(fighter, a, b)
                        data[key] = data[fighter].apply(lambda x: x[a][b])
            elif a in ['stats']:
                for b in data.iloc[0][fighter][a]:
                    for c in data.iloc[0][fighter][a][b].keys():
                        key = '{} {} {} {}'.format(fighter, a, b, c)
                        data[key] = data[fighter].apply(lambda x: x[a][b][c])
            elif a in ['name', 'stance']:
                continue
            else:
                key = '{} {}'.format(fighter, a)
                data[key] = data[fighter].apply(lambda x: x[a])

    data.drop(['fighter', 'opponent',
               'referee', 'event', 'link', 'location',
               'method'], axis=1, inplace=True)
    train, test, _ = train_test_split(data)

    y_train = train['result']
    y_train = y_train.map({'Win': 1, 'Loss': 0})
    X_train = train.drop(['result', 'set', 'date'], axis=1)
    X_train.fillna(0, inplace=True)
    
    y_test = test['result']
    y_test = y_test.map({'Win': 1, 'Loss': 0})
    X_test = test.drop(['result', 'set', 'date'], axis=1)
    X_test.fillna(0, inplace=True)
    
    # Scale data
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [27]:
X_train, X_test, y_train, y_test = flatten(data)

In [28]:
wins = len(y_train[y_train == 1])
print('Dummy accuracy is {:.2f}%'.format(wins/len(y_train)*100))

Dummy accuracy is 50.00%


In [130]:
input_dim = X_train.shape[1]
model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [131]:
batch_size = 512
model.fit(X_train.values, y_train, epochs=100, batch_size=batch_size, validation_split=0.2)

Train on 7425 samples, validate on 1857 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<keras.callbacks.History at 0x7f98de6e1400>

In [132]:
scores = model.evaluate(X_test.values, y_test, batch_size=batch_size)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

 512/2554 [=====>........................] - ETA: 0s
acc: 57.40%


In [4]:
import pandas as pd
frame = pd.read_csv('fightmetric/data/data.csv')
frame.iloc[0].to_dict()

{'attendance': 7559.0,
 'birth': '1983-08-02',
 'bonus': "{'ko': False, 'submission': False, 'performance': False, 'fight': False}",
 'date': '2010-10-09',
 'event': 'Strikeforce - Diaz vs. Noons 2',
 'height': 6.1,
 'link': 'http://www.fightmetric.com/fight-details/1c41498b4aca8c39',
 'location': 'San Jose, California, USA',
 'method': 'Decision - Unanimous',
 'name': 'Nick Diaz',
 'position': 1,
 'reach': 76.0,
 'referee': 'Josh Rosenthal',
 'result': 'Win',
 'round': 5.0,
 'stance': 'Southpaw',
 'stats': "{'knockouts': {'thrown': 0.0, 'landed': 0.0, 'received': 0.0, 'avoided': 0.0}, 'total str.': {'thrown': 441.0, 'landed': 150.0, 'received': 139.0, 'avoided': 287.0}, 'td': {'thrown': 6.0, 'landed': 1.0, 'received': 0.0, 'avoided': 0.0}, 'submissions': {'thrown': 0.0, 'landed': 0.0, 'received': 0.0, 'avoided': 0.0}, 'sig. str': {'thrown': 421.0, 'landed': 130.0, 'received': 110.0, 'avoided': 279.0}, 'head': {'thrown': 369.0, 'landed': 95.0, 'received': 68.0, 'avoided': 221.0}, 'body

In [39]:
sample = frame
sample['isgsp'] = sample['name'].apply(lambda x: 'St-Pierre' in x)
sample = sample[sample['isgsp'] == True]
sample.drop(['isgsp'], axis=1, inplace=True)
sample.iloc[23].to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


{'attendance': 14885.0,
 'birth': '1981-05-19',
 'bonus': "{'ko': False, 'submission': False, 'performance': False, 'fight': False}",
 'date': '2009-01-31',
 'event': 'UFC 94: St-Pierre vs Penn 2',
 'height': 5.11,
 'link': 'http://www.fightmetric.com/fight-details/a158e4ff7ea93d43',
 'location': 'Las Vegas, Nevada, USA',
 'method': 'KO/TKO',
 'name': 'Georges St-Pierre',
 'position': 1,
 'reach': 76.0,
 'referee': 'Herb Dean',
 'result': 'Win',
 'round': 4.0,
 'stance': 'Orthodox',
 'stats': "{'knockouts': {'thrown': 0.0, 'landed': 1.0, 'received': 0.0, 'avoided': 0.0}, 'total str.': {'thrown': 358.0, 'landed': 310.0, 'received': 63.0, 'avoided': 22.0}, 'td': {'thrown': 7.0, 'landed': 4.0, 'received': 0.0, 'avoided': 1.0}, 'submissions': {'thrown': 0.0, 'landed': 0.0, 'received': 0.0, 'avoided': 0.0}, 'sig. str': {'thrown': 122.0, 'landed': 92.0, 'received': 16.0, 'avoided': 21.0}, 'head': {'thrown': 75.0, 'landed': 51.0, 'received': 8.0, 'avoided': 20.0}, 'body': {'thrown': 32.0, 'la