In [25]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
from datetime import timedelta, time, datetime
from py2neo import Graph

In [127]:
#read the fight history, discard disqualifications, renumber the rows,
#and strip apostrophes from the division names (e.g. the womens divisions)
fights = (
    pd.read_csv('fight_hist.csv')
    .loc[lambda df: df.method != 'DQ']
    .reset_index(drop = True)
    .assign(division = lambda df: df.division.str.replace("'", ''))
)

In [128]:
#connect to Neo4j graph (must have Neo4j instance up and running with default settings)
#the password is read from the NEO4J_PASSWORD environment variable so it does not have to live in the notebook;
#when the variable is not set, the local default of 123 is used
import os

graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "123"))

In [129]:
#load data into graph: every row with result 'W' merges both fighter nodes and
#a lose_to relationship pointing from the opponent (b) to the winning fighter (a)
merge_fight = '''
       MERGE (a: fighter {name: $fighter})
       MERGE (b: fighter {name: $opponent})
       MERGE (b)-[r:lose_to {date: $date, division: $division, method: $method}]->(a)
    '''
#columns of fights that are passed to the query under the same parameter name
param_columns = ['fighter', 'opponent', 'date', 'method', 'division']
won = fights[fights.result == 'W']

tx = graph.begin()
for _, bout in won.iterrows():
    tx.evaluate(merge_fight, parameters = {col: bout[col] for col in param_columns})
tx.commit()

In [130]:
#divisions that get a pagerank ranking: every division present in fights except the ones listed here
excluded_divisions = ('Open Weight', 'Catch Weight', 'Super Heavyweight')
divs = [name for name in fights.division.unique() if name not in excluded_divisions]

In [131]:
#run pagerank once per division, using only the relationships whose division property matches
#the division name is spliced into the query text with %, so it must not contain quote characters
#(apostrophes are stripped from fights.division when the csv is loaded)
pagerank_query = '''
CALL algo.pageRank.stream(
"MATCH (fighter) RETURN id(fighter) as id",
"MATCH (opponent) -[lose_to]-> (fighter) where lose_to.division = '%s' RETURN id(opponent) as source, id(fighter) as target",
{graph:'cypher',iterations:50, dampingFactor:0.85})
YIELD nodeId, score
RETURN algo.asNode(nodeId).name AS fighter,score
ORDER BY score DESC'''

#pr maps division name -> list of {'fighter': ..., 'score': ...} records, highest score first
pr = {}
for d in divs:
    #graph.run auto-commits each query, so no explicit transaction is begun and left unfinished
    pr[d] = graph.run(pagerank_query % d).data()

In [132]:
#replace each division's list of records in pr with a two-column dataframe (fighter, pagerank)
for division, records in pr.items():
    frame = pd.DataFrame(records)
    frame.columns = ['fighter', 'pagerank']
    pr[division] = frame

In [133]:
#map each ranked division to the unique names in the fighter column of that division's fights
div_fighters = {
    division: fights.loc[fights.division == division, 'fighter'].unique()
    for division in divs
}

In [157]:
#Heavyweight pageranks, limited to fighters that appear in Heavyweight fights
heavyweight_pr = pr['Heavyweight']
ranks = heavyweight_pr[heavyweight_pr.fighter.isin(div_fighters['Heavyweight'])]
#copy the top 10 to the clipboard and show the same rows as the cell output
top10 = ranks.head(10)
top10.to_clipboard(index = False)
top10

Unnamed: 0,fighter,pagerank
0,Stipe Miocic,12.745001
1,Junior Dos Santos,10.708844
2,Francis Ngannou,7.423609
3,Derrick Lewis,6.569102
4,Cain Velasquez,6.246229
5,Stefan Struve,5.743494
6,Alistair Overeem,5.468971
7,Daniel Cormier,5.361267
8,Andrei Arlovski,5.286756
9,Fabricio Werdum,4.653989


In [135]:
#win ratio per (division, fighter): fights with result 'W' divided by all fights
#ct is a constant 1 so that summing it counts rows
fights['ct'] = 1
by_fighter = ['division', 'fighter']
total_fights = fights.pivot_table(index = by_fighter, values = ['ct'], aggfunc = 'sum')
wins = fights[fights.result == 'W'].pivot_table(index = by_fighter, values = ['ct'], aggfunc = 'sum')

#left merge keeps fighters without any win; their missing wins and win_ratio become 0 in fillna
wr = total_fights.merge(wins, left_index = True, right_index = True, how = 'left', copy = False)
wr.columns = ['total_fights', 'wins']
wr = wr.assign(win_ratio = wr.wins / wr.total_fights).fillna(0).reset_index()

In [136]:
#scale each division's pageranks by the fighter's win ratio in that same division
prs = {}

for division, division_pr in pr.items():
    prs[division] = (
        division_pr
        #attach the win ratio rows of this division to the pagerank rows
        .merge(wr[wr.division == division], on = 'fighter', how = 'left', copy = False)
        .assign(pagerank = lambda df: df.pagerank * df.win_ratio)
        [['fighter', 'pagerank']]
        .sort_values('pagerank', ascending = False)
        .reset_index(drop = True)
    )

In [161]:
#Heavyweight pageranks scaled by win ratio, limited to fighters that appear in Heavyweight fights
heavyweight_prs = prs['Heavyweight']
ranks = heavyweight_prs[heavyweight_prs.fighter.isin(div_fighters['Heavyweight'])]
#copy the top 10 to the clipboard and show the same rows as the cell output
top10 = ranks.head(10)
top10.to_clipboard(index = False)
top10

Unnamed: 0,fighter,pagerank
0,Stipe Miocic,10.355313
1,Junior Dos Santos,8.031633
2,Francis Ngannou,6.073862
3,Cain Velasquez,4.996983
4,Derrick Lewis,4.637013
5,Daniel Cormier,4.289014
6,Alistair Overeem,3.418107
7,Stefan Struve,3.393883
8,Fabricio Werdum,3.199617
9,Andrei Arlovski,2.995828


In [138]:
#show that personalized pagerank gives very different results depending on the sourceNodes chosen
#both criteria select Lightweight fighters with a win ratio of at least 0.9; they differ in the minimum fight count
lightweight = wr[wr.division == 'Lightweight']
high_win_ratio = lightweight[lightweight.win_ratio >= 0.9]

#criteria A: at least 6 fights
print(high_win_ratio[high_win_ratio.total_fights >= 6].fighter.values)

#criteria B: at least 10 fights
print(high_win_ratio[high_win_ratio.total_fights >= 10].fighter.values)

['Gregor Gillespie' 'Khabib Nurmagomedov' 'Tony Ferguson']
['Khabib Nurmagomedov' 'Tony Ferguson']


In [167]:
#criteria B Lightweight rankings
#the bias (sourceNodes) list is the pair of fighters printed for criteria B (total_fights >= 10, win_ratio >= 0.9)
cypher = '''
MATCH (bias) WHERE bias.name in ['Khabib Nurmagomedov', 'Tony Ferguson'] WITH collect(bias) as bias
CALL algo.pageRank.stream(
"MATCH (fighter) RETURN id(fighter) as id",
"MATCH (opponent) -[lose_to]-> (fighter) where lose_to.division = 'Lightweight' RETURN id(opponent) as source, id(fighter) as target",
{graph:'cypher',iterations:50, dampingFactor:0.85, sourceNodes:bias})
YIELD nodeId, score
RETURN algo.asNode(nodeId).name AS fighter,score
ORDER BY score DESC'''
#graph.run auto-commits the query, so no explicit transaction is begun and left unfinished
dat = graph.run(cypher).data()
#copy the top 10 to the clipboard and show the same rows as the cell output
top10 = pd.DataFrame(dat).head(10)
top10.to_clipboard(index = False)
top10

Unnamed: 0,fighter,score
0,Khabib Nurmagomedov,0.17823
1,Tony Ferguson,0.16001
2,Michael Johnson,0.144926
3,Donald Cerrone,0.026529
4,Justin Gaethje,0.020523
5,Nate Diaz,0.020517
6,Beneil Dariush,0.01733
7,Myles Jury,0.016642
8,Paul Sass,0.015783
9,Reza Madadi,0.015398


In [141]:
#criteria A Lightweight rankings
#the bias (sourceNodes) list is the three fighters printed for criteria A (total_fights >= 6, win_ratio >= 0.9)
cypher = '''
MATCH (bias) WHERE bias.name in ['Khabib Nurmagomedov', 'Tony Ferguson', 'Gregor Gillespie'] WITH collect(bias) as bias
CALL algo.pageRank.stream(
"MATCH (fighter) RETURN id(fighter) as id",
"MATCH (opponent) -[lose_to]-> (fighter) where lose_to.division = 'Lightweight' RETURN id(opponent) as source, id(fighter) as target",
{graph:'cypher',iterations:50, dampingFactor:0.85, sourceNodes:bias})
YIELD nodeId, score
RETURN algo.asNode(nodeId).name AS fighter,score
ORDER BY score DESC'''
#graph.run auto-commits the query, so no explicit transaction is begun and left unfinished
dat = graph.run(cypher).data()
#copy the top 10 to the clipboard and show the same rows as the cell output
top10 = pd.DataFrame(dat).head(10)
top10.to_clipboard(index = False)
top10

Unnamed: 0,fighter,score
0,Khabib Nurmagomedov,0.17823
1,Tony Ferguson,0.16001
2,Gregor Gillespie,0.150945
3,Michael Johnson,0.144926
4,Donald Cerrone,0.026529
5,Justin Gaethje,0.020523
6,Nate Diaz,0.020517
7,Beneil Dariush,0.01733
8,Myles Jury,0.016642
9,Paul Sass,0.015783
