In [1]:
import mysql.connector
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import os
from dotenv import load_dotenv
import unicodedata
import re
import pickle

In [2]:
pd.set_option("display.max_columns", None)

Load data from MySQL

In [3]:
load_dotenv()

# Event data
conn = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    database=os.getenv("DB_NAME"),
    password=os.getenv("DB_PASSWORD")
    )

cursor = conn.cursor()

query = ("SELECT * FROM events")

cursor.execute(query)

columns = [desc[0] for desc in cursor.description]

rows = cursor.fetchall()

df_events = pd.DataFrame(rows, columns=columns)

cursor.close()
conn.close()

Preview dataset

In [4]:
df_events.tail()

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue
8346,8347,2025-09-13,UFC Fight Night: Lopes vs. Silva,Kelvin Gastelum,Dustin Stoltzfus,3,300,Middleweight,Decision - Unanimous,Kelvin Gastelum,Southpaw,Orthodox,1.0,0.0,117.0,146.0,57.0,64.0,118.0,151.0,58.0,69.0,0.0,0.0,0.0,1.0,0.0,6.0,23,120,30.0,25.0,81.0,93.0,16.0,25.0,24.0,34.0,11.0,14.0,12.0,19.0,56.0,55.0,116.0,134.0,1.0,4.0,1.0,5.0,0.0,5.0,0.0,7.0
8347,8348,2025-09-13,UFC Fight Night: Lopes vs. Silva,Diego Lopes,Jean Silva,2,288,Featherweight,KO/TKO,Diego Lopes,Orthodox,Orthodox,1.0,0.0,135.0,91.0,74.0,43.0,154.0,91.0,86.0,43.0,0.0,0.0,3.0,0.0,4.0,0.0,160,3,63.0,26.0,118.0,67.0,1.0,10.0,1.0,17.0,10.0,7.0,16.0,7.0,31.0,42.0,65.0,90.0,1.0,1.0,2.0,1.0,42.0,0.0,68.0,0.0
8348,8349,2025-09-13,UFC Fight Night: Lopes vs. Silva,Rafa Garcia,Jared Gordon,3,147,Lightweight,KO/TKO,Rafa Garcia,Orthodox,Orthodox,1.0,0.0,162.0,186.0,91.0,65.0,179.0,201.0,107.0,80.0,2.0,0.0,2.0,1.0,3.0,3.0,121,9,74.0,55.0,144.0,171.0,7.0,8.0,7.0,13.0,10.0,2.0,11.0,2.0,70.0,65.0,137.0,186.0,10.0,0.0,11.0,0.0,11.0,0.0,14.0,0.0
8349,8350,2025-09-13,UFC Fight Night: Lopes vs. Silva,Rob Font,David Martinez,3,300,Bantamweight,Decision - Unanimous,David Martinez,Orthodox,Orthodox,0.0,0.0,158.0,155.0,49.0,62.0,158.0,210.0,49.0,115.0,0.0,0.0,0.0,0.0,5.0,2.0,59,33,40.0,33.0,138.0,115.0,7.0,10.0,17.0,16.0,2.0,19.0,3.0,24.0,49.0,62.0,158.0,155.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8350,8351,2025-09-13,UFC Fight Night: Lopes vs. Silva,Alexander Hernandez,Diego Ferreira,2,226,Lightweight,KO/TKO,Alexander Hernandez,Orthodox,Orthodox,1.0,0.0,75.0,98.0,30.0,23.0,75.0,98.0,30.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,12,0,22.0,8.0,63.0,56.0,4.0,11.0,6.0,35.0,4.0,4.0,6.0,7.0,20.0,23.0,61.0,96.0,0.0,0.0,0.0,2.0,10.0,0.0,14.0,0.0


In [5]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8351 entries, 0 to 8350
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           8351 non-null   int64  
 1   event_date                   8351 non-null   object 
 2   event_name                   8351 non-null   object 
 3   fighter_red                  8330 non-null   object 
 4   fighter_blue                 8330 non-null   object 
 5   round                        8351 non-null   int64  
 6   time                         8351 non-null   int64  
 7   weight_class                 8351 non-null   object 
 8   win_method                   8351 non-null   object 
 9   winner                       8203 non-null   object 
 10  stance_red                   8309 non-null   object 
 11  stance_blue                  8275 non-null   object 
 12  knockdowns_red               8330 non-null   float64
 13  knockdowns_blue   

In [6]:
df_events.describe()

Unnamed: 0,id,round,time,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue
count,8351.0,8351.0,8351.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8351.0,8351.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0,8330.0
mean,4176.0,2.353491,228.068255,0.244178,0.183313,85.076471,80.065906,39.14934,34.617647,107.656783,98.743337,58.857743,50.915966,0.437095,0.318727,1.209844,0.906122,2.959424,2.690036,150.103101,110.757155,24.859184,21.59916,65.898679,62.238535,8.067227,7.178271,11.502281,10.516687,6.222929,5.840216,7.67551,7.310684,27.993277,26.157383,69.065066,67.734694,5.183313,4.70036,7.361945,6.832413,5.972749,3.759904,8.64946,5.4988
std,2410.870382,1.017717,97.290103,0.515732,0.462948,71.523906,69.553859,33.109188,31.39924,79.741669,77.187876,45.868001,42.553679,0.876278,0.74709,1.80497,1.535688,3.75895,3.71951,190.135555,153.100604,23.0779,21.913025,58.073792,56.937247,9.011648,8.251016,12.391474,11.436756,8.078072,7.640424,9.886654,9.379846,30.112646,28.118625,69.153606,66.35406,7.66597,6.787498,10.373756,9.241377,9.830627,7.836238,14.24606,11.05272
min,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2088.5,1.0,152.0,0.0,0.0,29.0,25.0,14.0,10.0,41.0,34.0,22.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,8.0,5.0,22.0,18.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,6.0,5.0,16.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,4176.0,3.0,291.0,0.0,0.0,68.0,63.0,31.0,26.0,96.0,86.0,51.0,42.0,0.0,0.0,1.0,0.0,1.0,1.0,71.0,43.0,19.0,15.0,52.0,47.0,5.0,5.0,8.0,7.0,3.0,3.0,4.0,4.0,18.0,17.0,47.0,47.0,2.0,2.0,4.0,3.0,2.0,1.0,3.0,1.0
75%,6263.5,3.0,300.0,0.0,0.0,122.0,117.0,56.0,50.0,157.0,146.0,84.0,74.0,1.0,0.0,2.0,1.0,4.0,4.0,226.0,163.0,35.0,31.0,93.0,90.0,12.0,10.0,17.0,15.0,9.0,8.0,11.0,10.0,41.0,38.0,102.0,100.0,7.0,6.0,10.0,9.0,8.0,4.0,11.0,6.0
max,8351.0,5.0,1080.0,5.0,6.0,744.0,510.0,445.0,241.0,746.0,567.0,447.0,529.0,10.0,7.0,21.0,12.0,30.0,49.0,1338.0,1300.0,274.0,187.0,553.0,437.0,117.0,92.0,133.0,112.0,78.0,95.0,101.0,102.0,439.0,225.0,737.0,504.0,95.0,78.0,115.0,89.0,97.0,136.0,141.0,163.0


In [7]:
df_events.isnull().sum()

id                               0
event_date                       0
event_name                       0
fighter_red                     21
fighter_blue                    21
round                            0
time                             0
weight_class                     0
win_method                       0
winner                         148
stance_red                      42
stance_blue                     76
knockdowns_red                  21
knockdowns_blue                 21
sig_attempts_red                21
sig_attempts_blue               21
sig_strikes_red                 21
sig_strikes_blue                21
total_strikes_attempts_red      21
total_strikes_attempts_blue     21
total_strikes_red               21
total_strikes_blue              21
sub_attempts_red                21
sub_attempts_blue               21
takedowns_red                   21
takedowns_blue                  21
takedown_attempts_red           21
takedown_attempts_blue          21
control_time_red    

Convert date to datetime

In [8]:
df_events['event_date'] = pd.to_datetime(df_events['event_date'], format="%Y-%m-%d")

Find Duplicates

In [9]:
df_events[df_events.duplicated(keep=False)]

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue


In [10]:
df_events = df_events.drop_duplicates()

Find missing values

In [11]:
missing_df = pd.DataFrame({
    'missing_count': df_events.isnull().sum(),
    'missing_percent': round((df_events.isnull().sum() / len(df_events)) * 100, 2)
}).sort_values(by='missing_percent', ascending=False)

missing_df

Unnamed: 0,missing_count,missing_percent
winner,148,1.77
stance_blue,76,0.91
stance_red,42,0.5
takedown_attempts_blue,21,0.25
distance_red,21,0.25
head_attempts_red,21,0.25
head_attempts_blue,21,0.25
body_strikes_red,21,0.25
body_strikes_blue,21,0.25
body_attempts_red,21,0.25


Remove null values from events that also on the official site do not have any stats.

Both fighters are null values.

In [12]:
df_events[(df_events['fighter_red'].isna()) & (df_events['fighter_blue'].isna())]

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue
22,23,1994-12-16,UFC 4: Revenge of the Warriors,,,1,14,Open Weight,Submission,Joe Charles,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
27,28,1994-12-16,UFC 4: Revenge of the Warriors,,,1,295,Open Weight,KO/TKO,Marcus Bossett,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
44,45,1995-07-14,UFC 6: Clash of the Titans,,,1,186,Open Weight,KO/TKO,Anthony Macias,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
47,48,1995-07-14,UFC 6: Clash of the Titans,,,1,121,Open Weight,KO/TKO,Joel Sutton,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
52,53,1995-09-08,UFC 7: The Brawl in Buffalo,,,1,326,Open Weight,KO/TKO,Onassis Parungao,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
61,62,1995-09-08,UFC 7: The Brawl in Buffalo,,,1,48,Open Weight,TKO - Doctor's Stoppage,Joel Sutton,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
62,63,1995-12-16,UFC - Ultimate Ultimate '95,,,1,329,Open Weight,Submission,Mark Hall,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
70,71,1995-12-16,UFC - Ultimate Ultimate '95,,,1,278,Open Weight,Submission,Joe Charles,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
71,72,1996-02-16,UFC 8: David vs Goliath,,,1,50,Open Weight,KO/TKO,Sam Adkins,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,
87,88,1996-07-12,UFC 10: The Tournament,,,2,180,Open Weight,Decision - Unanimous,Sam Adkins,,,,,,,,,,,,,,,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
df_events = df_events[~((df_events['fighter_red'].isna()) & (df_events['fighter_blue'].isna()))]

In [14]:
missing_df = pd.DataFrame({
    'missing_count': df_events.isnull().sum(),
    'missing_percent': round((df_events.isnull().sum() / len(df_events)) * 100, 2)
}).sort_values(by='missing_percent', ascending=False)

missing_df

Unnamed: 0,missing_count,missing_percent
winner,148,1.78
stance_blue,55,0.66
stance_red,21,0.25
id,0,0.0
leg_attempts_blue,0,0.0
head_strikes_blue,0,0.0
head_attempts_red,0,0.0
head_attempts_blue,0,0.0
body_strikes_red,0,0.0
body_strikes_blue,0,0.0


Drop fights where winner is null. These fight outcomes rarely happen and are usually contributed to fighters that can not continue the fight or draws.

In [15]:
df_events[df_events[['winner']].isnull().all(axis=1)].tail(10)

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue
7401,7402,2023-11-11,UFC 295: Prochazka vs. Pereira,Nazim Sadykhov,Viacheslav Borshchev,3,300,Lightweight,Decision - Majority,,Southpaw,Orthodox,1.0,0.0,207.0,241.0,91.0,143.0,228.0,242.0,109.0,144.0,0.0,0.0,4.0,0.0,7.0,0.0,212,10,65.0,87.0,172.0,178.0,19.0,30.0,26.0,34.0,7.0,26.0,9.0,29.0,66.0,141.0,175.0,239.0,5.0,2.0,8.0,2.0,20.0,0.0,24.0,0.0
7421,7422,2023-11-18,UFC Fight Night: Allen vs. Craig,Trey Ogden,Nikolas Motta,3,191,Lightweight,Overturned,,Orthodox,Orthodox,0.0,0.0,83.0,80.0,58.0,14.0,128.0,82.0,96.0,16.0,1.0,0.0,3.0,0.0,16.0,0.0,268,0,49.0,8.0,74.0,73.0,6.0,5.0,6.0,6.0,3.0,1.0,3.0,1.0,44.0,14.0,65.0,80.0,13.0,0.0,17.0,0.0,1.0,0.0,1.0,0.0
7487,7488,2024-02-03,UFC Fight Night: Dolidze vs. Imavov,Aliaskhab Khizriev,Makhmud Muradov,1,11,Middleweight,Could Not Continue,,Southpaw,Orthodox,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7494,7495,2024-02-10,UFC Fight Night: Hermansson vs. Pyfer,Daniel Marcos,Aoriqileng,2,208,Bantamweight,Could Not Continue,,Orthodox,Orthodox,0.0,0.0,126.0,95.0,81.0,45.0,129.0,96.0,84.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,28,38,50.0,26.0,84.0,71.0,12.0,10.0,19.0,12.0,19.0,9.0,23.0,12.0,69.0,43.0,113.0,93.0,5.0,2.0,6.0,2.0,7.0,0.0,7.0,0.0
7556,7557,2024-03-16,UFC Fight Night: Tuivasa vs. Tybura,Bryan Battle,Ange Loosa,2,60,Welterweight,Could Not Continue,,Orthodox,Orthodox,0.0,0.0,75.0,54.0,27.0,23.0,78.0,58.0,28.0,26.0,0.0,0.0,1.0,0.0,2.0,1.0,93,0,16.0,6.0,56.0,34.0,10.0,8.0,18.0,9.0,1.0,9.0,1.0,11.0,24.0,22.0,69.0,53.0,3.0,1.0,5.0,1.0,0.0,0.0,1.0,0.0
7732,7733,2024-07-13,UFC Fight Night: Namajunas vs. Cortez,Abdul Razak Alhassan,Cody Brundage,1,37,Middleweight,Could Not Continue,,Orthodox,Orthodox,0.0,0.0,30.0,2.0,23.0,1.0,35.0,2.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,27,3,23.0,1.0,30.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,4.0,0.0,4.0,0.0,19.0,0.0,26.0,0.0
8019,8020,2025-02-08,UFC 312: Du Plessis vs. Strickland 2,Jimmy Crute,Rodolfo Bellato,3,300,Light Heavyweight,Decision - Majority,,Orthodox,Orthodox,1.0,0.0,209.0,141.0,106.0,89.0,247.0,148.0,139.0,95.0,0.0,0.0,0.0,1.0,3.0,2.0,232,29,93.0,36.0,190.0,76.0,8.0,24.0,10.0,29.0,5.0,29.0,9.0,36.0,77.0,85.0,170.0,136.0,11.0,4.0,13.0,5.0,18.0,0.0,26.0,0.0
8204,8205,2025-06-14,UFC Fight Night: Usman vs. Buckley,Mansur Abdul-Malik,Cody Brundage,3,36,Middleweight,Overturned,,Orthodox,Orthodox,0.0,0.0,73.0,67.0,25.0,30.0,73.0,67.0,25.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,7,1,17.0,18.0,55.0,49.0,3.0,2.0,13.0,8.0,5.0,10.0,5.0,10.0,18.0,29.0,60.0,66.0,0.0,1.0,0.0,1.0,7.0,0.0,13.0,0.0
8208,8209,2025-06-14,UFC Fight Night: Usman vs. Buckley,Paul Craig,Rodolfo Bellato,1,299,Light Heavyweight,Could Not Continue,,Orthodox,Orthodox,0.0,0.0,27.0,20.0,17.0,10.0,52.0,27.0,40.0,16.0,0.0,0.0,0.0,1.0,6.0,1.0,34,55,7.0,2.0,16.0,11.0,10.0,2.0,11.0,2.0,0.0,6.0,0.0,7.0,14.0,9.0,23.0,17.0,2.0,0.0,3.0,0.0,1.0,1.0,1.0,3.0
8338,8339,2025-09-13,UFC Fight Night: Lopes vs. Silva,Zachary Reese,Sedriques Dumas,1,51,Middleweight,Could Not Continue,,Switch,Orthodox,0.0,0.0,7.0,10.0,2.0,3.0,7.0,10.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,3.0,8.0,1.0,0.0,3.0,0.0,1.0,2.0,1.0,2.0,2.0,3.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the fights where there are no winners per year

In [16]:
# Extract year
df_events = df_events.copy()
df_events['year'] = df_events['event_date'].dt.year

# Total fights per year
total_fights = df_events.groupby('year').size()

# Total no-winner fights per year (count where winner is null)
no_winners = df_events.groupby('year')['winner'].apply(lambda x: x.isnull().sum())

# Average of no winners per year (ratio)
avg_no_winners = no_winners / total_fights

# Combine into summary DataFrame
df_summary = pd.DataFrame({
    'year': total_fights.index,
    'avg_no_winners': avg_no_winners.values,
    'total_fights': total_fights.values,
    'total_no_winners': no_winners.values
}).reset_index(drop=True)

df_summary

Unnamed: 0,year,avg_no_winners,total_fights,total_no_winners
0,1994,0.0,29,0
1,1995,0.058824,34,2
2,1996,0.0,36,0
3,1997,0.025641,39,1
4,1998,0.0,21,0
5,1999,0.045455,44,2
6,2000,0.023256,43,1
7,2001,0.025,40,1
8,2002,0.018868,53,1
9,2003,0.04878,41,2


In [17]:
df_events = df_events[~df_events[['winner']].isnull().all(axis=1)]

Fill missing stance values with the highest occuring one

In [18]:
df_events['stance_blue'].unique()

array([None, 'Southpaw', 'Orthodox', 'Open Stance', 'Sideways', 'Switch'],
      dtype=object)

In [19]:
df_events['stance_red'].unique()

array(['Southpaw', 'Orthodox', None, 'Sideways', 'Switch', 'Open Stance'],
      dtype=object)

In [20]:
df_events[df_events[['stance_blue']].isnull().all(axis=1)].tail(5)

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue,year
3592,3593,2016-02-27,UFC Fight Night: Silva vs Bisping,David Teymur,Martin Svensson,2,86,Lightweight,KO/TKO,David Teymur,Southpaw,,1.0,0.0,62.0,34.0,38.0,13.0,65.0,38.0,41.0,17.0,0.0,0.0,1.0,0.0,1.0,4.0,28,19,15.0,3.0,34.0,22.0,3.0,5.0,5.0,7.0,20.0,5.0,23.0,5.0,34.0,12.0,57.0,32.0,0.0,0.0,0.0,1.0,4.0,1.0,5.0,1.0,2016
3730,3731,2016-07-07,UFC Fight Night: Dos Anjos vs. Alvarez,Anthony Birchak,Dileno Lopes,3,300,Bantamweight,Decision - Split,Anthony Birchak,Orthodox,,0.0,0.0,112.0,87.0,41.0,25.0,175.0,98.0,101.0,35.0,0.0,0.0,0.0,2.0,0.0,12.0,139,310,16.0,22.0,74.0,74.0,18.0,2.0,29.0,10.0,7.0,1.0,9.0,3.0,30.0,18.0,86.0,75.0,9.0,7.0,21.0,12.0,2.0,0.0,5.0,0.0,2016
3743,3744,2016-07-08,The Ultimate Fighter: Team Joanna vs. Team Clá...,Gray Maynard,Fernando Bruno,3,300,Featherweight,Decision - Unanimous,Gray Maynard,Orthodox,,0.0,0.0,71.0,101.0,30.0,32.0,85.0,119.0,42.0,47.0,1.0,1.0,3.0,2.0,7.0,7.0,402,148,18.0,26.0,58.0,85.0,8.0,6.0,9.0,16.0,4.0,0.0,4.0,0.0,13.0,27.0,41.0,83.0,9.0,5.0,19.0,18.0,8.0,0.0,11.0,0.0,2016
3752,3753,2016-07-09,UFC 200: Tate vs Nunes,Sage Northcutt,Enrique Marin,3,300,Lightweight,Decision - Unanimous,Sage Northcutt,Orthodox,,0.0,0.0,66.0,36.0,41.0,16.0,135.0,48.0,106.0,26.0,1.0,2.0,0.0,4.0,2.0,10.0,262,448,36.0,12.0,59.0,32.0,3.0,2.0,5.0,2.0,2.0,2.0,2.0,2.0,14.0,12.0,33.0,31.0,13.0,2.0,17.0,3.0,14.0,2.0,16.0,2.0,2016
3801,3802,2016-08-06,UFC Fight Night: Rodriguez vs. Caceres,Teruto Ishihara,Horacio Gutierrez,1,152,Featherweight,KO/TKO,Teruto Ishihara,Southpaw,,1.0,0.0,25.0,18.0,13.0,5.0,25.0,18.0,13.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,6,0,7.0,1.0,17.0,14.0,3.0,0.0,5.0,0.0,3.0,4.0,3.0,4.0,8.0,5.0,19.0,18.0,0.0,0.0,0.0,0.0,5.0,0.0,6.0,0.0,2016


Build a fighter dataframe from both red and blue stance columns

In [21]:
df_fighters_red = df_events[['fighter_red', 'stance_red']].rename(
    columns={'fighter_red': 'fighter', 'stance_red': 'stance'}
)
df_fighters_blue = df_events[['fighter_blue', 'stance_blue']].rename(
    columns={'fighter_blue': 'fighter', 'stance_blue': 'stance'}
)

# Combine into one fighter dataframe
df_fighters = pd.concat([df_fighters_red, df_fighters_blue], ignore_index=True)

# Drop duplicate fighters (keep their first known stance)
df_fighters = df_fighters.drop_duplicates(subset=['fighter'], keep='first')

# Count unique stances across fighters
stance_counts = df_fighters['stance'].value_counts(dropna=True)

print("Unique stance counts across fighters:\n", stance_counts)

Unique stance counts across fighters:
 stance
Orthodox       1938
Southpaw        436
Switch          150
Open Stance       6
Sideways          3
Name: count, dtype: int64


In [22]:
top_stances = stance_counts.index[:1].tolist()

df_events['stance_red'] = df_events['stance_red'].fillna(top_stances[0])
df_events['stance_blue'] = df_events['stance_blue'].fillna(top_stances[0])

In [23]:
df_events.isnull().sum()

id                             0
event_date                     0
event_name                     0
fighter_red                    0
fighter_blue                   0
round                          0
time                           0
weight_class                   0
win_method                     0
winner                         0
stance_red                     0
stance_blue                    0
knockdowns_red                 0
knockdowns_blue                0
sig_attempts_red               0
sig_attempts_blue              0
sig_strikes_red                0
sig_strikes_blue               0
total_strikes_attempts_red     0
total_strikes_attempts_blue    0
total_strikes_red              0
total_strikes_blue             0
sub_attempts_red               0
sub_attempts_blue              0
takedowns_red                  0
takedowns_blue                 0
takedown_attempts_red          0
takedown_attempts_blue         0
control_time_red               0
control_time_blue              0
head_strik

Convert numerical values to int

In [24]:
for col in df_events.select_dtypes(include=['number']).columns:
    df_events[col] = df_events[col].astype('int64')

Convert winner column to 1 if winner is red or 0 if winner is blue

In [25]:
df_events['winner'] = (df_events['winner'] == df_events['fighter_red']).astype(int)
df_events.tail()

Unnamed: 0,id,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue,year
8346,8347,2025-09-13,UFC Fight Night: Lopes vs. Silva,Kelvin Gastelum,Dustin Stoltzfus,3,300,Middleweight,Decision - Unanimous,1,Southpaw,Orthodox,1,0,117,146,57,64,118,151,58,69,0,0,0,1,0,6,23,120,30,25,81,93,16,25,24,34,11,14,12,19,56,55,116,134,1,4,1,5,0,5,0,7,2025
8347,8348,2025-09-13,UFC Fight Night: Lopes vs. Silva,Diego Lopes,Jean Silva,2,288,Featherweight,KO/TKO,1,Orthodox,Orthodox,1,0,135,91,74,43,154,91,86,43,0,0,3,0,4,0,160,3,63,26,118,67,1,10,1,17,10,7,16,7,31,42,65,90,1,1,2,1,42,0,68,0,2025
8348,8349,2025-09-13,UFC Fight Night: Lopes vs. Silva,Rafa Garcia,Jared Gordon,3,147,Lightweight,KO/TKO,1,Orthodox,Orthodox,1,0,162,186,91,65,179,201,107,80,2,0,2,1,3,3,121,9,74,55,144,171,7,8,7,13,10,2,11,2,70,65,137,186,10,0,11,0,11,0,14,0,2025
8349,8350,2025-09-13,UFC Fight Night: Lopes vs. Silva,Rob Font,David Martinez,3,300,Bantamweight,Decision - Unanimous,0,Orthodox,Orthodox,0,0,158,155,49,62,158,210,49,115,0,0,0,0,5,2,59,33,40,33,138,115,7,10,17,16,2,19,3,24,49,62,158,155,0,0,0,0,0,0,0,0,2025
8350,8351,2025-09-13,UFC Fight Night: Lopes vs. Silva,Alexander Hernandez,Diego Ferreira,2,226,Lightweight,KO/TKO,1,Orthodox,Orthodox,1,0,75,98,30,23,75,98,30,23,0,0,0,0,0,0,12,0,22,8,63,56,4,11,6,35,4,4,6,7,20,23,61,96,0,0,0,2,10,0,14,0,2025


Normalize weight classes

In [26]:
df_events["weight_class"].value_counts()

weight_class
Lightweight                                          1331
Welterweight                                         1275
Middleweight                                         1030
Featherweight                                         771
Bantamweight                                          687
                                                     ... 
Ultimate Fighter 10 Heavyweight Tournament Title        1
Ultimate Fighter 9 Lightweight Tournament Title         1
Ultimate Fighter 9 Welterweight Tournament Title        1
Ultimate Fighter 8 Lightweight Tournament Title         1
Ultimate Fighter 33 Welterweight Tournament Title       1
Name: count, Length: 120, dtype: int64

In [27]:
def clean_text(val: str) -> str:
    if not isinstance(val, str):
        return val
    val = val.strip().lower()  # lowercase + trim
    val = re.sub(r"\s+", " ", val)  # collapse multiple spaces
    val = val.replace("womens", "women's")  # normalize missing apostrophe
    val = val.replace("women ", "women's ") # normalize if missing "'s"
    return val

# master map of cleaned values → canonical
mapping = {
    "lightweight": "Lightweight",
    "welterweight": "Welterweight",
    "middleweight": "Middleweight",
    "featherweight": "Featherweight",
    "bantamweight": "Bantamweight",
    "heavyweight": "Heavyweight",
    "light heavyweight": "Light Heavyweight",
    "flyweight": "Flyweight",
    "women's strawweight": "Women's Strawweight",
    "women's flyweight": "Women's Flyweight",
    "women's bantamweight": "Women's Bantamweight",
    "open weight": "Open Weight",
    "catch weight": "Catch Weight"
}

def normalize_weight_class(val):
    # clean first
    cleaned = clean_text(val)

    # if it matches exactly after cleaning
    if cleaned in mapping:
        return mapping[cleaned]
    
    # fuzzy keyword matching for tournament-style
    if "heavyweight" in cleaned and "light" not in cleaned:
        return "Heavyweight"
    elif "lightweight" in cleaned and "feather" not in cleaned:
        return "Lightweight"
    elif "middleweight" in cleaned:
        return "Middleweight"
    elif "featherweight" in cleaned:
        return "Featherweight"
    elif "bantamweight" in cleaned:
        # if it’s a women’s version
        if "women" in cleaned:
            return "Women's Bantamweight"
        return "Bantamweight"
    elif "flyweight" in cleaned:
        if "women" in cleaned:
            return "Women's Flyweight"
        return "Flyweight"
    elif "strawweight" in cleaned:
        return "Women's Strawweight"
    
    # default
    return "Open Weight"


In [28]:
df_events["weight_class"] = df_events["weight_class"].apply(normalize_weight_class)

In [29]:
df_events["weight_class"].value_counts()

weight_class
Lightweight             1381
Welterweight            1275
Middleweight            1084
Featherweight            833
Heavyweight              727
Bantamweight             720
Light Heavyweight        653
Flyweight                384
Women's Strawweight      346
Women's Flyweight        257
Women's Bantamweight     227
Open Weight              223
Catch Weight              72
Name: count, dtype: int64

Drop columns and reset index

In [30]:
df_events.drop(columns=['id', 'year'], axis=1, inplace=True)

df_events = df_events.reset_index(drop=True)

Normalize names for future matching

In [31]:
def normalize_name(name):

    if pd.isna(name):
        return ""
    
    # Convert to string if not already
    name = str(name)

    # Remove extra whitespace
    #name = name.strip()

    # Normalize unicode characters
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))

    # Convert to lower
    name = name.lower()

    # Remove apostrophes and replace with nothing or space
    #name = re.sub(r"'", "", name)

    # Remove periods and other punctuation
    name = re.sub(r'[^\w\s-]', '', name)

    # Normalize spaces (multiple spaces to single space)
    name = re.sub(r'\s+', ' ', name)

    words = name.split()

    return ' '.join(words).strip()

In [32]:
df_events[["fighter_red", "fighter_blue"]] = df_events[["fighter_red", "fighter_blue"]].map(normalize_name)

In [33]:
df_events

Unnamed: 0,event_date,event_name,fighter_red,fighter_blue,round,time,weight_class,win_method,winner,stance_red,stance_blue,knockdowns_red,knockdowns_blue,sig_attempts_red,sig_attempts_blue,sig_strikes_red,sig_strikes_blue,total_strikes_attempts_red,total_strikes_attempts_blue,total_strikes_red,total_strikes_blue,sub_attempts_red,sub_attempts_blue,takedowns_red,takedowns_blue,takedown_attempts_red,takedown_attempts_blue,control_time_red,control_time_blue,head_strikes_red,head_strikes_blue,head_attempts_red,head_attempts_blue,body_strikes_red,body_strikes_blue,body_attempts_red,body_attempts_blue,leg_strikes_red,leg_strikes_blue,leg_attempts_red,leg_attempts_blue,distance_red,distance_blue,distance_attempts_red,distance_attempts_blue,clinch_strikes_red,clinch_strikes_blue,clinch_attempts_red,clinch_attempts_blue,ground_strikes_red,ground_strikes_blue,ground_attempts_red,ground_attempts_blue
0,1994-03-11,UFC 2: No Way Out,jason delucia,scott baker,1,401,Open Weight,Submission,1,Southpaw,Orthodox,0,0,5,2,3,0,25,23,20,14,5,0,0,1,1,1,0,0,1,0,2,2,1,0,2,0,1,0,1,0,2,0,4,2,0,0,0,0,1,0,1,0
1,1994-03-11,UFC 2: No Way Out,royce gracie,remco pardoel,1,91,Open Weight,Submission,1,Southpaw,Southpaw,0,0,0,0,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1994-03-11,UFC 2: No Way Out,patrick smith,johnny rhodes,1,67,Open Weight,Submission,1,Orthodox,Orthodox,0,0,12,9,5,4,12,9,5,4,1,0,0,0,0,0,0,0,1,2,4,5,2,0,2,0,2,2,6,4,3,4,10,9,2,0,2,0,0,0,0,0
3,1994-03-11,UFC 2: No Way Out,frank hamaker,thaddeus luster,1,292,Open Weight,Submission,1,Orthodox,Orthodox,0,0,3,0,2,0,15,0,14,0,3,0,1,0,1,1,0,0,2,0,3,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,2,0
4,1994-03-11,UFC 2: No Way Out,patrick smith,ray wizard,1,58,Open Weight,Submission,1,Orthodox,Orthodox,0,0,1,1,1,1,1,2,1,2,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8177,2025-09-13,UFC Fight Night: Lopes vs. Silva,kelvin gastelum,dustin stoltzfus,3,300,Middleweight,Decision - Unanimous,1,Southpaw,Orthodox,1,0,117,146,57,64,118,151,58,69,0,0,0,1,0,6,23,120,30,25,81,93,16,25,24,34,11,14,12,19,56,55,116,134,1,4,1,5,0,5,0,7
8178,2025-09-13,UFC Fight Night: Lopes vs. Silva,diego lopes,jean silva,2,288,Featherweight,KO/TKO,1,Orthodox,Orthodox,1,0,135,91,74,43,154,91,86,43,0,0,3,0,4,0,160,3,63,26,118,67,1,10,1,17,10,7,16,7,31,42,65,90,1,1,2,1,42,0,68,0
8179,2025-09-13,UFC Fight Night: Lopes vs. Silva,rafa garcia,jared gordon,3,147,Lightweight,KO/TKO,1,Orthodox,Orthodox,1,0,162,186,91,65,179,201,107,80,2,0,2,1,3,3,121,9,74,55,144,171,7,8,7,13,10,2,11,2,70,65,137,186,10,0,11,0,11,0,14,0
8180,2025-09-13,UFC Fight Night: Lopes vs. Silva,rob font,david martinez,3,300,Bantamweight,Decision - Unanimous,0,Orthodox,Orthodox,0,0,158,155,49,62,158,210,49,115,0,0,0,0,5,2,59,33,40,33,138,115,7,10,17,16,2,19,3,24,49,62,158,155,0,0,0,0,0,0,0,0


In [34]:
df_events.to_csv('../data/notebooks/events_cleaned.csv', index=False)

Review Stats

In [35]:
# Stats data
conn = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    database=os.getenv("DB_NAME"),
    password=os.getenv("DB_PASSWORD")
    )

cursor = conn.cursor()

query = ("SELECT * FROM stats")

cursor.execute(query)

columns = [desc[0] for desc in cursor.description]

rows = cursor.fetchall()

df_stats = pd.DataFrame(rows, columns=columns)

cursor.close()
conn.close()

In [36]:
df_stats.tail()

Unnamed: 0,id,name,nickname,division,record,status,place_of_birth,trains_at,fighting_style,octagon_debut,age,height,weight,reach,leg_reach,wins,losses,draws,wins_by_knockout,first_round_finishes,win_by_dec,win_by_sub,sig_strikes_landed,sig_strikes_attempted,takedowns_landed,takedowns_attempted,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,sig_strikes_defense,takedown_defense,knockdown_avg,fight_time_avg,sig_strikes_standing,sig_strikes_clinch,sig_strikes_ground,head_target,body_target,leg_target
3075,3076,Zhu Kangjie,,Featherweight Division,3-0-0 (W-L-D),Active,China,,,"Nov. 23, 2024",29.0,,146.0,,,3,0,0,,,0.0,0.0,130.0,269.0,,1.0,3.0,1.0,0.33,0.0,70.0,51.0,0.67,15:00,99.0,15.0,16.0,78.0,38.0,14.0
3076,3077,Zu Anyanwu,"""8th Wonder""",Heavyweight Division,14-5-0 (W-L-D),Not Fighting,"Trenton, United States",,,"Jul. 11, 2017",41.0,73.0,263.5,77.0,41.0,14,5,0,,,4.0,2.0,57.0,133.0,,1.0,2.0,2.0,0.65,0.0,69.0,,0.65,11:32,48.0,3.0,6.0,46.0,11.0,0.0
3077,3078,Zubaira Tukhugov,"""Warrior""",Lightweight Division,20-6-1 (W-L-D),Not Fighting,"USSR, Russia","Tiger Muay Thai - Phuket, Thailand",Striker,"Feb. 15, 2014",32.0,68.0,157.5,68.0,38.5,20,6,1,7.0,7.0,12.0,1.0,357.0,894.0,7.0,41.0,3.0,3.0,2.13,0.0,64.0,100.0,0.4,12:31,302.0,24.0,31.0,312.0,36.0,9.0
3078,3079,Zviad Lazishvili,,Bantamweight Division,13-1-0 (W-L-D),Not Fighting,"Kobuleti, Georgia",Kaizen MMA,Freestyle,"Oct. 23, 2021",31.0,66.0,135.0,69.0,37.0,13,1,0,,,4.0,9.0,63.0,160.0,,6.0,4.0,6.0,0.0,0.0,51.0,,0.0,15:00,60.0,3.0,0.0,25.0,17.0,21.0
3079,3080,Zygimantas Ramaska,,Featherweight Division,9-3-0 (W-L-D),Not Fighting,Lithuania,Ukmergė Judo Club,Judo,"Aug. 24, 2024",28.0,71.0,145.5,73.0,,9,3,0,5.0,5.0,0.0,4.0,6.0,10.0,,,1.0,2.0,0.0,2.41,30.0,33.0,0.0,06:14,2.0,4.0,0.0,5.0,1.0,0.0


In [37]:
df_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080 entries, 0 to 3079
Data columns (total 40 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               3080 non-null   int64  
 1   name                             3080 non-null   object 
 2   nickname                         1828 non-null   object 
 3   division                         2968 non-null   object 
 4   record                           3040 non-null   object 
 5   status                           3050 non-null   object 
 6   place_of_birth                   2895 non-null   object 
 7   trains_at                        995 non-null    object 
 8   fighting_style                   1037 non-null   object 
 9   octagon_debut                    3080 non-null   object 
 10  age                              2820 non-null   float64
 11  height                           2769 non-null   float64
 12  weight              

In [38]:
df_stats.describe()

Unnamed: 0,id,age,height,weight,reach,leg_reach,wins,losses,draws,wins_by_knockout,first_round_finishes,win_by_dec,win_by_sub,sig_strikes_landed,sig_strikes_attempted,takedowns_landed,takedowns_attempted,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,sig_strikes_defense,takedown_defense,knockdown_avg,sig_strikes_standing,sig_strikes_clinch,sig_strikes_ground,head_target,body_target,leg_target
count,3080.0,2820.0,2769.0,2970.0,1930.0,1646.0,3080.0,3080.0,3080.0,1232.0,890.0,3028.0,3028.0,2784.0,2784.0,918.0,2488.0,2793.0,2793.0,2793.0,2793.0,2781.0,2345.0,2793.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0
mean,1540.5,36.05461,65.632358,157.542357,71.588342,39.903402,11.451948,4.369156,0.163312,6.299513,6.308989,2.483157,2.375165,245.544181,548.617098,7.191721,21.246785,3.230576,3.879341,1.476735,0.607211,52.197411,61.520682,0.403183,163.706407,31.088177,30.963342,142.369221,46.751982,36.636724
std,889.26374,7.115915,17.545453,51.085902,4.304879,2.615147,8.713976,3.75471,0.579181,4.056514,3.782921,3.460005,3.552049,313.049579,692.579118,10.248188,28.103981,2.030173,2.472364,1.71945,1.197645,10.574787,21.249445,2.027454,238.490468,47.903008,51.960561,198.880196,68.426881,56.739654
min,1.0,19.0,0.0,0.0,58.5,32.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,770.75,31.0,67.0,136.0,69.0,38.0,6.0,1.0,0.0,3.0,4.0,0.0,0.0,44.0,106.0,2.0,4.0,2.0,3.0,0.18,0.0,46.0,48.0,0.0,17.0,3.0,1.0,16.0,5.0,3.0
50%,1540.5,35.0,70.0,156.0,72.0,40.0,11.0,4.0,0.0,6.0,6.0,1.0,1.0,127.5,281.0,3.0,11.0,3.0,3.0,1.01,0.24,53.0,63.0,0.0,73.0,13.0,10.0,66.0,21.0,16.0
75%,2310.25,40.0,73.0,185.0,74.5,41.5,16.0,6.0,0.0,8.0,8.0,4.0,4.0,325.0,731.0,8.0,26.0,4.0,5.0,2.13,0.81,59.0,75.0,0.42,215.25,38.25,38.25,188.0,61.0,46.0
max,3080.0,80.0,84.0,415.0,84.5,74.0,88.0,24.0,10.0,28.0,26.0,36.0,47.0,3655.0,7602.0,90.0,281.0,41.0,53.0,24.11,21.95,100.0,100.0,56.25,3222.0,503.0,523.0,2363.0,893.0,508.0


In [39]:
df_stats.isnull().sum()

id                                    0
name                                  0
nickname                           1252
division                            112
record                               40
status                               30
place_of_birth                      185
trains_at                          2085
fighting_style                     2043
octagon_debut                         0
age                                 260
height                              311
weight                              110
reach                              1150
leg_reach                          1434
wins                                  0
losses                                0
draws                                 0
wins_by_knockout                   1848
first_round_finishes               2190
win_by_dec                           52
win_by_sub                           52
sig_strikes_landed                  296
sig_strikes_attempted               296
takedowns_landed                   2162


Find Duplicates Names and Drop

In [40]:
df_stats[df_stats.duplicated('name', keep=False) == True]

Unnamed: 0,id,name,nickname,division,record,status,place_of_birth,trains_at,fighting_style,octagon_debut,age,height,weight,reach,leg_reach,wins,losses,draws,wins_by_knockout,first_round_finishes,win_by_dec,win_by_sub,sig_strikes_landed,sig_strikes_attempted,takedowns_landed,takedowns_attempted,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,sig_strikes_defense,takedown_defense,knockdown_avg,fight_time_avg,sig_strikes_standing,sig_strikes_clinch,sig_strikes_ground,head_target,body_target,leg_target
388,389,Bruno Silva,"""Bulldog""",Flyweight Division,14-7-2 (W-L-D),Active,"Piracicaba, Brazil",American Top Team,Grappler,"Sep. 7, 2019",35.0,64.0,125.0,65.0,35.0,14,7,2,6.0,4.0,4.0,4.0,353.0,713.0,2.0,46.0,4.0,5.0,1.79,0.16,50.0,60.0,0.98,10:15,287.0,24.0,42.0,196.0,80.0,77.0
389,390,Bruno Silva,"""Blindado""",Middleweight Division,23-13-0 (W-L-D),Not Fighting,Brazil,Evolucao Thai - Curitiba,Striker,"Jun. 19, 2021",35.0,72.0,187.0,74.0,42.0,23,13,0,20.0,14.0,3.0,0.0,376.0,783.0,2.0,22.0,4.0,5.0,0.77,0.0,42.0,74.0,0.31,08:51,274.0,44.0,58.0,284.0,58.0,34.0
440,441,Casey Kenney,,Bantamweight Division,16-4-1 (W-L-D),Active,"Portland, United States",,MMA,"Jul. 18, 2017",32.0,67.0,136.0,70.0,36.0,16,4,1,2.0,,9.0,5.0,670.0,1570.0,2.0,28.0,5.0,5.0,1.09,0.43,58.0,61.0,0.0,13:48,598.0,54.0,18.0,361.0,168.0,141.0
441,442,Casey Kenney,,,,Not Fighting,,,,"Sep. 27, 2025",,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
1374,1375,Joey Gomez,"""KO King""",Bantamweight Division,6-2-0 (W-L-D),Not Fighting,"Fairfax, United States",Team Link Hooksett,,"Jan. 18, 2016",36.0,68.0,135.0,73.0,40.0,6,2,0,,,0.0,0.0,59.0,207.0,,1.0,2.0,4.0,0.62,0.0,55.0,50.0,0.62,12:07,47.0,12.0,0.0,36.0,9.0,14.0
1375,1376,Joey Gomez,,Lightweight Division,7-0-0 (W-L-D),Not Fighting,"Reno, United States",,,"Sep. 27, 2025",33.0,,,,,7,0,0,,,0.0,0.0,56.0,113.0,,7.0,4.0,3.0,2.0,0.0,51.0,,0.0,15:00,47.0,0.0,9.0,54.0,2.0,0.0
2407,2408,Richie Vaculik,"""Vas""",Flyweight Division,10-5-0 (W-L-D),Not Fighting,"Sydney, Australia",,,"Dec. 7, 2013",40.0,66.0,125.0,69.0,41.0,10,5,0,,,1.0,6.0,166.0,355.0,,20.0,4.0,4.0,3.0,1.0,57.0,32.0,0.0,11:15,132.0,23.0,11.0,107.0,31.0,28.0
2408,2409,Richie Vaculik,,,,Not Fighting,Australia,,,"Sep. 27, 2025",,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,


Drop duplicates that have the most NaN values in the row

In [41]:
df_stats = df_stats.assign(nan_count=df_stats.isnull().sum(axis=1)) \
            .sort_values(['name', 'nan_count']) \
            .drop_duplicates('name', keep='first') \
            .drop('nan_count', axis=1)

In [42]:
df_stats[df_stats['name'] == 'Bruno Silva']

Unnamed: 0,id,name,nickname,division,record,status,place_of_birth,trains_at,fighting_style,octagon_debut,age,height,weight,reach,leg_reach,wins,losses,draws,wins_by_knockout,first_round_finishes,win_by_dec,win_by_sub,sig_strikes_landed,sig_strikes_attempted,takedowns_landed,takedowns_attempted,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,sig_strikes_defense,takedown_defense,knockdown_avg,fight_time_avg,sig_strikes_standing,sig_strikes_clinch,sig_strikes_ground,head_target,body_target,leg_target
388,389,Bruno Silva,"""Bulldog""",Flyweight Division,14-7-2 (W-L-D),Active,"Piracicaba, Brazil",American Top Team,Grappler,"Sep. 7, 2019",35.0,64.0,125.0,65.0,35.0,14,7,2,6.0,4.0,4.0,4.0,353.0,713.0,2.0,46.0,4.0,5.0,1.79,0.16,50.0,60.0,0.98,10:15,287.0,24.0,42.0,196.0,80.0,77.0


Find missing values

In [43]:
missing_df = pd.DataFrame({
    'missing_count': df_stats.isnull().sum(),
    'missing_precent': round((df_stats.isnull().sum() / len(df_stats)) * 100, 2)
}).sort_values(by='missing_precent', ascending=False)

missing_df

Unnamed: 0,missing_count,missing_precent
first_round_finishes,2187,71.1
takedowns_landed,2159,70.19
trains_at,2082,67.69
fighting_style,2040,66.32
wins_by_knockout,1845,59.98
leg_reach,1431,46.52
nickname,1249,40.6
reach,1147,37.29
takedown_defense,732,23.8
takedowns_attempted,590,19.18


Most of the null values are from new fighters who do not have any stats yet or very old fighters

In [44]:
df_stats[df_stats[['sig_strikes_standing', 'sig_strikes_clinch', 'sig_strikes_ground', 'head_target', 'body_target', 'leg_target']].isnull().all(axis=1)]

Unnamed: 0,id,name,nickname,division,record,status,place_of_birth,trains_at,fighting_style,octagon_debut,age,height,weight,reach,leg_reach,wins,losses,draws,wins_by_knockout,first_round_finishes,win_by_dec,win_by_sub,sig_strikes_landed,sig_strikes_attempted,takedowns_landed,takedowns_attempted,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,sig_strikes_defense,takedown_defense,knockdown_avg,fight_time_avg,sig_strikes_standing,sig_strikes_clinch,sig_strikes_ground,head_target,body_target,leg_target
55,56,Alberta Cerra,,,,Not Fighting,United States,,,"Sep. 27, 2025",,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
138,139,Alvaro Ivan Lopez Rodrigues,,,,Not Fighting,Mexico,,,"Sep. 27, 2025",,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
149,150,Amir Aliakbari,,Heavyweight Division,0-0-0 (W-L-D),Not Fighting,,,,"Sep. 27, 2025",39.0,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
241,242,Artem Frolov,,Middleweight Division,0-0-0 (W-L-D),Not Fighting,,,,"Sep. 27, 2025",31.0,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
249,250,Asikeerbai Jinensibieke,,,,Active,"Xinjiang, China",,,"Jun. 10, 2022",34.0,,155.0,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
305,306,Bilyal Makhov,,,,Retired,"USSR, Russia",,,"Sep. 27, 2025",35.0,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
439,440,Carrese Archer,"""One Punch""",,,Not Fighting,,,,"Sep. 27, 2025",33.0,72.0,184.0,73.0,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
491,492,Chi Lewis-Parry,,Heavyweight Division,0-0-0 (W-L-D),Not Fighting,"Hitchin, United Kingdom",,,"Sep. 27, 2025",39.0,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
556,557,Cody Belisle,"""Bodacious""",,,Active,,,,"Sep. 27, 2025",30.0,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,
577,578,Coltin Cole,"""The Truth""",,,Active,,,,"Sep. 27, 2025",34.0,,217.0,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,


I will merge the two datasets togehter.
But first I will keep only a few features from the stats. Most of the stats are totals, such as wins by knockout, takedowns landed. 
If I use this data there will be data leakage and I will not get the results I want, because the model will already know total stats even of previous fights.

Thtat is why I will keep only the constant values and the averages such as octagon_debut, height, knowckdown_avg.

In [45]:
df_stats = df_stats[['name', 'octagon_debut', 'age', 'height', 'weight', 'reach',
    'leg_reach', 'sig_strikes_landed_per_minute', 'sig_strikes_absorbed_per_minute', 
    'takedowns_avg', 'submission_avg', 'knockdown_avg', 'fight_time_avg']]
df_stats

Unnamed: 0,name,octagon_debut,age,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
40,AJ Cunningham,"Mar. 2, 2024",30.0,70.0,136.0,71.0,41.0,4.0,6.0,0.00,0.52,0.00,09:34
1863,AJ Dobson,"Feb. 12, 2022",31.0,73.0,185.0,76.0,43.5,4.0,5.0,1.67,0.28,0.28,10:47
1683,AJ Fletcher,"Mar. 12, 2022",26.0,70.0,183.0,67.0,37.0,3.0,5.0,1.54,0.93,0.31,09:43
0,Aalon Cruz,"Jul. 30, 2019",33.0,72.0,155.0,78.0,42.0,8.0,9.0,0.00,0.00,0.85,05:54
1,Aaron Brink,"Nov. 17, 2000",48.0,75.0,231.0,,,3.0,6.0,0.00,0.00,0.00,01:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3075,Zhu Kangjie,"Nov. 23, 2024",29.0,,146.0,,,3.0,1.0,0.33,0.00,0.67,15:00
3076,Zu Anyanwu,"Jul. 11, 2017",41.0,73.0,263.5,77.0,41.0,2.0,2.0,0.65,0.00,0.65,11:32
3077,Zubaira Tukhugov,"Feb. 15, 2014",32.0,68.0,157.5,68.0,38.5,3.0,3.0,2.13,0.00,0.40,12:31
3078,Zviad Lazishvili,"Oct. 23, 2021",31.0,66.0,135.0,69.0,37.0,4.0,6.0,0.00,0.00,0.00,15:00


In [46]:
df_stats.isnull().sum()

name                                  0
octagon_debut                         0
age                                 258
height                              308
weight                              107
reach                              1147
leg_reach                          1431
sig_strikes_landed_per_minute       285
sig_strikes_absorbed_per_minute     285
takedowns_avg                       285
submission_avg                      285
knockdown_avg                       285
fight_time_avg                       50
dtype: int64

Weight and Height have some 0 values, this does not make sense as someone can not have a value of height be equal to 0, which will impact the imputation.
I will convert to null and after impute.

In [47]:
df_stats[df_stats['height'] == 0]
df_stats[df_stats['weight'] == 0]

Unnamed: 0,name,octagon_debut,age,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
31,Adrian Serrano,"Jun. 9, 2000",59.0,0.0,0.0,,,,,,,,00:00
34,Adriano Santos,"Oct. 16, 1998",,0.0,0.0,,,,,,,,00:00
56,Alberto Cerro Leon,"Mar. 11, 1994",,75.0,0.0,,,,,,,,00:00
79,Alex Andrade,"Jun. 9, 2000",49.0,0.0,0.0,,,0.0,3.0,0.00,0.75,0.0,20:00
86,Alex Hunter,"Jul. 27, 1997",,0.0,0.0,,,,,,,,00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,Valeri Ignatov,"Mar. 5, 1999",,70.0,0.0,,,,,,,,00:00
2987,Wallid Ismail,"Feb. 7, 1997",55.0,70.0,0.0,,,1.0,1.0,1.85,0.46,0.0,16:15
3000,Wes Albritton,"May. 30, 1997",,0.0,0.0,,,,,,,,00:00
3046,Yoshiki Takahashi,"Feb. 7, 1997",54.0,0.0,0.0,,,2.0,6.0,0.00,2.26,0.0,02:13


In [48]:
df_stats[['height', 'weight']] = df_stats[['height', 'weight']].replace(0, np.nan)

In [49]:
df_stats.isnull().sum()

name                                  0
octagon_debut                         0
age                                 258
height                              486
weight                              267
reach                              1147
leg_reach                          1431
sig_strikes_landed_per_minute       285
sig_strikes_absorbed_per_minute     285
takedowns_avg                       285
submission_avg                      285
knockdown_avg                       285
fight_time_avg                       50
dtype: int64

In [50]:
df_stats[df_stats['knockdown_avg'].isnull()]

Unnamed: 0,name,octagon_debut,age,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
9,Abdul Azeem Badakhshi,"Sep. 27, 2025",27.0,,,,,,,,,,00:00
23,Adam Hunter,"Sep. 27, 2025",39.0,73.0,185.5,73.0,40.0,,,,,,00:00
24,Adam Khaliev,"Sep. 27, 2025",39.0,70.0,170.0,,,,,,,,00:00
29,Adli Edwards,"Sep. 27, 2025",33.0,,,,,,,,,,00:00
31,Adrian Serrano,"Jun. 9, 2000",59.0,,,,,,,,,,00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,William Marcario,"Sep. 27, 2025",31.0,,,,,,,,,,
3014,Willian Souza,"Sep. 27, 2025",36.0,,146.0,,,,,,,,00:00
3027,Yamato Nishikawa,"Oct. 22, 2022",20.0,,,,,,,,,,00:00
769,Yuri Vaulin,"Jul. 27, 1997",,,,,,,,,,,00:00


Fill these stats with 0 because the fighters have just signed with the UFC or are too old fighters.

In [51]:
df_stats[['sig_strikes_landed_per_minute', 'sig_strikes_absorbed_per_minute','takedowns_avg',
        'submission_avg','knockdown_avg']] = df_stats[['sig_strikes_landed_per_minute', 
                                        'sig_strikes_absorbed_per_minute','takedowns_avg','submission_avg',
                                        'knockdown_avg']].fillna(0)

In [52]:
df_stats[df_stats['fight_time_avg'].isnull()].tail(5)

Unnamed: 0,name,octagon_debut,age,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
2801,Testy Test,"Sep. 27, 2025",,,,,34.0,0.0,0.0,0.0,0.0,0.0,
2674,Timo Feucht,"Sep. 27, 2025",27.0,,,,,0.0,0.0,0.0,0.0,0.0,
2840,Timothy Thomas,"Sep. 27, 2025",34.0,,214.0,,,0.0,0.0,0.0,0.0,0.0,
2963,Vineesh Subrahmanyan,"Sep. 27, 2025",,,,,,0.0,0.0,0.0,0.0,0.0,
3013,William Marcario,"Sep. 27, 2025",31.0,,,,,0.0,0.0,0.0,0.0,0.0,


In [53]:
df_stats[['fight_time_avg']] = df_stats[['fight_time_avg']].fillna('00:00')

Converting fight_time_avg to seconds

In [54]:
df_stats['fight_time_avg'] = pd.to_timedelta('00:' + df_stats['fight_time_avg']).dt.total_seconds().astype(int)

In [55]:
df_stats

Unnamed: 0,name,octagon_debut,age,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
40,AJ Cunningham,"Mar. 2, 2024",30.0,70.0,136.0,71.0,41.0,4.0,6.0,0.00,0.52,0.00,574
1863,AJ Dobson,"Feb. 12, 2022",31.0,73.0,185.0,76.0,43.5,4.0,5.0,1.67,0.28,0.28,647
1683,AJ Fletcher,"Mar. 12, 2022",26.0,70.0,183.0,67.0,37.0,3.0,5.0,1.54,0.93,0.31,583
0,Aalon Cruz,"Jul. 30, 2019",33.0,72.0,155.0,78.0,42.0,8.0,9.0,0.00,0.00,0.85,354
1,Aaron Brink,"Nov. 17, 2000",48.0,75.0,231.0,,,3.0,6.0,0.00,0.00,0.00,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3075,Zhu Kangjie,"Nov. 23, 2024",29.0,,146.0,,,3.0,1.0,0.33,0.00,0.67,900
3076,Zu Anyanwu,"Jul. 11, 2017",41.0,73.0,263.5,77.0,41.0,2.0,2.0,0.65,0.00,0.65,692
3077,Zubaira Tukhugov,"Feb. 15, 2014",32.0,68.0,157.5,68.0,38.5,3.0,3.0,2.13,0.00,0.40,751
3078,Zviad Lazishvili,"Oct. 23, 2021",31.0,66.0,135.0,69.0,37.0,4.0,6.0,0.00,0.00,0.00,900


Drop age column because the UFC website does not update it consistently and there are many wrong values especially for older fighters

In [56]:
df_stats.drop(columns='age', axis=1, inplace=True)

Impute missing values

In [57]:
print(df_stats.columns[df_stats.isnull().any()])
print("\n", len(df_stats.columns[df_stats.isnull().any()]))

Index(['height', 'weight', 'reach', 'leg_reach'], dtype='object')

 4


In [58]:
missing_cols = df_stats.columns[df_stats.isnull().any()]

imputer = KNNImputer(n_neighbors=5)
df_stats[missing_cols] = imputer.fit_transform(df_stats[missing_cols])

# Save as pickle
with open("../models/knn_imputer_stats.pkl", "wb") as f:
    pickle.dump(imputer, f)

In [59]:
df_stats.isnull().sum()

name                               0
octagon_debut                      0
height                             0
weight                             0
reach                              0
leg_reach                          0
sig_strikes_landed_per_minute      0
sig_strikes_absorbed_per_minute    0
takedowns_avg                      0
submission_avg                     0
knockdown_avg                      0
fight_time_avg                     0
dtype: int64

Some names have special characters. When I try and merge the events and stats dataframes these names will not be matched.
I need to normalize those names.

In [60]:
df_stats["name"] = df_stats["name"].map(normalize_name)

In [61]:
df_stats

Unnamed: 0,name,octagon_debut,height,weight,reach,leg_reach,sig_strikes_landed_per_minute,sig_strikes_absorbed_per_minute,takedowns_avg,submission_avg,knockdown_avg,fight_time_avg
40,aj cunningham,"Mar. 2, 2024",70.0,136.0,71.0,41.0,4.0,6.0,0.00,0.52,0.00,574
1863,aj dobson,"Feb. 12, 2022",73.0,185.0,76.0,43.5,4.0,5.0,1.67,0.28,0.28,647
1683,aj fletcher,"Mar. 12, 2022",70.0,183.0,67.0,37.0,3.0,5.0,1.54,0.93,0.31,583
0,aalon cruz,"Jul. 30, 2019",72.0,155.0,78.0,42.0,8.0,9.0,0.00,0.00,0.85,354
1,aaron brink,"Nov. 17, 2000",75.0,231.0,77.4,44.3,3.0,6.0,0.00,0.00,0.00,111
...,...,...,...,...,...,...,...,...,...,...,...,...
3075,zhu kangjie,"Nov. 23, 2024",68.0,146.0,70.8,38.8,3.0,1.0,0.33,0.00,0.67,900
3076,zu anyanwu,"Jul. 11, 2017",73.0,263.5,77.0,41.0,2.0,2.0,0.65,0.00,0.65,692
3077,zubaira tukhugov,"Feb. 15, 2014",68.0,157.5,68.0,38.5,3.0,3.0,2.13,0.00,0.40,751
3078,zviad lazishvili,"Oct. 23, 2021",66.0,135.0,69.0,37.0,4.0,6.0,0.00,0.00,0.00,900


In [62]:
df_stats.to_csv('../data/notebooks/stats_cleaned.csv', index = False)