In [1]:
import sqlite3
import os 
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
os.listdir()

['.ipynb_checkpoints',
 'event_details.csv',
 'fighters.csv',
 'fighters_df.ipynb',
 'final_df.csv',
 'Final_DF.ipynb',
 'matches.ipynb',
 'match_df.csv',
 'scraper',
 'Soupscraper.ipynb',
 'Untitled.ipynb']

In [3]:
os.listdir('./scraper')

['.idea', 'FIGHTscraper', 'ufcfights.db', 'UFCscraper', 'venv']

In [4]:
conn = sqlite3.connect('./scraper/ufcfights.db')
curr = conn.cursor()

In [5]:
curr.execute("""SELECT name FROM sqlite_master WHERE type='table'""")
fight_tables = curr.fetchall()
print(fight_tables)

[('fighters_tb',), ('fight_tb',), ('fight_stats_tb',)]


In [6]:
fight_df = pd.read_sql_query("""SELECT * FROM fight_tb""",conn)
fight_stats = pd.read_sql_query("""SELECT * FROM fight_stats_tb""",conn)
#close the connection after use
conn.close()

## CLEANING FIGHT_DF
> NO NULL VALUES  
> CHANGE THE TIME FORMAT FROM MM:SS to SECONDS
> CHANGE W/L to 1/0 

In [7]:
fight_df.head()

Unnamed: 0,fight_name,fighter1_name,fighter2_name,fighter1_outcome,fighter2_outcome,weight_bout,method,num_rounds,details,time,referee
0,UFC Fight Night: Covington vs. Woodley,Tyson Nam,Jerome Rivera,W,L,Bantamweight Bout,KO/TKO,2,Punches to Head On Ground,0:34,Chris Tognoni
1,UFC 237: Namajunas vs. Andrade,Talita Bernardo,Viviane Araujo,L,W,Women's Bantamweight Bout,KO/TKO,3,Punch to Head At Distance,0:48,Fernando Portella
2,UFC 238: Cejudo vs. Moraes,Eddie Wineland,Grigory Popov,W,L,Bantamweight Bout,KO/TKO,2,Punch to Head At Distance,4:47,Kevin MacDonald
3,UFC 238: Cejudo vs. Moraes,Katlyn Chookagian,Joanne Calderwood,W,L,Women's Flyweight Bout,Decision - Unanimous,3,,5:00,Dan Miragliotta
4,UFC 238: Cejudo vs. Moraes,Bevon Lewis,Darren Stewart,L,W,Middleweight Bout,Decision - Unanimous,3,,5:00,Marc Goddard


In [8]:
fight_df.isnull().sum()

fight_name          0
fighter1_name       0
fighter2_name       0
fighter1_outcome    0
fighter2_outcome    0
weight_bout         0
method              0
num_rounds          0
details             0
time                0
referee             0
dtype: int64

In [9]:
"""Split the time column by : and then multiply the first index with by 60 
which is the minutes and add it to the second index which is the seconds to get total match time in seconds"""

fight_df['time'] = fight_df['time'].str.split(':').apply(lambda x: (int(x[0]) * 60) + int(x[1]))

In [10]:
"""Apply a map, if W then 1 if L then 0, and then convert the columns into int type"""

fight_df[['fighter1_outcome','fighter2_outcome']] =\
         fight_df[['fighter1_outcome','fighter2_outcome']].applymap(lambda x: 1 if x == 'W' else 0).astype('int')

In [11]:
fight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5749 entries, 0 to 5748
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   fight_name        5749 non-null   object
 1   fighter1_name     5749 non-null   object
 2   fighter2_name     5749 non-null   object
 3   fighter1_outcome  5749 non-null   int32 
 4   fighter2_outcome  5749 non-null   int32 
 5   weight_bout       5749 non-null   object
 6   method            5749 non-null   object
 7   num_rounds        5749 non-null   object
 8   details           5749 non-null   object
 9   time              5749 non-null   int64 
 10  referee           5749 non-null   object
dtypes: int32(2), int64(1), object(8)
memory usage: 449.3+ KB


## CLEANING FIGHT_STATS
> NO NULL VALUES  
> CONVERT ALL STRING/OBJECTS TO INT/FLOAT

In [12]:
fight_stats.head()

Unnamed: 0,fight_name,fighter1_name,fighter2_name,kd_1,kd_2,sig_str_1,sig_str_2,sig_str_pct_1,sig_str_pct_2,total_str_1,total_str_2,td_1,td_2,td_pct_1,td_pct_2,sub_att_1,sub_att_2,pass_1,pass_2,head_1,head_2,body_1,body_2,leg_1,leg_2,distance_1,distance_2,clinch_1,clinch_2,ground_1,ground_2
0,UFC Fight Night: Covington vs. Woodley,Tyson Nam,Jerome Rivera,1,0,36 of 72,30 of 80,50%,37%,36 of 72,31 of 81,0 of 0,0 of 0,0%,0%,0,0,0,0,27 of 58,9 of 45,7 of 12,2 of 11,2 of 2,19 of 24,18 of 47,29 of 79,3 of 3,1 of 1,15 of 22,0 of 0
1,UFC 237: Namajunas vs. Andrade,Talita Bernardo,Viviane Araujo,0,1,46 of 136,61 of 115,33%,53%,51 of 141,66 of 120,0 of 3,3 of 3,0%,100%,0,0,0,3,28 of 112,34 of 79,8 of 12,9 of 14,10 of 12,18 of 22,45 of 134,57 of 111,1 of 2,0 of 0,0 of 0,4 of 4
2,UFC 238: Cejudo vs. Moraes,Eddie Wineland,Grigory Popov,2,0,74 of 171,55 of 150,43%,36%,74 of 171,55 of 150,0 of 2,0 of 1,0%,0%,0,0,0,0,48 of 129,24 of 97,23 of 39,20 of 39,3 of 3,11 of 14,70 of 165,52 of 146,0 of 1,3 of 4,4 of 5,0 of 0
3,UFC 238: Cejudo vs. Moraes,Katlyn Chookagian,Joanne Calderwood,0,0,82 of 221,112 of 266,37%,42%,108 of 250,141 of 300,0 of 2,3 of 4,0%,75%,0,0,0,0,50 of 179,24 of 139,20 of 27,19 of 40,12 of 15,69 of 87,78 of 215,104 of 252,3 of 5,7 of 12,1 of 1,1 of 2
4,UFC 238: Cejudo vs. Moraes,Bevon Lewis,Darren Stewart,0,0,31 of 84,30 of 73,36%,41%,45 of 98,40 of 84,0 of 5,0 of 0,0%,0%,0,0,0,0,13 of 49,11 of 52,12 of 23,5 of 6,6 of 12,14 of 15,15 of 62,18 of 55,16 of 22,12 of 18,0 of 0,0 of 0


In [13]:
fight_stats.isnull().sum()

fight_name       0
fighter1_name    0
fighter2_name    0
kd_1             0
kd_2             0
sig_str_1        0
sig_str_2        0
sig_str_pct_1    0
sig_str_pct_2    0
total_str_1      0
total_str_2      0
td_1             0
td_2             0
td_pct_1         0
td_pct_2         0
sub_att_1        0
sub_att_2        0
pass_1           0
pass_2           0
head_1           0
head_2           0
body_1           0
body_2           0
leg_1            0
leg_2            0
distance_1       0
distance_2       0
clinch_1         0
clinch_2         0
ground_1         0
ground_2         0
dtype: int64

In [14]:
fight_stats.columns

Index(['fight_name', 'fighter1_name', 'fighter2_name', 'kd_1', 'kd_2',
       'sig_str_1', 'sig_str_2', 'sig_str_pct_1', 'sig_str_pct_2',
       'total_str_1', 'total_str_2', 'td_1', 'td_2', 'td_pct_1', 'td_pct_2',
       'sub_att_1', 'sub_att_2', 'pass_1', 'pass_2', 'head_1', 'head_2',
       'body_1', 'body_2', 'leg_1', 'leg_2', 'distance_1', 'distance_2',
       'clinch_1', 'clinch_2', 'ground_1', 'ground_2'],
      dtype='object')

In [15]:
""" Create function to extract digits from the columns and rename the columns _landed, _attempted
    Convert the new columns into int type.
""";

In [16]:
def landed_attempted(column):
    return fight_stats[column].str.extract("(?P<{}>\d+) [\w]+ (?P<{}>\d+)".format(column +'_landed',column +'_attempted'))

In [17]:
#stats to convert
matchstats = ['sig_str_1', 'sig_str_2','total_str_1', 'total_str_2',\
 'td_1', 'td_2','head_1', 'head_2','body_1', 'body_2',\
 'leg_1', 'leg_2', 'distance_1', 'distance_2',\
 'clinch_1', 'clinch_2', 'ground_1', 'ground_2']

#change the format from "x of y" to 2 columns x, y and then join it with the fight_stats df
for stats in matchstats:
    fight_stats = fight_stats.join(landed_attempted(stats),how ='right')
    
#drop the matchstats since we now have the cleaned value    
fight_stats.drop(matchstats,axis=1,inplace = True)


In [18]:
#change from percent to int
pct =['sig_str_pct_1', 'sig_str_pct_2', 'td_pct_1', 'td_pct_2']
for i in pct:
    fight_stats[i] = fight_stats[i].str.extract("(\d+)").astype('int')/100

In [19]:
#change the rest of the objects that are strings to int
fight_stats.iloc[:,9:]= fight_stats.iloc[:,9:].astype('int').copy()

In [20]:
#change the rest of objects that are strings to int
fight_stats.iloc[:,3:5] = fight_stats.iloc[:,3:5].astype('int').copy()

In [21]:
fight_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5749 entries, 0 to 5748
Data columns (total 49 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   fight_name             5749 non-null   object 
 1   fighter1_name          5749 non-null   object 
 2   fighter2_name          5749 non-null   object 
 3   kd_1                   5749 non-null   int32  
 4   kd_2                   5749 non-null   int32  
 5   sig_str_pct_1          5749 non-null   float64
 6   sig_str_pct_2          5749 non-null   float64
 7   td_pct_1               5749 non-null   float64
 8   td_pct_2               5749 non-null   float64
 9   sub_att_1              5749 non-null   int32  
 10  sub_att_2              5749 non-null   int32  
 11  pass_1                 5749 non-null   int32  
 12  pass_2                 5749 non-null   int32  
 13  sig_str_1_landed       5749 non-null   int32  
 14  sig_str_1_attempted    5749 non-null   int32  
 15  sig_

## JOIN 3 DATAFRAMES TOGETHER
> IMPORT ANOTHER FILE, WITH THE EVENT DATE AND LOCATION  
> MERGE THE DATAFRAMES TOGETHER

In [22]:
event_details = pd.read_csv("event_details.csv")

In [23]:
event_details.head()

Unnamed: 0,Event_Name,Date,City,State,Country
0,UFC Fight Night: Covington vs. Woodley,2020-09-19,Las Vegas,Nevada,USA
1,UFC Fight Night: Waterson vs. Hill,2020-09-12,Las Vegas,Nevada,USA
2,UFC Fight Night: Overeem vs. Sakai,2020-09-05,Las Vegas,Nevada,USA
3,UFC Fight Night: Smith vs. Rakic,2020-08-29,Las Vegas,Nevada,USA
4,UFC Fight Night: Munhoz vs. Edgar,2020-08-22,Las Vegas,Nevada,USA


In [24]:
#merge event_details with fight_stats on the fight_name/event_name column
fight_stats = fight_stats.merge(event_details,left_on = 'fight_name',right_on='Event_Name').copy()

In [25]:
print(fight_stats.shape)

(5749, 54)


In [26]:
#drop Event_name
fight_stats.drop('Event_Name',axis=1,inplace = True)

In [27]:
#reorder columns so date comes first
fight_stats = fight_stats[['Date', 'fight_name', 'fighter1_name', 'fighter2_name', 'kd_1', 'kd_2',
       'sig_str_pct_1', 'sig_str_pct_2', 'td_pct_1', 'td_pct_2', 'sub_att_1',
       'sub_att_2', 'pass_1', 'pass_2', 'sig_str_1_landed',
       'sig_str_1_attempted', 'sig_str_2_landed', 'sig_str_2_attempted',
       'total_str_1_landed', 'total_str_1_attempted', 'total_str_2_landed',
       'total_str_2_attempted', 'td_1_landed', 'td_1_attempted', 'td_2_landed',
       'td_2_attempted', 'head_1_landed', 'head_1_attempted', 'head_2_landed',
       'head_2_attempted', 'body_1_landed', 'body_1_attempted',
       'body_2_landed', 'body_2_attempted', 'leg_1_landed', 'leg_1_attempted',
       'leg_2_landed', 'leg_2_attempted', 'distance_1_landed',
       'distance_1_attempted', 'distance_2_landed', 'distance_2_attempted',
       'clinch_1_landed', 'clinch_1_attempted', 'clinch_2_landed',
       'clinch_2_attempted', 'ground_1_landed', 'ground_1_attempted',
       'ground_2_landed', 'ground_2_attempted', 'City', 'State', 'Country']];

In [28]:
#check to see if the number of records are equal
fight_stats.shape[0] == fight_df.shape[0]

True

In [29]:
#merge fight_stats and fight_df
match_df = fight_stats.merge(fight_df)

In [30]:
match_df.head()
print('All 3 dataframes are merged with new shape {}'.format(match_df.shape))

All 3 dataframes are merged with new shape (5751, 61)


In [31]:
match_df[match_df.duplicated()]

Unnamed: 0,Date,fight_name,fighter1_name,fighter2_name,kd_1,kd_2,sig_str_pct_1,sig_str_pct_2,td_pct_1,td_pct_2,sub_att_1,sub_att_2,pass_1,pass_2,sig_str_1_landed,sig_str_1_attempted,sig_str_2_landed,sig_str_2_attempted,total_str_1_landed,total_str_1_attempted,total_str_2_landed,total_str_2_attempted,td_1_landed,td_1_attempted,td_2_landed,td_2_attempted,head_1_landed,head_1_attempted,head_2_landed,head_2_attempted,body_1_landed,body_1_attempted,body_2_landed,body_2_attempted,leg_1_landed,leg_1_attempted,leg_2_landed,leg_2_attempted,distance_1_landed,distance_1_attempted,distance_2_landed,distance_2_attempted,clinch_1_landed,clinch_1_attempted,clinch_2_landed,clinch_2_attempted,ground_1_landed,ground_1_attempted,ground_2_landed,ground_2_attempted,City,State,Country,fighter1_outcome,fighter2_outcome,weight_bout,method,num_rounds,details,time,referee


In [35]:
#match_df.to_csv('match_df.csv',index= False)

In [36]:
match_df.shape

(5751, 61)