In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
all_cards = pd.read_csv('cleaned_datasets/allsets_cleaned.csv', index_col=0)

In [3]:
all_cards.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aerial Modification,W,5.0,{4}{W},,,,Enchant creature or Vehicle\nAs long as enchan...,uncommon,Enchant,,Enchantment,Aura
1,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
2,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
3,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
4,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"


In [4]:
all_cards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7125 entries, 0 to 7124
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        7125 non-null   object 
 1   colors      7125 non-null   object 
 2   manaValue   7125 non-null   float64
 3   manaCost    6617 non-null   object 
 4   power       3867 non-null   float64
 5   toughness   3877 non-null   float64
 6   loyalty     168 non-null    float64
 7   text        7025 non-null   object 
 8   rarity      7125 non-null   object 
 9   keywords    3399 non-null   object 
 10  supertypes  763 non-null    object 
 11  types       7125 non-null   object 
 12  subtypes    4636 non-null   object 
dtypes: float64(4), object(9)
memory usage: 779.3+ KB


NaN values in the `keywords` column are to be expected, as many cards do not have keywords.  We will replace these with empty strings.

In [5]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which warns on recent
# pandas and does not update `all_cards` when Copy-on-Write is enabled.
all_cards['keywords'] = all_cards['keywords'].fillna('')

I want to get a dataframe that includes only creatures, but first we have to deal with one edge case: non-creature cards that contain the "Transform" keyword.  I do this by writing a function that can extract the mana value for cards that are formatted in the following way: "{B}{B}{2}: Transform".  We find the Transform keyword in the text of the card, grab the string that immediately precedes it, calculate the additional cost to transform the card, and add it to the correct cell (which will, in all cases, be the row immediately following the card with the Transform keyword).  This also fixes the mana value for the back sides of modal cards with the Transform keyword where both sides are creatures.

In [6]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which warns on recent
# pandas and does not update `all_cards` when Copy-on-Write is enabled.
all_cards['text'] = all_cards['text'].fillna('')

In [7]:
all_cards.loc[all_cards.keywords.str.contains('Transform', case=False)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
1546,Extricator of Sin,W,3.0,{2}{W},0.0,3.0,,"When Extricator of Sin enters the battlefield,...",uncommon,"Delirium, Transform",,Creature,"Human, Cleric"
1557,Lone Rider,W,2.0,{1}{W},1.0,1.0,,"First strike, lifelink\nAt the beginning of th...",uncommon,"First strike, Lifelink, Transform",,Creature,"Human, Knight"
1579,Curious Homunculus,U,2.0,{1}{U},1.0,1.0,,{T}: Add {C}. Spend this mana only to cast an ...,uncommon,Transform,,Creature,Homunculus
1582,Docent of Perfection,U,5.0,{3}{U}{U},5.0,4.0,,Flying\nWhenever you cast an instant or sorcer...,rare,"Flying, Transform",,Creature,"Insect, Horror"
1590,Grizzled Angler,U,3.0,{2}{U},2.0,3.0,,{T}: Mill two cards. Then if there is a colorl...,uncommon,"Mill, Transform",,Creature,Human
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6809,Primal Wellspring,L,4.0,,,,,(Transforms from Primal Amulet.)\n{T}: Add one...,rare,Transform,,Land,
6814,Thaumatic Compass,C,2.0,{2},,,,"{3}, {T}: Search your library for a basic land...",rare,Transform,,Artifact,
6815,Spires of Orazca,L,2.0,,,,,(Transforms from Thaumatic Compass.)\n{T}: Add...,rare,Transform,,Land,
6816,Treasure Map,C,2.0,{2},,,,"{1}, {T}: Scry 1. Put a landmark counter on Tr...",rare,"Scry, Transform",,Artifact,


In [8]:
def backwards_extract(df, keyword, value_col='manaValue'):
    """Add the mana cost of a "<cost>: <keyword>" ability to the following row.

    For every row whose ``text`` contains ``keyword``, the run of brace-delimited
    symbols that ends in the colon directly before the keyword (for example
    ``{5}{R}{R}: Transform``) is converted to a mana value and added, in place,
    to ``value_col`` of the row immediately after it. The function assumes the
    next row is the card that the matching card transforms into; it does not
    verify that.

    A row is skipped when no such cost precedes the keyword, or when the
    matching row is the last row of ``df`` (there is no following row).

    Symbol values: a numeric symbol counts its full value (``{10}`` is 10), the
    tap symbol ``{T}`` counts 0, and every other symbol counts 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string ``text`` column (missing text never matches) and a
        numeric ``value_col`` column. Modified in place.
    keyword : str
        Literal text to search for; it is not interpreted as a regex.
    value_col : str, default 'manaValue'
        Name of the column that receives the added cost.

    Returns
    -------
    None
        The first four columns of each updated row are printed for review.
    """
    import re

    def symbol_value(symbol):
        # numeric symbols are generic mana; {T} is a tap, not a mana payment
        if symbol.isdecimal():
            return int(symbol)
        return 0 if symbol == 'T' else 1

    # one or more {..} symbols (commas/whitespace allowed between them),
    # then the colon that separates an activation cost from its effect
    cost_before_keyword = re.compile(r'((?:\{[^{}]+\}[\s,]*)+):\s*$')
    value_pos = df.columns.get_loc(value_col)

    # work with row positions, not index labels, so any index is supported;
    # regex=False keeps the match consistent with str.find below
    mask = df['text'].str.contains(keyword, regex=False, na=False)
    positions = [pos for pos, hit in enumerate(mask) if hit]

    for pos in positions:
        target = pos + 1
        if target >= len(df):
            # the match is the last row, so there is no following card to update
            continue

        text = df['text'].iat[pos]
        match = cost_before_keyword.search(text[:text.find(keyword)])
        if match is None:
            # nothing that looks like a mana cost sits right before the keyword
            continue

        symbols = re.findall(r'\{([^{}]+)\}', match.group(1))
        new_value = sum(symbol_value(symbol) for symbol in symbols)

        # update the value of the NEXT card in the dataframe, the one that
        # our identified cards transform into
        df.iat[target, value_pos] += new_value

        # print a list of changes made so we can confirm we got it right
        print(df.iloc[target, :4])

In [9]:
backwards_extract(all_cards, 'Transform')

name         Conduit of Emrakul
colors                        C
manaValue                   8.0
manaCost                    NaN
Name: 1655, dtype: object
name         Erupting Dreadwolf
colors                        C
manaValue                  10.0
manaCost                    NaN
Name: 1675, dtype: object
name         Dronepack Kindred
colors                       C
manaValue                 12.0
manaCost                   NaN
Name: 1682, dtype: object
name         Sinuous Predator
colors                      C
manaValue                 6.0
manaCost                  NaN
Name: 1698, dtype: object
name         Howling Chorus
colors                    C
manaValue               9.0
manaCost                NaN
Name: 1703, dtype: object
name         Fibrous Entangler
colors                       C
manaValue                 11.0
manaCost                   NaN
Name: 1710, dtype: object
name         Ulvenwald Abomination
colors                           C
manaValue                      9.0
man

We have a very similar problem with cards that have the "Disturb" mechanic, with the added difficulty that the mechanic is templated differently than the Transform mechanic: the keyword "Disturb" precedes the mana cost, whereas the keyword "Transform" follows the mana cost.  So, below, I have a slightly tweaked function that grabs the mana value following the keyword, and we'll run that to update mana values for our Disturb creatures.

In [17]:
def extract_mana_value(df, keyword, value_col='manaValue'):
    """Set the following row's mana value from a "<keyword> <cost>" clause.

    For every row whose ``text`` contains ``keyword`` followed by a run of
    brace-delimited symbols (for example ``Disturb {4}{W}{W}``), that cost is
    converted to a mana value and written, in place, to ``value_col`` of the
    row immediately after it, replacing the previous value. The function
    assumes the next row is the face that the cost belongs to; it does not
    verify that, so the caller must make sure the rows are still adjacent
    (NOTE(review): filtering rows beforehand can break that adjacency).

    A row is skipped when the keyword is never followed by a cost, or when the
    matching row is the last row of ``df`` (there is no following row).

    Symbol values: a numeric symbol counts its full value (``{10}`` is 10), the
    tap symbol ``{T}`` counts 0, and every other symbol counts 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string ``text`` column (missing text never matches) and a
        numeric ``value_col`` column. Modified in place.
    keyword : str
        Literal text to search for; it is not interpreted as a regex and may
        be of any length.
    value_col : str, default 'manaValue'
        Name of the column that is overwritten.

    Returns
    -------
    None
        The first four columns of each updated row are printed for review.
    """
    import re

    def symbol_value(symbol):
        # numeric symbols are generic mana; {T} is a tap, not a mana payment
        if symbol.isdecimal():
            return int(symbol)
        return 0 if symbol == 'T' else 1

    # the keyword, optional whitespace, then one or more adjacent {..} symbols;
    # built from the keyword itself so no keyword length is hard-coded
    cost_after_keyword = re.compile(re.escape(keyword) + r'\s*((?:\{[^{}]+\})+)')
    value_pos = df.columns.get_loc(value_col)

    # work with row positions, not index labels, so any index is supported
    mask = df['text'].str.contains(keyword, regex=False, na=False)
    positions = [pos for pos, hit in enumerate(mask) if hit]

    for pos in positions:
        target = pos + 1
        if target >= len(df):
            # the match is the last row, so there is no following card to update
            continue

        match = cost_after_keyword.search(df['text'].iat[pos])
        if match is None:
            # the keyword is mentioned, but not as "<keyword> <mana cost>"
            continue

        symbols = re.findall(r'\{([^{}]+)\}', match.group(1))
        new_value = sum(symbol_value(symbol) for symbol in symbols)

        # overwrite the value of the NEXT row in the dataframe
        df.iat[target, value_pos] = new_value

        # print a list of changes made so we can confirm we got it right
        print(df.iloc[target, :4])

In [22]:
creatures.loc[creatures.text.str.contains('Disturb')].head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
1564,Beloved Beggar,W,2.0,{1}{W},0.0,4.0,Disturb {4}{W}{W} (You may cast this card from...,uncommon,"Disturb, Transform",,Creature,"Human, Peasant"
1573,Chaplain of Alms,W,1.0,{W},1.0,1.0,First strike\nWard {1} (Whenever this creature...,uncommon,"Disturb, First strike, Transform, Ward",,Creature,"Human, Cleric"
1584,Lunarch Veteran,W,1.0,{W},1.0,1.0,Whenever another creature enters the battlefie...,common,"Disturb, Transform",,Creature,"Human, Cleric"
1586,Mourning Patrol,W,3.0,{2}{W},2.0,3.0,Vigilance\nDisturb {3}{W} (You may cast this c...,common,"Disturb, Transform, Vigilance",,Creature,"Human, Soldier"
1595,Baithook Angler,U,2.0,{1}{U},2.0,1.0,Disturb {1}{U} (You may cast this card from yo...,common,"Disturb, Transform",,Creature,"Human, Peasant"


In [18]:
# NOTE(review): `creatures` is only created further down in this file (the
# `creatures = all_cards.loc[...]` cell), so on a fresh kernel that cell has to
# run before this one; the execution counts show this cell was run after it.
extract_mana_value(creatures, 'Disturb')

name         Generous Soul
colors                   W
manaValue              6.0
manaCost               NaN
Name: 1565, dtype: object
name         Chapel Shieldgeist
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 1574, dtype: object
name         Luminous Phantom
colors                      W
manaValue                 2.0
manaCost                  NaN
Name: 1585, dtype: object
name         Morning Apparition
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 1587, dtype: object
name         Hook-Haunt Drifter
colors                        U
manaValue                   2.0
manaCost                    NaN
Name: 1596, dtype: object
name         Ghostly Castigator
colors                        U
manaValue                   5.0
manaCost                    NaN
Name: 1599, dtype: object
name         Waildrifter
colors                 U
manaValue            5.0
manaCost             NaN
Name

In [10]:
# Keep only the rows whose type line mentions "Creature" and renumber them
# from zero so the new frame has a clean RangeIndex.
is_creature = all_cards['types'].str.contains('Creature')
creatures = all_cards[is_creature].reset_index(drop=True)

In [11]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845 entries, 0 to 3844
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3845 non-null   object 
 1   colors      3845 non-null   object 
 2   manaValue   3845 non-null   float64
 3   manaCost    3698 non-null   object 
 4   power       3814 non-null   float64
 5   toughness   3824 non-null   float64
 6   loyalty     0 non-null      float64
 7   text        3845 non-null   object 
 8   rarity      3845 non-null   object 
 9   keywords    3845 non-null   object 
 10  supertypes  457 non-null    object 
 11  types       3845 non-null   object 
 12  subtypes    3845 non-null   object 
dtypes: float64(4), object(9)
memory usage: 390.6+ KB


In [12]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


The `loyalty` column holds a value that is unique to planeswalkers (it is empty for every creature), so we're going to drop it right away.

In [13]:
# `loyalty` has no non-null values for creatures, so remove the column.
creatures = creatures.drop(columns='loyalty')

In [14]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


In [15]:
len(creatures[creatures.manaCost.isna()])

147

There are 147 rows with NaN values for manaCost.  We have zero rows that have NaN values in manaValue, though, so these are almost definitely the backs of modal cards that transform through one game mechanic or another.  The amount of resources spent to cast the card is accurately reflected in the manaValue column, though the conditions for flipping the card are not going to be considered at this stage of our investigation.  Still, we'll do the same as we did above and check that this is true of about 10% of the data before dropping the column to make sure we didn't inadvertently introduce any errors.

In [16]:
creatures[creatures.manaCost.isna()].sample(15)

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
2272,Reflection of Kiki-Jiki,R,3.0,,2.0,2.0,"{1}, {T}: Create a token that's a copy of anot...",rare,,,"Enchantment, Creature","Goblin, Shaman"
1681,Ashmouth Dragon,R,2.0,,4.0,4.0,Flying\nWhenever you cast an instant or sorcer...,rare,Flying,,Creature,Dragon
2971,One of the Pack,G,4.0,,5.0,6.0,"At the beginning of each upkeep, if a player c...",common,Transform,,Creature,Werewolf
2210,Portrait of Michiko,W,2.0,,0.0,0.0,Portrait of Michiko gets +1/+1 for each artifa...,uncommon,,,"Enchantment, Creature","Human, Noble"
3379,Howlpack Avenger,R,4.0,,4.0,4.0,Whenever a permanent you control is dealt dama...,rare,Nightbound,,Creature,Werewolf
1567,Dauntless Avenger,W,3.0,,3.0,2.0,"Whenever Dauntless Avenger attacks, return tar...",uncommon,,,Creature,"Human, Soldier"
2937,Skin Shedder,R,1.0,,3.0,4.0,,uncommon,,,Creature,"Insect, Horror"
2968,Werewolf of Ancient Hunger,G,5.0,,,,"Vigilance, trample\nWerewolf of Ancient Hunger...",rare,"Trample, Transform, Vigilance",,Creature,Werewolf
862,Abolisher of Bloodlines,C,5.0,,6.0,5.0,Flying\nWhen this creature transforms into Abo...,rare,"Flying, Transform",,Creature,"Eldrazi, Vampire"
3391,Dire-Strain Anarchist,R,5.0,,5.0,5.0,"Menace, haste\nWhenever Dire-Strain Anarchist ...",mythic,"Haste, Menace, Nightbound",,Creature,Werewolf


In checking to make sure I didn't introduce any errors, I realized that the listed Mana Values for cards with the "Disturb" keyword were not reflective of the cost to cast their disturb mode, so I went ahead and built out a function that will let us extract the Disturb cost from the card text and replace that in the mana value field.

In [None]:
creatures.iat[25, 0]

In [20]:
creatures.loc[creatures.keywords.str.contains('Disturb')].head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
1564,Beloved Beggar,W,2.0,{1}{W},0.0,4.0,Disturb {4}{W}{W} (You may cast this card from...,uncommon,"Disturb, Transform",,Creature,"Human, Peasant"
1573,Chaplain of Alms,W,1.0,{W},1.0,1.0,First strike\nWard {1} (Whenever this creature...,uncommon,"Disturb, First strike, Transform, Ward",,Creature,"Human, Cleric"
1584,Lunarch Veteran,W,1.0,{W},1.0,1.0,Whenever another creature enters the battlefie...,common,"Disturb, Transform",,Creature,"Human, Cleric"
1586,Mourning Patrol,W,3.0,{2}{W},2.0,3.0,Vigilance\nDisturb {3}{W} (You may cast this c...,common,"Disturb, Transform, Vigilance",,Creature,"Human, Soldier"
1595,Baithook Angler,U,2.0,{1}{U},2.0,1.0,Disturb {1}{U} (You may cast this card from yo...,common,"Disturb, Transform",,Creature,"Human, Peasant"


We have two exceptions with cards that come into play as lands and then can later be flipped into creatures.  I'll go ahead and manually set reasonable manaValues for those two, since it's such a small number.

In [None]:
creatures[creatures.manaCost.isna()].query('manaValue == 0')

In [None]:
# Manually assign mana values to the two land-to-creature exceptions.
# Column position 2 is `manaValue`; the first number is the row position.
# NOTE(review): the row positions are hard-coded — presumably taken from the
# output of the `manaValue == 0` query in the previous cell. They will point at
# different cards if the input data or any earlier filtering changes, so
# confirm them against that query before re-running.
creatures.iat[2989, 2] = 5
creatures.iat[1750, 2] = 3

In [None]:
creatures.info()

In [None]:
creatures[creatures.manaCost.isna()].sample(15)

At this point, we should have dealt with most or all of the incorrectly assigned manaValue creatures, though we are not accounting for non-mana costs.  Doing so would be significantly more complicated and goes beyond the scope of the current investigation, so I'm happy to move on to other things that need to be corrected.  At this point, we should not need the manaCost column any longer, so I'll drop it.

In [None]:
# The mana value column now carries everything we need from the cost string.
creatures = creatures.drop(columns=['manaCost'])

In [None]:
creatures.head()

There is a small subset of creatures whose power and/or toughness is listed as '*'.  This means that those values are variable and dependent on game-state conditions.  We've only got 34 of these creatures in our dataset of almost 4,000, comprising less than 1% of the creatures.  Rather than spend an inordinate amount of time accounting for these edge cases, I choose to drop them from the dataframe.

In [None]:
len(creatures[creatures.power.isnull() | creatures.toughness.isnull()])

In [None]:
# Remove creatures that are missing power or toughness, then renumber the rows.
creatures = (
    creatures
    .dropna(subset=['power', 'toughness'])
    .reset_index(drop=True)
)

In [None]:
creatures.info()

In [None]:
# One indicator column per keyword (the keywords cell is a ", "-separated list),
# appended to the right of the existing columns.
keyword_dummies = creatures['keywords'].str.get_dummies(sep=", ")
creatures = pd.concat([creatures, keyword_dummies], axis=1)

In [None]:
creatures.head()

In [None]:
creatures.shape

In [None]:
creatures.columns

In [None]:
creatures.types.unique()

In [None]:
# One indicator column per card type (the types cell is a ", "-separated list),
# appended to the right of the existing columns.
type_dummies = creatures['types'].str.get_dummies(sep=", ")
creatures = pd.concat([creatures, type_dummies], axis=1)

In [None]:
creatures.head()

In [None]:
creatures.shape

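**TODO (note to self):** every row in this dataframe is a creature, so the `Creature` indicator column should be 1 for 100% of rows and carries no information.  It probably needs to be dropped.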

In [None]:
#creatures.supertypes.unique()

In [None]:
#creatures = pd.concat([creatures, creatures.supertypes.str.get_dummies(sep=", ")], axis=1)

In [None]:
#creatures.shape

In [None]:
#creatures.subtypes.unique()

In [None]:
#creatures = pd.concat([creatures, creatures.subtypes.str.get_dummies(sep=", ")], axis=1)

In [None]:
#creatures.shape

In [None]:
#creatures.head()

In [None]:
#creatures = pd.concat([creatures, creatures.colors.str.get_dummies(sep=", ")], axis=1)

In [None]:
#creatures.shape

In [None]:
#creatures.colors.unique()

In [None]:
creatures_transformed = creatures.drop(columns=['text', 'rarity', 'keywords', 'supertypes', 'types', 'subtypes'])

In [None]:
creatures_transformed.head()

In [None]:
creatures_transformed.info(verbose=True)

In [None]:
creatures_transformed.describe()

This is looking like we could actually start to use it for some machine learning analysis, so I'm going to export it to a CSV and pick up in a new file.

In [None]:
creatures_transformed.to_csv('cleaned_datasets/creatures_transformed.csv')