In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the cleaned card table; the first CSV column is used as the row index
# (index_col=0) rather than being kept as a data column.
all_cards = pd.read_csv('cleaned_datasets/allsets_cleaned.csv', index_col=0)

In [3]:
all_cards.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aerial Modification,W,5.0,{4}{W},,,,Enchant creature or Vehicle\nAs long as enchan...,uncommon,Enchant,,Enchantment,Aura
1,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
2,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
3,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
4,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"


In [4]:
all_cards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7125 entries, 0 to 7124
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        7125 non-null   object 
 1   colors      7125 non-null   object 
 2   manaValue   7125 non-null   float64
 3   manaCost    6617 non-null   object 
 4   power       3867 non-null   float64
 5   toughness   3877 non-null   float64
 6   loyalty     168 non-null    float64
 7   text        7025 non-null   object 
 8   rarity      7125 non-null   object 
 9   keywords    3399 non-null   object 
 10  supertypes  763 non-null    object 
 11  types       7125 non-null   object 
 12  subtypes    4636 non-null   object 
dtypes: float64(4), object(9)
memory usage: 779.3+ KB


NaN values in the `keywords` column are to be expected, as many cards do not have keywords.  We will replace these with empty strings.

In [5]:
# Assign the filled column back to the frame.  Calling an inplace method on the
# `all_cards['keywords']` selection is chained assignment: pandas 2.x warns about
# it and, with Copy-on-Write enabled, the frame itself is left unchanged.
all_cards['keywords'] = all_cards['keywords'].fillna('')

I want to get a dataframe that includes only creatures, but first we have to deal with some edge cases.  The first is non-creature cards that contain the "Transform" keyword.  I handle this by writing a function that can extract the mana value for cards that are formatted in the following way: "{B}{B}{2}: Transform".  We find the Transform keyword in the text of the card, grab the string that immediately precedes it, calculate the additional cost to transform the card, and add it to the correct cell, which will, in all cases, be in the row immediately following the card with the Transform keyword.  This also fixes the mana value for the back sides of modal cards with the Transform keyword where both sides are creatures.

In [6]:
# Assign the filled column back to the frame.  Calling an inplace method on the
# `all_cards['text']` selection is chained assignment: pandas 2.x warns about it
# and, with Copy-on-Write enabled, the frame itself is left unchanged.
all_cards['text'] = all_cards['text'].fillna('')

In [7]:
all_cards.loc[all_cards.keywords.str.contains('Transform', case=False)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
1546,Extricator of Sin,W,3.0,{2}{W},0.0,3.0,,"When Extricator of Sin enters the battlefield,...",uncommon,"Delirium, Transform",,Creature,"Human, Cleric"
1557,Lone Rider,W,2.0,{1}{W},1.0,1.0,,"First strike, lifelink\nAt the beginning of th...",uncommon,"First strike, Lifelink, Transform",,Creature,"Human, Knight"
1579,Curious Homunculus,U,2.0,{1}{U},1.0,1.0,,{T}: Add {C}. Spend this mana only to cast an ...,uncommon,Transform,,Creature,Homunculus
1582,Docent of Perfection,U,5.0,{3}{U}{U},5.0,4.0,,Flying\nWhenever you cast an instant or sorcer...,rare,"Flying, Transform",,Creature,"Insect, Horror"
1590,Grizzled Angler,U,3.0,{2}{U},2.0,3.0,,{T}: Mill two cards. Then if there is a colorl...,uncommon,"Mill, Transform",,Creature,Human
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6809,Primal Wellspring,L,4.0,,,,,(Transforms from Primal Amulet.)\n{T}: Add one...,rare,Transform,,Land,
6814,Thaumatic Compass,C,2.0,{2},,,,"{3}, {T}: Search your library for a basic land...",rare,Transform,,Artifact,
6815,Spires of Orazca,L,2.0,,,,,(Transforms from Thaumatic Compass.)\n{T}: Add...,rare,Transform,,Land,
6816,Treasure Map,C,2.0,{2},,,,"{1}, {T}: Scry 1. Put a landmark counter on Tr...",rare,"Scry, Transform",,Artifact,


In [8]:
def backwards_extract(df, keyword):
    """Add the cost written in front of `keyword` to the row that follows.

    For every row whose ``text`` contains `keyword`, the run of brace-delimited
    symbols that ends directly before the first occurrence of the keyword
    (for example ``{5}{G}{G}: Transform``) is read as a mana cost.  Its mana
    value is added, in place, to column position 2 of the row immediately
    after the matching row, and the first four columns of that updated row
    are printed so the change can be checked by eye.

    A numeric symbol such as ``{5}`` or ``{10}`` counts as its number, the tap
    symbol ``{T}`` is ignored, and every other symbol counts as 1.  Rows where
    no cost precedes the keyword, or where there is no following row, are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column and a numeric column at position 2
        (``manaValue`` in this notebook).  Modified in place.
    keyword : str
        Literal, case-sensitive text to look for, e.g. ``'Transform'``.

    Returns
    -------
    None

    Notes
    -----
    The update uses ``+=``, so running this twice on the same frame adds the
    cost twice.
    """
    import re

    # positional column that receives the extra cost
    value_col = 2

    # one or more {..} symbols, optionally separated by commas/whitespace, that
    # run up to the keyword apart from an optional colon and whitespace
    cost_before_keyword = re.compile(r'((?:\{[^{}]+\}[,\s]*)+)[:\s]*$')

    # literal, NaN-safe match so the mask agrees with str.find() below
    mask = df['text'].str.contains(keyword, regex=False, na=False)

    # work with row positions, not index labels, because the target is
    # "the next row" and is written through the positional .iat accessor
    positions = mask.to_numpy().nonzero()[0]

    for position, text in zip(positions, df.loc[mask, 'text']):
        target = int(position) + 1
        if target >= len(df):
            # keyword found on the last row: there is no following card
            continue

        match = cost_before_keyword.search(text[:text.find(keyword)])
        if match is None:
            continue

        # drop the tap symbol; it is not mana
        symbols = [s for s in re.findall(r'\{([^{}]+)\}', match.group(1)) if s != 'T']
        if not symbols:
            continue

        extra_cost = sum(int(s) if s.isdecimal() else 1 for s in symbols)

        # update the value of the NEXT card in the dataframe, the one that
        # the identified card transforms into
        df.iat[target, value_col] += extra_cost

        # print the change so it can be confirmed
        print(df.iloc[target, :4])

In [9]:
backwards_extract(all_cards, 'Transform')

name         Conduit of Emrakul
colors                        C
manaValue                   8.0
manaCost                    NaN
Name: 1655, dtype: object
name         Erupting Dreadwolf
colors                        C
manaValue                  10.0
manaCost                    NaN
Name: 1675, dtype: object
name         Dronepack Kindred
colors                       C
manaValue                 12.0
manaCost                   NaN
Name: 1682, dtype: object
name         Sinuous Predator
colors                      C
manaValue                 6.0
manaCost                  NaN
Name: 1698, dtype: object
name         Howling Chorus
colors                    C
manaValue               9.0
manaCost                NaN
Name: 1703, dtype: object
name         Fibrous Entangler
colors                       C
manaValue                 11.0
manaCost                   NaN
Name: 1710, dtype: object
name         Ulvenwald Abomination
colors                           C
manaValue                      9.0
man

We have a very similar problem with cards that have the "Disturb" mechanic, with the added difficulty that the mechanic is templated differently than the Transform mechanic (the keyword "Disturb" precedes the mana cost, whereas the keyword "Transform" follows the mana cost).  So, below, I have a slightly tweaked function that grabs the mana value following the keyword, and we'll run that to update mana values for our Disturb creatures.

In [10]:
def extract_mana_value(df, keyword):
    """Set the following row's mana value from the cost written after `keyword`.

    For every row whose ``text`` contains `keyword`, the run of brace-delimited
    symbols that directly follows the first occurrence of the keyword (for
    example ``Disturb {4}{W}{W}``) is read as a mana cost.  Its mana value
    overwrites, in place, column position 2 of the row immediately after the
    matching row, and the first four columns of that updated row are printed
    so the change can be checked by eye.

    A numeric symbol such as ``{4}`` or ``{10}`` counts as its number, the tap
    symbol ``{T}`` is ignored, and every other symbol counts as 1.  Rows where
    no cost follows the keyword, or where there is no following row, are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column and a numeric column at position 2
        (``manaValue`` in this notebook).  Modified in place.
    keyword : str
        Literal, case-sensitive text to look for, e.g. ``'Disturb'``.

    Returns
    -------
    None
    """
    import re

    # positional column that receives the extracted cost
    value_col = 2

    # optional whitespace, then one or more adjacent {..} symbols
    cost_after_keyword = re.compile(r'\s*((?:\{[^{}]+\})+)')

    # literal, NaN-safe match so the mask agrees with str.find() below
    mask = df['text'].str.contains(keyword, regex=False, na=False)

    # work with row positions, not index labels, because the target is
    # "the next row" and is written through the positional .iat accessor
    positions = mask.to_numpy().nonzero()[0]

    for position, text in zip(positions, df.loc[mask, 'text']):
        target = int(position) + 1
        if target >= len(df):
            # keyword found on the last row: there is no following card
            continue

        # start reading right after the keyword, whatever its length
        cost_start = text.find(keyword) + len(keyword)
        match = cost_after_keyword.match(text, cost_start)
        if match is None:
            continue

        # drop the tap symbol; it is not mana
        symbols = [s for s in re.findall(r'\{([^{}]+)\}', match.group(1)) if s != 'T']
        if not symbols:
            continue

        # update the value in the dataframe
        df.iat[target, value_col] = sum(int(s) if s.isdecimal() else 1 for s in symbols)

        # print the change so it can be confirmed
        print(df.iloc[target, :4])

In [11]:
all_cards.loc[all_cards.text.str.contains('Disturb')].head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
3008,Beloved Beggar,W,2.0,{1}{W},0.0,4.0,,Disturb {4}{W}{W} (You may cast this card from...,uncommon,"Disturb, Transform",,Creature,"Human, Peasant"
3021,Chaplain of Alms,W,1.0,{W},1.0,1.0,,First strike\nWard {1} (Whenever this creature...,uncommon,"Disturb, First strike, Transform, Ward",,Creature,"Human, Cleric"
3037,Lunarch Veteran,W,1.0,{W},1.0,1.0,,Whenever another creature enters the battlefie...,common,"Disturb, Transform",,Creature,"Human, Cleric"
3039,Mourning Patrol,W,3.0,{2}{W},2.0,3.0,,Vigilance\nDisturb {3}{W} (You may cast this c...,common,"Disturb, Transform, Vigilance",,Creature,"Human, Soldier"
3054,Baithook Angler,U,2.0,{1}{U},2.0,1.0,,Disturb {1}{U} (You may cast this card from yo...,common,"Disturb, Transform",,Creature,"Human, Peasant"


In [12]:
extract_mana_value(all_cards, 'Disturb')

name         Generous Soul
colors                   W
manaValue              6.0
manaCost               NaN
Name: 3009, dtype: object
name         Chapel Shieldgeist
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 3022, dtype: object
name         Luminous Phantom
colors                      W
manaValue                 2.0
manaCost                  NaN
Name: 3038, dtype: object
name         Morning Apparition
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 3040, dtype: object
name         Hook-Haunt Drifter
colors                        U
manaValue                   2.0
manaCost                    NaN
Name: 3055, dtype: object
name         Ghostly Castigator
colors                        U
manaValue                   5.0
manaCost                    NaN
Name: 3059, dtype: object
name         Waildrifter
colors                 U
manaValue            5.0
manaCost             NaN
Name

With those adjustments made, we should be able to now grab a new dataframe that has only the creatures in it.

In [13]:
# Keep only rows whose type line mentions "Creature", as an independent copy
# renumbered from zero.
is_creature = all_cards.types.str.contains('Creature')
creatures = all_cards.loc[is_creature].copy().reset_index(drop=True)

In [14]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845 entries, 0 to 3844
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3845 non-null   object 
 1   colors      3845 non-null   object 
 2   manaValue   3845 non-null   float64
 3   manaCost    3698 non-null   object 
 4   power       3814 non-null   float64
 5   toughness   3824 non-null   float64
 6   loyalty     0 non-null      float64
 7   text        3845 non-null   object 
 8   rarity      3845 non-null   object 
 9   keywords    3845 non-null   object 
 10  supertypes  457 non-null    object 
 11  types       3845 non-null   object 
 12  subtypes    3845 non-null   object 
dtypes: float64(4), object(9)
memory usage: 390.6+ KB


In [15]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


The column "Loyalty" is a value that is unique to planeswalkers, so we're going to drop that right away.

In [16]:
# Remove the loyalty column from the creature frame.
creatures = creatures.drop(columns=['loyalty'])

In [17]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


I then look at creatures that have a mana value equal to 0.  The only one that should actually have a listed mana value of zero is the Ornithopter.  Most of the rest of them have variable casting costs, represented by the X.  We'll need to deal with them.

In [18]:
creatures[creatures.manaValue.eq(0)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
90,Ornithopter,C,0.0,{0},0.0,2.0,Flying,uncommon,Flying,,"Artifact, Creature",Thopter
96,Walking Ballista,C,0.0,{X}{X},0.0,0.0,Walking Ballista enters the battlefield with X...,rare,,,"Artifact, Creature",Construct
377,Endless One,C,0.0,{X},0.0,0.0,Endless One enters the battlefield with X +1/+...,rare,,,Creature,Eldrazi
784,Stonecoil Serpent,C,0.0,{X},0.0,0.0,"Reach, trample, protection from multicolored\n...",rare,"Protection, Reach, Trample",,"Artifact, Creature",Snake
1060,Chamber Sentry,C,0.0,{X},0.0,0.0,Chamber Sentry enters the battlefield with a +...,rare,,,"Artifact, Creature",Construct
1750,Creeping Inn,B,0.0,,3.0,7.0,"Whenever Creeping Inn attacks, you may exile a...",mythic,,,"Artifact, Creature","Horror, Construct"
2989,"Ormendahl, Profane Prince",B,0.0,,9.0,7.0,"Flying, lifelink, indestructible, haste",rare,"Flying, Haste, Indestructible, Lifelink",Legendary,Creature,Demon
3458,Ugin's Conjurant,C,0.0,{X},0.0,0.0,Ugin's Conjurant enters the battlefield with X...,uncommon,,,Creature,"Spirit, Monk"


In [19]:
# Assign the filled column back to the frame.  Calling an inplace method on the
# `creatures.manaCost` selection is chained assignment: pandas 2.x warns about it
# and, with Copy-on-Write enabled, the frame itself is left unchanged.
creatures['manaCost'] = creatures['manaCost'].fillna('')

By searching for cards that have an 'X' in their manaCost, we can identify cards with a variable cost to cast.  There are a small number of them, so we can safely drop them from the dataframe, rather than do a ton of work to handle this small number of edge cases.

In [20]:
len(creatures[creatures.manaCost.str.contains('X')])

21

In [21]:
# Discard every creature whose mana cost contains an X, then renumber the rows.
has_variable_cost = creatures.manaCost.str.contains('X')
creatures = creatures.loc[~has_variable_cost].reset_index(drop=True)

In [22]:
len(creatures[creatures.manaCost.str.contains('X')])

0

In [23]:
creatures[creatures.manaValue.eq(0)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
90,Ornithopter,C,0.0,{0},0.0,2.0,Flying,uncommon,Flying,,"Artifact, Creature",Thopter
1738,Creeping Inn,B,0.0,,3.0,7.0,"Whenever Creeping Inn attacks, you may exile a...",mythic,,,"Artifact, Creature","Horror, Construct"
2971,"Ormendahl, Profane Prince",B,0.0,,9.0,7.0,"Flying, lifelink, indestructible, haste",rare,"Flying, Haste, Indestructible, Lifelink",Legendary,Creature,Demon


We have two exceptions with cards that come into play as lands and then can later be flipped into creatures.  I'll go ahead and manually set reasonable manaValues for those two, since it's such a small number.

In [24]:
# Hand-picked mana values for the two zero-cost back faces listed above.
# The cards are selected by name (and only while their manaValue is still 0)
# instead of by hard-coded row position, so the right rows are updated even if
# an earlier filtering step changes how many rows precede them.
manual_mana_values = {
    'Creeping Inn': 5,
    'Ormendahl, Profane Prince': 3,
}
for card_name, mana_value in manual_mana_values.items():
    needs_value = creatures['name'].eq(card_name) & creatures['manaValue'].eq(0)
    creatures.loc[needs_value, 'manaValue'] = mana_value

In [25]:
creatures[creatures.manaValue.eq(0)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
90,Ornithopter,C,0.0,{0},0.0,2.0,Flying,uncommon,Flying,,"Artifact, Creature",Thopter


There are 147 rows with no value for manaCost.  We have zero rows that have NaN values in manaValue, though, so these are almost definitely the backs of modal cards that transform through one game mechanic or another.  The amount of resources spent to cast the card is accurately reflected in the manaValue column, though additional conditions for flipping the card are not going to be considered at this stage of our investigation.  Still, we'll do the same as we did above and check that this is true of about 10% of the data before dropping the column to make sure we didn't inadvertently introduce any errors.

In [26]:
len(creatures[creatures.manaCost.eq('')])

147

In [27]:
creatures[creatures.manaCost.eq('')].sample(15)

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
873,Erupting Dreadwolf,C,10.0,,6.0,4.0,"Whenever Erupting Dreadwolf attacks, it deals ...",uncommon,,,Creature,"Eldrazi, Werewolf"
3342,Bloodbat Summoner,B,2.0,,3.0,3.0,Flying\nAt the beginning of combat on your tur...,rare,Flying,,Creature,"Vampire, Wizard"
1657,Fangblade Eviscerator,R,4.0,,4.0,5.0,{1}{R}: Fangblade Eviscerator gets +1/+0 and g...,uncommon,Nightbound,,Creature,Werewolf
2249,Echo of Death's Wail,B,2.0,,3.0,3.0,"Flying, haste\nWhen Echo of Death's Wail enter...",rare,"Flying, Haste",,"Enchantment, Creature",Spirit
1577,Morning Apparition,W,4.0,,2.0,1.0,"Flying, vigilance\nIf Morning Apparition would...",common,"Flying, Vigilance",,Creature,"Spirit, Soldier"
894,Howling Chorus,C,9.0,,3.0,5.0,Creatures with power less than Howling Chorus'...,uncommon,,,Creature,"Eldrazi, Werewolf"
889,Sinuous Predator,C,6.0,,4.0,4.0,Sinuous Predator can't be blocked by more than...,uncommon,,,Creature,"Eldrazi, Werewolf"
3375,Volt-Charged Berserker,R,2.0,,4.0,3.0,Volt-Charged Berserker can't block.,uncommon,,,Creature,"Human, Berserker"
2211,Living Breakthrough,U,4.0,,3.0,3.0,"Flying\nWhenever you cast a spell, your oppone...",rare,Flying,,"Enchantment, Creature",Moonfolk
2865,Unimpeded Trespasser,U,3.0,,3.0,3.0,Unimpeded Trespasser can't be blocked.,uncommon,,,Creature,Spirit


In [28]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3824 entries, 0 to 3823
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3824 non-null   object 
 1   colors      3824 non-null   object 
 2   manaValue   3824 non-null   float64
 3   manaCost    3824 non-null   object 
 4   power       3794 non-null   float64
 5   toughness   3804 non-null   float64
 6   text        3824 non-null   object 
 7   rarity      3824 non-null   object 
 8   keywords    3824 non-null   object 
 9   supertypes  452 non-null    object 
 10  types       3824 non-null   object 
 11  subtypes    3824 non-null   object 
dtypes: float64(3), object(9)
memory usage: 358.6+ KB


In [29]:
creatures[creatures.manaCost.eq('')].sample(15)

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
2923,Moonrise Intruder,R,1.0,,2.0,2.0,Menace (This creature can't be blocked except ...,uncommon,"Menace, Transform",,Creature,Werewolf
2898,Branded Howler,R,3.0,,4.0,4.0,"At the beginning of each upkeep, if a player c...",common,Transform,,Creature,Werewolf
3375,Volt-Charged Berserker,R,2.0,,4.0,3.0,Volt-Charged Berserker can't block.,uncommon,,,Creature,"Human, Berserker"
802,Extricator of Flesh,C,3.0,,3.0,5.0,"Eldrazi you control have vigilance.\n{2}, {T},...",uncommon,,,Creature,"Eldrazi, Horror"
1633,Inherited Fiend,B,2.0,,4.0,4.0,Flying\n{2}{B}: Exile target creature card fro...,uncommon,Flying,,Creature,Demon
2311,Vessel of the All-Consuming,"B, R",3.0,,3.0,3.0,Trample\nWhenever Vessel of the All-Consuming ...,mythic,Trample,,"Enchantment, Creature","Ogre, Shaman"
2283,Branch of Boseiju,G,4.0,,0.0,0.0,Reach\nBranch of Boseiju gets +1/+1 for each l...,uncommon,Reach,,"Enchantment, Creature",Plant
2896,Neck Breaker,R,3.0,,4.0,3.0,Attacking creatures you control get +1/+0 and ...,uncommon,Transform,,Creature,Werewolf
3408,Ulvenwald Behemoth,G,11.0,,8.0,8.0,"Trample, haste\nOther creatures you control ge...",rare,"Haste, Trample",,Creature,"Beast, Horror"
3412,Wedding Crasher,G,6.0,,6.0,5.0,Whenever Wedding Crasher or another Wolf or We...,uncommon,Nightbound,,Creature,Werewolf


At this point, we should have dealt with most or all of the incorrectly assigned manaValue creatures, though we are not accounting for non-mana costs.  Doing so would be significantly more complicated and goes beyond the scope of the current investigation, so I'm happy to move on to other things that need to be corrected.  At this point, we should not need the manaCost column any longer, so I'll drop it.

In [30]:
# manaCost is no longer needed once manaValue has been corrected.
creatures = creatures.drop(columns=['manaCost'])

In [31]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


There is a small subset of creatures whose power and/or toughness is listed as an asterisk.  This means that those values are variable and dependent on game-state conditions.  We've only got 33 of these creatures in our dataset of almost 4,000, comprising less than 1% of the creatures.  Rather than spend an inordinate amount of time accounting for these edge cases, I choose to drop them from the dataframe.

In [32]:
creatures[creatures.power.isnull() | creatures.toughness.isnull()].head(10)

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,subtypes
351,Enigma Drake,"R, U",3.0,,4.0,Flying\nEnigma Drake's power is equal to the n...,uncommon,Flying,,Creature,Drake
443,Vile Aggregate,C,3.0,,5.0,Devoid (This card has no color.)\nVile Aggrega...,uncommon,"Devoid, Ingest, Trample",,Creature,"Eldrazi, Drone"
499,Veteran Warleader,"G, W",3.0,,,Veteran Warleader's power and toughness are ea...,rare,,,Creature,"Human, Soldier, Ally"
676,"Syr Elenora, the Discerning",U,5.0,,4.0,"Syr Elenora, the Discerning's power is equal t...",uncommon,,Legendary,Creature,"Human, Knight"
724,Beanstalk Giant,G,7.0,,,Beanstalk Giant's power and toughness are each...,uncommon,,,Creature,Giant
756,Wintermoor Commander,"B, W",2.0,2.0,,Deathtouch\nWintermoor Commander's toughness i...,uncommon,Deathtouch,,Creature,"Human, Knight"
775,Shambling Suit,C,3.0,,3.0,Shambling Suit's power is equal to the number ...,uncommon,,,"Artifact, Creature",Construct
1012,Crackling Drake,"R, U",4.0,,4.0,Flying\nCrackling Drake's power is equal to th...,uncommon,Flying,,Creature,Drake
1090,Apocalypse Demon,B,6.0,,,Flying\nApocalypse Demon's power and toughness...,rare,Flying,,Creature,Demon
1128,Majestic Myriarch,G,5.0,,,Majestic Myriarch's power and toughness are ea...,mythic,,,Creature,Chimera


In [33]:
len(creatures[creatures.power.isnull() | creatures.toughness.isnull()])

33

In [34]:
# Drop creatures that are missing power or toughness, then renumber the rows.
creatures = creatures.dropna(subset=['power', 'toughness']).reset_index(drop=True)

In [35]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3791 entries, 0 to 3790
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3791 non-null   object 
 1   colors      3791 non-null   object 
 2   manaValue   3791 non-null   float64
 3   power       3791 non-null   float64
 4   toughness   3791 non-null   float64
 5   text        3791 non-null   object 
 6   rarity      3791 non-null   object 
 7   keywords    3791 non-null   object 
 8   supertypes  442 non-null    object 
 9   types       3791 non-null   object 
 10  subtypes    3791 non-null   object 
dtypes: float64(3), object(8)
memory usage: 325.9+ KB


In [None]:
# Creating a copy of our cleaned creature dataset before we do categorical transformations.
# creatures.to_csv('cleaned_datasets/creatures_only.csv')

As we saw in our EDA, certain keywords like flying, haste, trample and flash can be fairly indicative of a creature's color identity.  We want to create binary columns to indicate the presence of these categorical features, so we'll use the `get_dummies` method to achieve this.

In [36]:
# One indicator column per keyword, appended to the right of the frame.
keyword_dummies = creatures.keywords.str.get_dummies(sep=", ")
creatures = pd.concat([creatures, keyword_dummies], axis=1)

In [37]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Support,Surge,Surveil,Training,Trample,Transform,Undergrowth,Venture,Vigilance,Ward
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,0,0,0,1,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,0,0


In [38]:
creatures.shape

(3791, 95)

In [39]:
creatures.columns

Index(['name', 'colors', 'manaValue', 'power', 'toughness', 'text', 'rarity',
       'keywords', 'supertypes', 'types', 'subtypes', 'Adamant', 'Adapt',
       'Afflict', 'Afterlife', 'Alliance', 'Amass', 'Ascend', 'Blitz', 'Boast',
       'Changeling', 'Channel', 'Cohort', 'Companion', 'Connive',
       'Constellation', 'Converge', 'Convoke', 'Coven', 'Cycling', 'Daybound',
       'Deathtouch', 'Defender', 'Delirium', 'Devoid', 'Disturb',
       'Double strike', 'Embalm', 'Emerge', 'Enrage', 'Equip', 'Escape',
       'Eternalize', 'Exert', 'Exploit', 'Explore', 'Fabricate', 'Fight',
       'First strike', 'Flash', 'Flying', 'Foretell', 'Haste', 'Hexproof',
       'Hexproof from', 'Improvise', 'Indestructible', 'Ingest', 'Investigate',
       'Kicker', 'Landfall', 'Learn', 'Lifelink', 'Madness', 'Magecraft',
       'Meld', 'Menace', 'Mentor', 'Mill', 'Mutate', 'Nightbound', 'Ninjutsu',
       'Pack tactics', 'Proliferate', 'Protection', 'Prowess', 'Raid', 'Rally',
       'Reach', 'Recon

Similarly, Artifact and Enchantment types are going to end up being important to our analysis, so we'll create dummy values for those.

In [40]:
creatures.types.unique()

array(['Creature', 'Artifact, Creature', 'Enchantment, Creature'],
      dtype=object)

In [41]:
# One indicator column per card type, appended to the right of the frame.
type_dummies = creatures.types.str.get_dummies(sep=", ")
creatures = pd.concat([creatures, type_dummies], axis=1)

In [42]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Training,Trample,Transform,Undergrowth,Venture,Vigilance,Ward,Artifact,Creature,Enchantment
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,1,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,1,0,0,1,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,1,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,1,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,1,0


In [43]:
creatures.shape

(3791, 98)

In [44]:
creatures.columns

Index(['name', 'colors', 'manaValue', 'power', 'toughness', 'text', 'rarity',
       'keywords', 'supertypes', 'types', 'subtypes', 'Adamant', 'Adapt',
       'Afflict', 'Afterlife', 'Alliance', 'Amass', 'Ascend', 'Blitz', 'Boast',
       'Changeling', 'Channel', 'Cohort', 'Companion', 'Connive',
       'Constellation', 'Converge', 'Convoke', 'Coven', 'Cycling', 'Daybound',
       'Deathtouch', 'Defender', 'Delirium', 'Devoid', 'Disturb',
       'Double strike', 'Embalm', 'Emerge', 'Enrage', 'Equip', 'Escape',
       'Eternalize', 'Exert', 'Exploit', 'Explore', 'Fabricate', 'Fight',
       'First strike', 'Flash', 'Flying', 'Foretell', 'Haste', 'Hexproof',
       'Hexproof from', 'Improvise', 'Indestructible', 'Ingest', 'Investigate',
       'Kicker', 'Landfall', 'Learn', 'Lifelink', 'Madness', 'Magecraft',
       'Meld', 'Menace', 'Mentor', 'Mill', 'Mutate', 'Nightbound', 'Ninjutsu',
       'Pack tactics', 'Proliferate', 'Protection', 'Prowess', 'Raid', 'Rally',
       'Reach', 'Recon

In [45]:
creatures.supertypes.unique()

array([nan, 'Legendary', 'Snow', 'Legendary, Snow'], dtype=object)

In [None]:
#creatures = pd.concat([creatures, creatures.supertypes.str.get_dummies(sep=", ")], axis=1)

In [None]:
#creatures.shape

In [46]:
creatures.subtypes.unique()

array(['Human, Pilot', 'Dwarf, Artificer', 'Dwarf, Scout', 'Dwarf, Rogue',
       'Dwarf, Soldier', 'Bird', 'Dwarf, Warrior', 'Angel', 'Cat, Beast',
       'Dwarf, Advisor', 'Vedalken, Artificer', 'Whale', 'Human, Wizard',
       'Drake', 'Human, Artificer', 'Crab', 'Vedalken, Rogue', 'Fish',
       'Human, Pirate', 'Aetherborn, Rogue', 'Aetherborn, Artificer',
       'Insect', 'Human, Rogue', 'Aetherborn, Vampire', 'Demon',
       'Aetherborn, Warrior', 'Human, Warrior', 'Giant', 'Dragon',
       'Lizard', 'Gremlin', 'Elf, Artificer, Druid', 'Cat', 'Elf, Druid',
       'Elephant', 'Elf, Warrior', 'Human, Druid', 'Pangolin, Beast',
       'Cat, Monkey', 'Elf, Archer', 'Boar', 'Dwarf, Pilot',
       'Human, Soldier', 'Snake', 'Construct', 'Juggernaut',
       'Assembly-Worker', 'Thopter', 'Shapeshifter', 'Elf, Soldier',
       'Golem', 'Pegasus', 'Dog', 'Unicorn', 'Gargoyle', 'Human, Cleric',
       'Orc, Knight', 'Human, Monk', 'Jellyfish', 'Dwarf, Ranger',
       'Spirit, Knight', 'Hu

In [48]:
# NOTE(review): execution count 47 is absent from the notebook, yet the frame
# grows from 98 to 289 columns between the previous shape check and this one.
# The subtype one-hot step therefore appears to exist only in kernel state, so
# it is restored here to keep Restart & Run All reproducible - confirm against
# the original cell.  Only columns not already present are added, which makes
# this a no-op if the subtype columns exist and safe to re-run.
subtype_dummies = creatures.subtypes.str.get_dummies(sep=", ")
new_subtype_columns = subtype_dummies.columns.difference(creatures.columns, sort=False)
creatures = pd.concat([creatures, subtype_dummies[new_subtype_columns]], axis=1)
creatures.shape

(3791, 289)

In [49]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,0,0,0,0,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#creatures = pd.concat([creatures, creatures.colors.str.get_dummies(sep=", ")], axis=1)

In [None]:
#creatures.shape

In [None]:
#creatures.colors.unique()

In [50]:
# Source columns that have been one-hot encoded (or are not used further),
# plus the 'Creature' indicator column.
columns_to_drop = ['rarity', 'keywords', 'supertypes', 'types', 'subtypes', 'Creature']
creatures_transformed = creatures.drop(columns=columns_to_drop)

In [51]:
creatures_transformed.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,Adamant,Adapt,Afflict,Afterlife,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
creatures_transformed.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3791 entries, 0 to 3790
Data columns (total 283 columns):
 #    Column           Dtype  
---   ------           -----  
 0    name             object 
 1    colors           object 
 2    manaValue        float64
 3    power            float64
 4    toughness        float64
 5    text             object 
 6    Adamant          int64  
 7    Adapt            int64  
 8    Afflict          int64  
 9    Afterlife        int64  
 10   Alliance         int64  
 11   Amass            int64  
 12   Ascend           int64  
 13   Blitz            int64  
 14   Boast            int64  
 15   Changeling       int64  
 16   Channel          int64  
 17   Cohort           int64  
 18   Companion        int64  
 19   Connive          int64  
 20   Constellation    int64  
 21   Converge         int64  
 22   Convoke          int64  
 23   Coven            int64  
 24   Cycling          int64  
 25   Daybound         int64  
 26   Deathtouch       i

In [53]:
creatures_transformed.describe()

Unnamed: 0,manaValue,power,toughness,Adamant,Adapt,Afflict,Afterlife,Alliance,Amass,Ascend,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
count,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,...,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0
mean,3.432867,2.682933,2.906621,0.001846,0.003165,0.002374,0.002638,0.002638,0.002374,0.003693,...,0.000528,0.027697,0.001319,0.063308,0.010288,0.000528,0.000264,0.004748,0.000264,0.045634
std,1.59076,1.585795,1.590949,0.042937,0.05618,0.048673,0.051299,0.051299,0.048673,0.060665,...,0.022966,0.164125,0.036298,0.243548,0.100918,0.022966,0.016241,0.068752,0.016241,0.208718
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.0,13.0,17.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


This is looking like we could actually start to use it for some machine learning analysis, so I'm going to export it to a CSV and pick up in a new file.

In [None]:
# Persist the transformed creature frame for the follow-up notebook.
# to_csv is called with its defaults, so the row index is written as the first
# column; read it back with index_col=0 to avoid an extra "Unnamed: 0" column.
creatures_transformed.to_csv('cleaned_datasets/creatures_transformed.csv')