In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
all_cards = pd.read_csv('cleaned_datasets/allsets_cleaned.csv', index_col=0)

In [3]:
all_cards.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aerial Modification,W,5.0,{4}{W},,,,Enchant creature or Vehicle\nAs long as enchan...,uncommon,Enchant,,Enchantment,Aura
1,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
2,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
3,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
4,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"


In [4]:
all_cards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7023 entries, 0 to 7022
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        7023 non-null   object 
 1   colors      7023 non-null   object 
 2   manaValue   7023 non-null   float64
 3   manaCost    6515 non-null   object 
 4   power       3847 non-null   float64
 5   toughness   3857 non-null   float64
 6   loyalty     167 non-null    float64
 7   text        6923 non-null   object 
 8   rarity      7023 non-null   object 
 9   keywords    3376 non-null   object 
 10  supertypes  753 non-null    object 
 11  types       7023 non-null   object 
 12  subtypes    4610 non-null   object 
dtypes: float64(4), object(9)
memory usage: 768.1+ KB


NaN values in the `keywords` column are to be expected, as many cards do not have keywords.  We will replace these with empty strings.

In [5]:
# Assign the filled column back to the frame.  Calling fillna(inplace=True) on
# a column selection is a chained in-place operation, which does not update the
# parent frame once pandas' copy-on-write behaviour is in effect.
all_cards['keywords'] = all_cards['keywords'].fillna('')

I want to get a dataframe that includes only creatures, but first we have to deal with some edge cases.  The first is non-creature cards that contain the "Transform" keyword.  I handle this by writing a function that can extract the mana value from card text formatted in the following way: "{2}{B}{B}: Transform".  We find the Transform keyword in the text of the card, grab the string that immediately precedes it, calculate the additional cost to transform the card, and add it to the correct cell, which will, in all cases, be the row immediately following the card with the Transform keyword.  This also fixes the mana value for the back sides of modal cards with the Transform keyword where both faces are creatures.

In [6]:
# Assign the filled column back to the frame.  Calling fillna(inplace=True) on
# a column selection is a chained in-place operation, which does not update the
# parent frame once pandas' copy-on-write behaviour is in effect.
all_cards['text'] = all_cards['text'].fillna('')

In [7]:
all_cards.loc[all_cards.keywords.str.contains('Transform', case=False)]

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
1517,Extricator of Sin,W,3.0,{2}{W},0.0,3.0,,"When Extricator of Sin enters the battlefield,...",uncommon,"Delirium, Transform",,Creature,"Human, Cleric"
1528,Lone Rider,W,2.0,{1}{W},1.0,1.0,,"First strike, lifelink\nAt the beginning of th...",uncommon,"First strike, Lifelink, Transform",,Creature,"Human, Knight"
1550,Curious Homunculus,U,2.0,{1}{U},1.0,1.0,,{T}: Add {C}. Spend this mana only to cast an ...,uncommon,Transform,,Creature,Homunculus
1553,Docent of Perfection,U,5.0,{3}{U}{U},5.0,4.0,,Flying\nWhenever you cast an instant or sorcer...,rare,"Flying, Transform",,Creature,"Insect, Horror"
1561,Grizzled Angler,U,3.0,{2}{U},2.0,3.0,,{T}: Mill two cards. Then if there is a colorl...,uncommon,"Mill, Transform",,Creature,Human
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6712,Primal Wellspring,L,4.0,,,,,(Transforms from Primal Amulet.)\n{T}: Add one...,rare,Transform,,Land,
6717,Thaumatic Compass,C,2.0,{2},,,,"{3}, {T}: Search your library for a basic land...",rare,Transform,,Artifact,
6718,Spires of Orazca,L,2.0,,,,,(Transforms from Thaumatic Compass.)\n{T}: Add...,rare,Transform,,Land,
6719,Treasure Map,C,2.0,{2},,,,"{1}, {T}: Scry 1. Put a landmark counter on Tr...",rare,"Scry, Transform",,Artifact,


In [8]:
def backwards_extract(df, keyword):
    """Add the mana cost written before ``keyword`` to the following row.

    For every row whose ``text`` contains ``keyword``, the first occurrence of
    the keyword is located and the run of brace-delimited cost symbols written
    immediately before it (for example ``"{5}{R}{R}: Transform"``) is turned
    into a number.  That number is added, in place, to the ``manaValue`` of
    the row positioned directly after the matching row, i.e. the card the
    matching card transforms into.

    Cost symbols are counted as follows: a numeric symbol such as ``{5}`` or
    ``{10}`` contributes its integer value, the tap symbol ``{T}`` contributes
    nothing, and every other symbol contributes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text`` and ``manaValue`` columns.  Modified in place.
    keyword : str
        Literal text to search for (not interpreted as a regular expression).

    Returns
    -------
    None
        The first four columns of every updated row are printed so the
        changes can be checked by eye.

    Notes
    -----
    The update is additive, so running this twice on the same frame applies
    each cost twice.  Rows where no cost precedes the keyword, and a match on
    the very last row (which has no following row), are skipped.
    """
    import re

    # A run of {..} symbols, optionally separated by commas/whitespace, that
    # ends (after an optional colon) exactly where the keyword begins.
    cost_before_keyword = re.compile(r'((?:\{[^{}]+\}[,\s]*)+):?\s*$')
    symbol_pattern = re.compile(r'\{([^{}]+)\}')

    # Resolve the target column by name rather than by a fixed position.
    mana_col = df.columns.get_loc('manaValue')

    # Build the mask once; match literally so it agrees with str.find below.
    mask = df['text'].str.contains(keyword, regex=False, na=False)

    # Work with row positions so the "next row" lookup does not depend on the
    # index labels happening to be 0..n-1.
    positions = mask.to_numpy().nonzero()[0]

    for position, text in zip(positions, df.loc[mask, 'text']):
        target = int(position) + 1
        if target >= len(df):
            # The matching card is the last row: there is no row to update.
            continue

        # Only the text in front of the first occurrence is inspected.
        prefix = text[:text.find(keyword)]
        cost_match = cost_before_keyword.search(prefix)
        if cost_match is None:
            # Nothing that looks like a mana cost directly before the keyword.
            continue

        # Calculate the value to add to the dataframe.
        new_value = 0
        for symbol in symbol_pattern.findall(cost_match.group(1)):
            if symbol.isdecimal():
                new_value += int(symbol)
            elif symbol != 'T':
                new_value += 1

        # Update the value of the NEXT card in the dataframe, the one that
        # the identified card transforms into.
        df.iat[target, mana_col] += new_value

        # Print the change so we can confirm we got it right.
        print(df.iloc[target, :4])

In [9]:
backwards_extract(all_cards, 'Transform')

name         Conduit of Emrakul
colors                        C
manaValue                   8.0
manaCost                    NaN
Name: 1625, dtype: object
name         Erupting Dreadwolf
colors                        C
manaValue                  10.0
manaCost                    NaN
Name: 1645, dtype: object
name         Dronepack Kindred
colors                       C
manaValue                 12.0
manaCost                   NaN
Name: 1652, dtype: object
name         Sinuous Predator
colors                      C
manaValue                 6.0
manaCost                  NaN
Name: 1668, dtype: object
name         Howling Chorus
colors                    C
manaValue               9.0
manaCost                NaN
Name: 1673, dtype: object
name         Fibrous Entangler
colors                       C
manaValue                 11.0
manaCost                   NaN
Name: 1680, dtype: object
name         Ulvenwald Abomination
colors                           C
manaValue                      9.0
man

We have a very similar problem with cards that have the "Disturb" mechanic, with the added difficulty that the mechanic is templated differently from the Transform mechanic: the keyword "Disturb" precedes the mana cost, whereas the keyword "Transform" follows the mana cost.  So, below, I have a slightly tweaked function that grabs the mana value following the keyword, and we'll run that to update mana values for our Disturb creatures.

In [10]:
def extract_mana_value(df, keyword):
    """Set the following row's mana value from the cost written after ``keyword``.

    For every row whose ``text`` contains ``keyword``, the first occurrence of
    the keyword is located and the run of brace-delimited cost symbols that
    directly follows it (for example ``"Disturb {4}{W}{W}"``) is turned into a
    number.  That number overwrites, in place, the ``manaValue`` of the row
    positioned directly after the matching row.

    Cost symbols are counted as follows: a numeric symbol such as ``{4}`` or
    ``{10}`` contributes its integer value, the tap symbol ``{T}`` contributes
    nothing, and every other symbol contributes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text`` and ``manaValue`` columns.  Modified in place.
    keyword : str
        Literal text to search for (not interpreted as a regular expression).
        Any keyword length is supported.

    Returns
    -------
    None
        The first four columns of every updated row are printed so the
        changes can be checked by eye.

    Notes
    -----
    Rows where the keyword is not directly followed by a cost, and a match on
    the very last row (which has no following row), are skipped.
    """
    import re

    # Optional whitespace, then one or more contiguous {..} symbols.
    cost_after_keyword = re.compile(r'\s*((?:\{[^{}]+\})+)')
    symbol_pattern = re.compile(r'\{([^{}]+)\}')

    # Resolve the target column by name rather than by a fixed position.
    mana_col = df.columns.get_loc('manaValue')

    # Build the mask once; match literally so it agrees with str.find below.
    mask = df['text'].str.contains(keyword, regex=False, na=False)

    # Work with row positions so the "next row" lookup does not depend on the
    # index labels happening to be 0..n-1.
    positions = mask.to_numpy().nonzero()[0]

    for position, text in zip(positions, df.loc[mask, 'text']):
        target = int(position) + 1
        if target >= len(df):
            # The matching card is the last row: there is no row to update.
            continue

        # Start reading right after the keyword, whatever its length.
        cost_start = text.find(keyword) + len(keyword)
        cost_match = cost_after_keyword.match(text, cost_start)
        if cost_match is None:
            # The keyword is mentioned without a cost; leave the frame alone.
            continue

        # Calculate the value to use in the dataframe.
        new_value = 0
        for symbol in symbol_pattern.findall(cost_match.group(1)):
            if symbol.isdecimal():
                new_value += int(symbol)
            elif symbol != 'T':
                new_value += 1

        # Update the value in the dataframe.
        df.iat[target, mana_col] = new_value

        # Print the change so we can confirm we got it right.
        print(df.iloc[target, :4])

In [11]:
all_cards.loc[all_cards.text.str.contains('Disturb')].head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
2967,Beloved Beggar,W,2.0,{1}{W},0.0,4.0,,Disturb {4}{W}{W} (You may cast this card from...,uncommon,"Disturb, Transform",,Creature,"Human, Peasant"
2980,Chaplain of Alms,W,1.0,{W},1.0,1.0,,First strike\nWard {1} (Whenever this creature...,uncommon,"Disturb, First strike, Transform, Ward",,Creature,"Human, Cleric"
2996,Lunarch Veteran,W,1.0,{W},1.0,1.0,,Whenever another creature enters the battlefie...,common,"Disturb, Transform",,Creature,"Human, Cleric"
2998,Mourning Patrol,W,3.0,{2}{W},2.0,3.0,,Vigilance\nDisturb {3}{W} (You may cast this c...,common,"Disturb, Transform, Vigilance",,Creature,"Human, Soldier"
3013,Baithook Angler,U,2.0,{1}{U},2.0,1.0,,Disturb {1}{U} (You may cast this card from yo...,common,"Disturb, Transform",,Creature,"Human, Peasant"


In [12]:
extract_mana_value(all_cards, 'Disturb')

name         Generous Soul
colors                   W
manaValue              6.0
manaCost               NaN
Name: 2968, dtype: object
name         Chapel Shieldgeist
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 2981, dtype: object
name         Luminous Phantom
colors                      W
manaValue                 2.0
manaCost                  NaN
Name: 2997, dtype: object
name         Morning Apparition
colors                        W
manaValue                   4.0
manaCost                    NaN
Name: 2999, dtype: object
name         Hook-Haunt Drifter
colors                        U
manaValue                   2.0
manaCost                    NaN
Name: 3014, dtype: object
name         Ghostly Castigator
colors                        U
manaValue                   5.0
manaCost                    NaN
Name: 3018, dtype: object
name         Waildrifter
colors                 U
manaValue            5.0
manaCost             NaN
Name

With those adjustments made, we should be able to now grab a new dataframe that has only the creatures in it.

In [13]:
# Keep only the rows whose type line mentions "Creature" and renumber them
# from zero so later positional work starts from a clean index.
creatures = (
    all_cards[all_cards['types'].str.contains('Creature')]
    .reset_index(drop=True)
)

In [14]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3824 entries, 0 to 3823
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3824 non-null   object 
 1   colors      3824 non-null   object 
 2   manaValue   3824 non-null   float64
 3   manaCost    3677 non-null   object 
 4   power       3794 non-null   float64
 5   toughness   3804 non-null   float64
 6   loyalty     0 non-null      float64
 7   text        3824 non-null   object 
 8   rarity      3824 non-null   object 
 9   keywords    3824 non-null   object 
 10  supertypes  452 non-null    object 
 11  types       3824 non-null   object 
 12  subtypes    3824 non-null   object 
dtypes: float64(4), object(9)
memory usage: 388.5+ KB


In [15]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,loyalty,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


The column "Loyalty" is a value that is unique to planeswalkers, so we're going to drop that right away.

In [16]:
# Remove the loyalty column from the creature frame.
creatures = creatures.drop(columns=['loyalty'])

In [17]:
creatures.head()

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,{3}{W},3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,{3}{W},2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,{1}{W},3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,{3}{W}{W},4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,{1}{W},3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


There are 147 rows with no value for manaCost.  We have zero rows that have NaN values in manaValue, though, so these are almost definitely the backs of modal cards that transform through one game mechanic or another.  The amount of resources spent to cast the card is accurately reflected in the manaValue column, though additional conditions for flipping the card are not going to be considered at this stage of our investigation.  Still, we'll do the same as we did above and check that this is true of about 10% of the data before dropping the column to make sure we didn't inadvertently introduce any errors.

In [18]:
# Assign the filled column back to the frame.  Calling fillna(inplace=True) on
# a column selection is a chained in-place operation, which does not update the
# parent frame once pandas' copy-on-write behaviour is in effect.
creatures['manaCost'] = creatures['manaCost'].fillna('')

In [19]:
len(creatures[creatures.manaCost.eq('')])

147

In [20]:
# Fix the seed so the spot-check shows the same rows on every re-run.
creatures[creatures.manaCost.eq('')].sample(15, random_state=0)

Unnamed: 0,name,colors,manaValue,manaCost,power,toughness,text,rarity,keywords,supertypes,types,subtypes
1625,Leeching Lurker,B,3.0,,4.0,4.0,Lifelink\nNightbound (If a player casts at lea...,rare,"Lifelink, Nightbound",,Creature,"Leech, Horror"
1676,Tavern Smasher,R,4.0,,6.0,5.0,Nightbound (If a player casts at least two spe...,common,Nightbound,,Creature,Werewolf
1722,Lord of the Ulvenwald,"G, R",2.0,,3.0,3.0,Other Wolves and Werewolves you control get +1...,uncommon,Nightbound,,Creature,Werewolf
1678,Village Reavers,R,5.0,,5.0,4.0,Wolves and Werewolves you control have haste.\...,uncommon,"Haste, Nightbound",,Creature,Werewolf
2920,Skin Shedder,R,1.0,,3.0,4.0,,uncommon,,,Creature,"Insect, Horror"
3266,Cackling Culprit,B,2.0,,3.0,5.0,Whenever Cackling Culprit or another creature ...,uncommon,,,Creature,"Human, Rogue"
3318,Depraved Harvester,B,3.0,,4.0,3.0,Lifelink,common,Lifelink,,Creature,"Human, Knight"
2926,Ancient of the Equinox,G,3.0,,4.0,4.0,"Trample, hexproof",uncommon,"Hexproof, Trample",,Creature,Treefolk
1624,Covetous Geist,B,5.0,,2.0,2.0,"Flying, deathtouch\nIf Covetous Geist would be...",uncommon,"Deathtouch, Flying",,Creature,"Spirit, Rogue"
1662,Harvesttide Assailant,R,3.0,,4.0,4.0,Trample\nNightbound (If a player casts at leas...,common,"Nightbound, Trample",,Creature,Werewolf


At this point, we should have dealt with most or all of the incorrectly assigned manaValue creatures, though we are not accounting for non-mana costs.  Doing so would be significantly more complicated and goes beyond the scope of the current investigation, so I'm happy to move on to other things that need to be corrected.  At this point, we should not need the manaCost column any longer, so I'll drop it.

In [21]:
# manaCost has served its purpose as a cross-check; remove it.
creatures = creatures.drop('manaCost', axis=1)

In [22]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,subtypes
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,"Human, Pilot"
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,"Dwarf, Artificer"
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,"Dwarf, Scout"
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,"Dwarf, Scout"
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,"Dwarf, Rogue"


There is a small subset of creatures whose power and/or toughness is listed as an asterisk.  This means that those values are variable and dependent on game-state conditions.  We've only got 33 of these creatures in our dataset of almost 4,000, comprising less than 1% of the creatures.  Rather than spend an inordinate amount of time accounting for these edge cases, I choose to drop them from the dataframe.

In [23]:
creatures[creatures.power.isnull() | creatures.toughness.isnull()].head(10)

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,subtypes
351,Enigma Drake,"R, U",3.0,,4.0,Flying\nEnigma Drake's power is equal to the n...,uncommon,Flying,,Creature,Drake
443,Vile Aggregate,C,3.0,,5.0,Devoid (This card has no color.)\nVile Aggrega...,uncommon,"Devoid, Ingest, Trample",,Creature,"Eldrazi, Drone"
499,Veteran Warleader,"G, W",3.0,,,Veteran Warleader's power and toughness are ea...,rare,,,Creature,"Human, Soldier, Ally"
676,"Syr Elenora, the Discerning",U,5.0,,4.0,"Syr Elenora, the Discerning's power is equal t...",uncommon,,Legendary,Creature,"Human, Knight"
724,Beanstalk Giant,G,7.0,,,Beanstalk Giant's power and toughness are each...,uncommon,,,Creature,Giant
756,Wintermoor Commander,"B, W",2.0,2.0,,Deathtouch\nWintermoor Commander's toughness i...,uncommon,Deathtouch,,Creature,"Human, Knight"
775,Shambling Suit,C,3.0,,3.0,Shambling Suit's power is equal to the number ...,uncommon,,,"Artifact, Creature",Construct
1012,Crackling Drake,"R, U",4.0,,4.0,Flying\nCrackling Drake's power is equal to th...,uncommon,Flying,,Creature,Drake
1090,Apocalypse Demon,B,6.0,,,Flying\nApocalypse Demon's power and toughness...,rare,Flying,,Creature,Demon
1128,Majestic Myriarch,G,5.0,,,Majestic Myriarch's power and toughness are ea...,mythic,,,Creature,Chimera


In [24]:
len(creatures[creatures.power.isnull() | creatures.toughness.isnull()])

33

In [25]:
# Discard creatures with a missing power or toughness, then renumber the rows.
creatures = (
    creatures
    .dropna(subset=['power', 'toughness'])
    .reset_index(drop=True)
)

In [26]:
creatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3791 entries, 0 to 3790
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3791 non-null   object 
 1   colors      3791 non-null   object 
 2   manaValue   3791 non-null   float64
 3   power       3791 non-null   float64
 4   toughness   3791 non-null   float64
 5   text        3791 non-null   object 
 6   rarity      3791 non-null   object 
 7   keywords    3791 non-null   object 
 8   supertypes  442 non-null    object 
 9   types       3791 non-null   object 
 10  subtypes    3791 non-null   object 
dtypes: float64(3), object(8)
memory usage: 325.9+ KB


In [27]:
# Creating a copy of our cleaned creature dataset before we do categorical transformations.
# creatures.to_csv('cleaned_datasets/creatures_only.csv')

As we saw in our EDA, certain keywords like flying, haste, trample and flash can be fairly indicative of a creature's color identity.  We want to create binary columns to indicate the presence of these categorical features, so we'll use the `get_dummies` method to achieve this.

In [28]:
# One 0/1 column per keyword found in the comma-separated keywords field,
# appended to the right of the existing columns.
keyword_flags = creatures['keywords'].str.get_dummies(sep=', ')
creatures = pd.concat([creatures, keyword_flags], axis=1)

In [29]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Support,Surge,Surveil,Training,Trample,Transform,Undergrowth,Venture,Vigilance,Ward
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,0,0,0,1,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,0,0


In [30]:
creatures.shape

(3791, 95)

In [31]:
creatures.columns

Index(['name', 'colors', 'manaValue', 'power', 'toughness', 'text', 'rarity',
       'keywords', 'supertypes', 'types', 'subtypes', 'Adamant', 'Adapt',
       'Afflict', 'Afterlife', 'Alliance', 'Amass', 'Ascend', 'Blitz', 'Boast',
       'Changeling', 'Channel', 'Cohort', 'Companion', 'Connive',
       'Constellation', 'Converge', 'Convoke', 'Coven', 'Cycling', 'Daybound',
       'Deathtouch', 'Defender', 'Delirium', 'Devoid', 'Disturb',
       'Double strike', 'Embalm', 'Emerge', 'Enrage', 'Equip', 'Escape',
       'Eternalize', 'Exert', 'Exploit', 'Explore', 'Fabricate', 'Fight',
       'First strike', 'Flash', 'Flying', 'Foretell', 'Haste', 'Hexproof',
       'Hexproof from', 'Improvise', 'Indestructible', 'Ingest', 'Investigate',
       'Kicker', 'Landfall', 'Learn', 'Lifelink', 'Madness', 'Magecraft',
       'Meld', 'Menace', 'Mentor', 'Mill', 'Mutate', 'Nightbound', 'Ninjutsu',
       'Pack tactics', 'Proliferate', 'Protection', 'Prowess', 'Raid', 'Rally',
       'Reach', 'Recon

Similarly, Artifact and Enchantment types are going to end up being important to our analysis, so we'll create dummy values for those.

In [32]:
creatures.types.unique()

array(['Creature', 'Artifact, Creature', 'Enchantment, Creature'],
      dtype=object)

In [33]:
# One 0/1 column per card type found in the comma-separated types field,
# appended to the right of the existing columns.
type_flags = creatures['types'].str.get_dummies(sep=', ')
creatures = pd.concat([creatures, type_flags], axis=1)

In [34]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Training,Trample,Transform,Undergrowth,Venture,Vigilance,Ward,Artifact,Creature,Enchantment
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,1,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,1,0,0,1,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,1,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,1,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,1,0


In [35]:
creatures.shape

(3791, 98)

In [36]:
creatures.columns

Index(['name', 'colors', 'manaValue', 'power', 'toughness', 'text', 'rarity',
       'keywords', 'supertypes', 'types', 'subtypes', 'Adamant', 'Adapt',
       'Afflict', 'Afterlife', 'Alliance', 'Amass', 'Ascend', 'Blitz', 'Boast',
       'Changeling', 'Channel', 'Cohort', 'Companion', 'Connive',
       'Constellation', 'Converge', 'Convoke', 'Coven', 'Cycling', 'Daybound',
       'Deathtouch', 'Defender', 'Delirium', 'Devoid', 'Disturb',
       'Double strike', 'Embalm', 'Emerge', 'Enrage', 'Equip', 'Escape',
       'Eternalize', 'Exert', 'Exploit', 'Explore', 'Fabricate', 'Fight',
       'First strike', 'Flash', 'Flying', 'Foretell', 'Haste', 'Hexproof',
       'Hexproof from', 'Improvise', 'Indestructible', 'Ingest', 'Investigate',
       'Kicker', 'Landfall', 'Learn', 'Lifelink', 'Madness', 'Magecraft',
       'Meld', 'Menace', 'Mentor', 'Mill', 'Mutate', 'Nightbound', 'Ninjutsu',
       'Pack tactics', 'Proliferate', 'Protection', 'Prowess', 'Raid', 'Rally',
       'Reach', 'Recon

In [37]:
creatures.supertypes.unique()

array([nan, 'Legendary', 'Snow', 'Legendary, Snow'], dtype=object)

In [38]:
#creatures = pd.concat([creatures, creatures.supertypes.str.get_dummies(sep=", ")], axis=1)

In [39]:
#creatures.shape

In [40]:
creatures.subtypes.unique()

array(['Human, Pilot', 'Dwarf, Artificer', 'Dwarf, Scout', 'Dwarf, Rogue',
       'Dwarf, Soldier', 'Bird', 'Dwarf, Warrior', 'Angel', 'Cat, Beast',
       'Dwarf, Advisor', 'Vedalken, Artificer', 'Whale', 'Human, Wizard',
       'Drake', 'Human, Artificer', 'Crab', 'Vedalken, Rogue', 'Fish',
       'Human, Pirate', 'Aetherborn, Rogue', 'Aetherborn, Artificer',
       'Insect', 'Human, Rogue', 'Aetherborn, Vampire', 'Demon',
       'Aetherborn, Warrior', 'Human, Warrior', 'Giant', 'Dragon',
       'Lizard', 'Gremlin', 'Elf, Artificer, Druid', 'Cat', 'Elf, Druid',
       'Elephant', 'Elf, Warrior', 'Human, Druid', 'Pangolin, Beast',
       'Cat, Monkey', 'Elf, Archer', 'Boar', 'Dwarf, Pilot',
       'Human, Soldier', 'Snake', 'Construct', 'Juggernaut',
       'Assembly-Worker', 'Thopter', 'Shapeshifter', 'Elf, Soldier',
       'Golem', 'Pegasus', 'Dog', 'Unicorn', 'Gargoyle', 'Human, Cleric',
       'Orc, Knight', 'Human, Monk', 'Jellyfish', 'Dwarf, Ranger',
       'Spirit, Knight', 'Hu

In [41]:
# One 0/1 column per subtype found in the comma-separated subtypes field,
# appended to the right of the existing columns.
subtype_flags = creatures['subtypes'].str.get_dummies(sep=', ')
creatures = pd.concat([creatures, subtype_flags], axis=1)

In [42]:
creatures.shape

(3791, 289)

In [43]:
creatures.head()

Unnamed: 0,name,colors,manaValue,power,toughness,text,rarity,keywords,supertypes,types,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
0,Aeronaut Admiral,W,4.0,3.0,1.0,Flying\nVehicles you control have flying.,uncommon,Flying,,Creature,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,Vigilance\nWhen Aether Inspector enters the ba...,common,Vigilance,,Creature,...,0,0,0,0,0,0,0,0,0,0
2,Aethergeode Miner,W,2.0,3.0,1.0,"Whenever Aethergeode Miner attacks, you get {E...",rare,,,Creature,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,Flying\nRevolt — When Airdrop Aeronauts enters...,uncommon,"Flying, Revolt",,Creature,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,Audacious Infiltrator can't be blocked by arti...,common,,,Creature,...,0,0,0,0,0,0,0,0,0,0


In [44]:
#creatures = pd.concat([creatures, creatures.colors.str.get_dummies(sep=", ")], axis=1)

In [45]:
#creatures.shape

In [46]:
#creatures.colors.unique()

In [47]:
# Columns removed for the modelling frame: the raw text fields that the dummy
# columns were derived from, rarity, supertypes, and the 'Creature' flag.
columns_to_remove = [
    'rarity', 'keywords', 'text', 'supertypes', 'types', 'subtypes', 'Creature',
]
creatures_transformed = creatures.drop(columns_to_remove, axis=1)

In [48]:
creatures_transformed.head()

Unnamed: 0,name,colors,manaValue,power,toughness,Adamant,Adapt,Afflict,Afterlife,Alliance,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
0,Aeronaut Admiral,W,4.0,3.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aether Inspector,W,4.0,2.0,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aethergeode Miner,W,2.0,3.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Airdrop Aeronauts,W,5.0,4.0,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Audacious Infiltrator,W,2.0,3.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
creatures_transformed.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3791 entries, 0 to 3790
Data columns (total 282 columns):
 #    Column           Dtype  
---   ------           -----  
 0    name             object 
 1    colors           object 
 2    manaValue        float64
 3    power            float64
 4    toughness        float64
 5    Adamant          int64  
 6    Adapt            int64  
 7    Afflict          int64  
 8    Afterlife        int64  
 9    Alliance         int64  
 10   Amass            int64  
 11   Ascend           int64  
 12   Blitz            int64  
 13   Boast            int64  
 14   Changeling       int64  
 15   Channel          int64  
 16   Cohort           int64  
 17   Companion        int64  
 18   Connive          int64  
 19   Constellation    int64  
 20   Converge         int64  
 21   Convoke          int64  
 22   Coven            int64  
 23   Cycling          int64  
 24   Daybound         int64  
 25   Deathtouch       int64  
 26   Defender         i

In [50]:
creatures_transformed.describe()

Unnamed: 0,manaValue,power,toughness,Adamant,Adapt,Afflict,Afterlife,Alliance,Amass,Ascend,...,Weird,Werewolf,Whale,Wizard,Wolf,Wolverine,Worm,Wurm,Yeti,Zombie
count,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,...,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0
mean,3.432867,2.682933,2.906621,0.001846,0.003165,0.002374,0.002638,0.002638,0.002374,0.003693,...,0.000528,0.027697,0.001319,0.063308,0.010288,0.000528,0.000264,0.004748,0.000264,0.045634
std,1.59076,1.585795,1.590949,0.042937,0.05618,0.048673,0.051299,0.051299,0.048673,0.060665,...,0.022966,0.164125,0.036298,0.243548,0.100918,0.022966,0.016241,0.068752,0.016241,0.208718
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.0,13.0,17.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
# Save a first transformed CSV before reducing the dataset to mono-colored creatures only.
creatures_transformed.to_csv('cleaned_datasets/creatures_transformed.csv')

For our initial attempts at training models to identify a creature's color via its mana value, power, toughness and keywords, it seems better to reduce the dataset to only include mono-colored creatures. Including every possible combination of multi-colored creature would require a lot more work, but is something to consider for the future.

In [52]:
creatures_transformed.colors.unique()

array(['W', 'U', 'B', 'R', 'G', 'R, U', 'G, R', 'G, W', 'R, W', 'G, U',
       'U, W', 'B, R', 'B, G', 'C', 'B, W', 'B, U', 'G, R, W',
       'B, G, R, U, W', 'B, G, R', 'R, U, W', 'B, G, U', 'W, G',
       'G, R, U', 'B, G, W', 'B, R, W', 'G, U, W', 'B, U, W', 'B, R, U',
       'G, R, U, W'], dtype=object)

In [53]:
# Colors are stored as a ", "-separated string, so the number of colors is the
# number of separators plus one.
creatures_transformed['numColors'] = creatures_transformed['colors'].str.count(', ') + 1

In [54]:
# Independent copy of the rows that list exactly one entry in colors.
mono_creatures = creatures_transformed[creatures_transformed['numColors'] == 1].copy()

In [55]:
mono_creatures.colors.unique()

array(['W', 'U', 'B', 'R', 'G', 'C'], dtype=object)

In [56]:
# The helper column was only needed for filtering; remove it again.
mono_creatures = mono_creatures.drop(columns=['numColors'])

In [57]:
mono_creatures.shape

(3244, 282)

In [59]:
mono_creatures.to_csv('cleaned_datasets/mono_creatures.csv')