# Step 1: Load Data Set

In [1]:
import pandas as pd

# Parse the bulk card file into a DataFrame
cards_path = 'bulk_data_default_cards.json'
card_df = pd.read_json(cards_path)

# Preview the first ten rows of the dataset
card_df.head(n=10)

Unnamed: 0,object,id,oracle_id,multiverse_ids,mtgo_id,mtgo_foil_id,tcgplayer_id,cardmarket_id,name,lang,...,tcgplayer_etched_id,attraction_lights,color_indicator,life_modifier,hand_modifier,printed_type_line,printed_text,content_warning,flavor_name,variation_of
0,card,0000579f-7b35-4ed3-b44c-db2a538066fe,44623693-51d6-49ad-8cd7-140505caf02f,[109722],25527.0,25528.0,14240.0,13850.0,Fury Sliver,en,...,,,,,,,,,,
1,card,00006596-1166-4a79-8443-ca9f82e6db4e,8ae3562f-28b7-4462-96ed-be0cf7052ccc,[189637],34586.0,34587.0,33347.0,21851.0,Kor Outfitter,en,...,,,,,,,,,,
2,card,0000a54c-a511-4925-92dc-01b937f9afad,dc4e2134-f0c2-49aa-9ea3-ebf83af1445c,[],,,98659.0,,Spirit,en,...,,,,,,,,,,
3,card,0000cd57-91fe-411f-b798-646e965eec37,9f0d82ae-38bf-45d8-8cda-982b6ead1d72,[435231],65170.0,65171.0,145764.0,301766.0,Siren Lookout,en,...,,,,,,,,,,
4,card,00012bd8-ed68-4978-a22d-f450c8a6e048,5aa12aff-db3c-4be5-822b-3afdf536b33e,[1278],,,1623.0,5664.0,Web,en,...,,,,,,,,,,
5,card,0001f1ef-b957-4a55-b47f-14839cdbab6f,ef027846-be81-4959-a6b5-56bd01b1e68a,[472997],78170.0,,198861.0,400134.0,Venerable Knight,en,...,,,,,,,,,,
6,card,00020b05-ecb9-4603-8cc1-8cfa7a14befc,d96ac790-428b-4a64-8dbd-6baa73eb6210,[394089],,,95585.0,272052.0,Wildcall,en,...,,,,,,,,,,
7,card,0002ab72-834b-4c81-82b1-0d2760ea96b0,645b5784-a6f7-4cf3-966a-e1a51420b96b,[488632],81171.0,,215418.0,467859.0,Mystic Skyfish,en,...,,,,,,,,,,
8,card,00030770-5e99-4943-819d-8d807c24cc14,56719f6a-1a6c-4c0a-8d21-18f7d7350b68,[489643],,,215984.0,472559.0,Swamp,en,...,,,,,,,,,,
9,card,000366c8-7a43-49d7-a103-ac5bd7efd9aa,56719f6a-1a6c-4c0a-8d21-18f7d7350b68,[4925],9701.0,9702.0,18500.0,9070.0,Swamp,en,...,,,,,,,,,,


In [2]:
# Size of the freshly loaded frame as (rows, columns)
card_df.shape

(74412, 84)

In [3]:
# Keep only the rows whose 'lang' value is 'en' (cards printed in English)
is_english = card_df['lang'].eq('en')
card_df = card_df[is_english]

card_df.shape

(72168, 84)

In [4]:
# We limit ourselves to commander-legal cards only.
# The per-card commander status is pulled into its own Series instead of
# overwriting the 'legalities' column: the column keeps its original entries,
# so this cell can be re-run, and nothing is assigned onto a filtered frame.
commander_status = card_df['legalities'].apply(lambda x: x['commander'])

card_df = card_df.loc[commander_status == 'legal']

card_df.shape

(66419, 84)

In [6]:
# Remove double-faced cards and other nonsense: their 'colors' column contains
# NaN, as their color is split between two card faces.
# A vectorised null check replaces the row-by-row iterrows() scan and the
# in-place drop; rows with a missing 'colors' value are filtered out.
card_df = card_df.loc[card_df['colors'].notna()]

card_df.shape

In [5]:
# Reduce dataset to important attributes only.
# .copy() makes the result an independent frame, so the color columns added
# in later cells are written to it directly rather than to a slice of the
# wide frame (which pandas flags with SettingWithCopyWarning).
card_df = card_df[['name', 'oracle_text', 'colors', 'color_identity']].copy()

card_df.head()

Unnamed: 0,name,oracle_text,colors,color_identity
0,Fury Sliver,All Sliver creatures have double strike.,[R],[R]
1,Kor Outfitter,"When Kor Outfitter enters the battlefield, you...",[W],[W]
3,Siren Lookout,Flying\nWhen Siren Lookout enters the battlefi...,[U],[U]
4,Web,Enchant creature (Target a creature as you cas...,[G],[G]
5,Venerable Knight,"When Venerable Knight dies, put a +1/+1 counte...",[W],[W]
6,Wildcall,"Manifest the top card of your library, then pu...",[G],[G]
7,Mystic Skyfish,"Whenever you draw your second card each turn, ...",[U],[U]
8,Swamp,({T}: Add {B}.),[],[B]
9,Swamp,({T}: Add {B}.),[],[B]
10,Battlewing Mystic,Kicker {R} (You may pay an additional {R} as y...,[U],"[R, U]"


In [7]:
# Make colors a one-hot-encoded attribute: one boolean column per entry below.
# Every flag starts out as False and is filled in by a later cell.
colors = ['W', 'U', 'B', 'R', 'G', 'C']

# assign() returns a new frame with the columns appended in list order, which
# avoids writing columns one by one onto a frame that came from a selection.
card_df = card_df.assign(**{color: False for color in colors})

card_df.head()

Unnamed: 0,name,oracle_text,colors,color_identity,W,U,B,R,G,C
0,Fury Sliver,All Sliver creatures have double strike.,[R],[R],False,False,False,False,False,False
1,Kor Outfitter,"When Kor Outfitter enters the battlefield, you...",[W],[W],False,False,False,False,False,False
3,Siren Lookout,Flying\nWhen Siren Lookout enters the battlefi...,[U],[U],False,False,False,False,False,False
4,Web,Enchant creature (Target a creature as you cas...,[G],[G],False,False,False,False,False,False
5,Venerable Knight,"When Venerable Knight dies, put a +1/+1 counte...",[W],[W],False,False,False,False,False,False


In [24]:
def _has_color(card_colors, color):
    """Return whether a card's color list sets the flag for `color`.

    An empty list sets the 'C' flag; in every other case the flag is a plain
    membership test of `color` in `card_colors`.
    """
    if color == 'C' and not card_colors:
        return True
    return color in card_colors


# Populate each color column from the card's 'colors' list
for color in colors:
    card_df[color] = card_df['colors'].apply(_has_color, args=(color,))

card_df.head()

Unnamed: 0,name,oracle_text,colors,color_identity,W,U,B,R,G,C
0,Fury Sliver,All Sliver creatures have double strike.,[R],[R],False,False,False,True,False,False
1,Kor Outfitter,"When Kor Outfitter enters the battlefield, you...",[W],[W],True,False,False,False,False,False
3,Siren Lookout,Flying\nWhen Siren Lookout enters the battlefi...,[U],[U],False,True,False,False,False,False
4,Web,Enchant creature (Target a creature as you cas...,[G],[G],False,False,False,False,True,False
5,Venerable Knight,"When Venerable Knight dies, put a +1/+1 counte...",[W],[W],True,False,False,False,False,False
6,Wildcall,"Manifest the top card of your library, then pu...",[G],[G],False,False,False,False,True,False
7,Mystic Skyfish,"Whenever you draw your second card each turn, ...",[U],[U],False,True,False,False,False,False
8,Swamp,({T}: Add {B}.),[],[B],False,False,False,False,False,True
9,Swamp,({T}: Add {B}.),[],[B],False,False,False,False,False,True
10,Battlewing Mystic,Kicker {R} (You may pay an additional {R} as y...,[U],"[R, U]",False,True,False,False,False,False


# Step 2: Create Test, Train, Eval Split

#### Color vs. Color Identity


The color identity of a card is the combination of all colors in its mana cost, any color indicator or color-setting characteristic-defining abilities on the card and any mana symbols in the card's rules text. When determining a card's color identity, any mana symbols in the card's reminder text are ignored.  
Example: Basic Lands do not have a color, but their color identity is the color of whatever mana they produce. Swamps are colorless cards, but produce black mana so their color identity is black.


**Thought:** ***How will Lands influence the model?***