In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.neural_network as sk_nn
import tensorflow as tf
from collections import Counter
import json
import pickle

In [5]:
color_dict_long = { # We want to make colorless its own color. This labelling system makes all color combos distinct, which will be more difficult for the classifier
    (): 0, # Colorless
    ("W",): 1, # White
    ("U",): 2, # Blue
    ("B",): 3, # Black
    ("R",): 4, # Red
    ("G",): 5, # Green
    ("G", "W"): 6, # Selesnya
    ("U", "W"): 7, # Azorius
    ("B", "U"): 8, # Dimir
    ("B", "R"): 9, # Rakdos
    ("G", "R"): 10, # Gruul
    ("B", "G"): 11, # Golgari
    ("B", "W"): 12, # Orzhov
    ("G", "U"): 13, # Simic
    ("R", "U"): 14, # Izzet
    ("R", "W"): 15, # Boros
    ("B", "G", "R"): 16,  # Jund
    ("B", "G", "U"): 17,  # Sultai
    ("B", "G", "W"): 18,  # Abzan
    ("B", "R", "U"): 19,  # Grixis
    ("B", "R", "W"): 20,  # Mardu
    ("B", "U", "W"): 21,  # Esper
    ("G", "R", "U"): 22,  # Temur
    ("G", "R", "W"): 23,  # Naya
    ("G", "U", "W"): 24,  # Bant
    ("R", "U", "W"): 25,  # Jeskai
    ("B", "G", "R", "U"): 26, # Whiteless (Yidris)
    ("B", "G", "R", "W"): 27, # Blueless (Saskia)
    ("B", "G", "U", "W"): 28, # Redless (Atraxa)
    ("B", "R", "U", "W"): 29, # Greenless (Breya)
    ("G", "R", "U", "W"): 30, # Blackless (Aragorn)
    ("B", "G", "R", "U", "W"): 31 # 5-Color
}

color_dict_short = {
    "": 0, # Colorless
    "W": 1, # White
    "U": 2, # Blue
    "B": 3, # Black
    "R": 4, # Red
    "G": 5, # Green
}

In [59]:
with open("./common_types.txt", "rb") as f:
       common_types = pickle.load(f)

cols_include_all = ['colorIdentity', 'colors', 'firstPrinting', 'keywords', 'manaCost', 'manaValue',
                     'subtypes', 'supertypes', 'text', 'type', 'types', 'power', 'toughness',  'colorIndicator',
                     'name', 'hasAlternativeDeckLimit']
cols_include_noncreature = ['loyalty',]
cols_edhrec = ["edhrecRank", "edhrecSaltiness"]
cols_legality = ['legalities.commander', 'legalities.duel', 'legalities.explorer',
       'legalities.historic', 'legalities.historicbrawl', 'legalities.legacy',
       'legalities.modern', 'legalities.oathbreaker', 'legalities.pauper',
       'legalities.paupercommander', 'legalities.penny', 'legalities.pioneer',
       'legalities.vintage', 'legalities.gladiator','legalities.alchemy',
       'legalities.brawl', 'legalities.future', 'legalities.standard', 'legalities.predh',
       'legalities.premodern', 'legalities.oldschool',]
cols_leadership = ['leadershipSkills.brawl',
       'leadershipSkills.commander', 'leadershipSkills.oathbreaker',]

def to_feature_name(s: str, typ: bool = False) -> str:
    if typ:
        return "f_ct_" + s.lower().replace(" ", "_")
    return "f_kw_" + s.lower().replace(" ", "_")

def get_kw_list(filename: str):
    """Get list of keywords from file"""
    with open(filename+".json") as f:
        json_data = json.load(f)
    data = json_data["data"]
    ability_words = data["abilityWords"]
    kw_abilities = data["keywordAbilities"]
    kw_actions = data["keywordActions"]
    all_kws = ability_words + kw_abilities + kw_actions
    return all_kws, ability_words, kw_abilities, kw_actions

def make_types_list(df: pd.DataFrame, n: int) -> None:
    """From a complete dataset, write a list of the 200 most common creature types to a file called common_types.txt"""
    all_types = []
    df["subtypes"].apply(all_types.extend)
    all_types = Counter(all_types)
    common_types = [x for x,y in all_types.most_common(n)]
    with open("./common_types.txt", "wb") as f:
        pickle.dump(common_types, f)

# make_types_list(df, 50)

# all_kws, _, _, _ = get_kw_list("../data/mtg/Keywords")
all_kws, _, _, _ = get_kw_list("./short_keywords")




In [61]:
def load_atomic(filename: str) -> pd.DataFrame:
    """Load from the Atomic standard files into a dataframe resembling the old data standard."""
    with open("../data/mtg/"+filename+".json") as f:
        json_data = json.load(f)  # Load from file
    json_data = json_data["data"]
    cards = [x[0] for x in json_data.values() if len(x) == 1] # Pull only cards with 1 face (no transform, fuse, split, flip cards, sorry Delver)
    df = pd.json_normalize(cards)
    return df

def prep_df(df: pd.DataFrame, monocolor: bool, creatures: bool, modern: bool) -> pd.DataFrame:
    """
    Preprocesses card DF
    @param df: Input DataFrame
    @param monocolor: If true, return only cards with 1 or less color
    @param creatures: If true, return only creatures
    @param modern: If true, filter by modern legality (excludes Uro :'( )
    """
    df["num_colors"] = df["colorIdentity"].map(len)
    if creatures:
        df = df.loc[df['type'].str.contains('Creature')]
    if monocolor:
        df = df.loc[df['num_colors'] <= 1]
    if modern:
        df = df.loc[df["legalities.modern"] == "Legal"]
    df = df[cols_include_all]
    df["f_is_artifact"] = df["supertypes"].apply(lambda x: 1 if "Artifact" in x else 0)
    df["f_is_enchantment"] = df["supertypes"].apply(lambda x: 1 if "Enchantment" in x else 0)
    df["f_cmc"] = (df["manaValue"] / 7.5) - 1  # [0,15] -> [-1, 1]
    df['f_pow'] = df['power'].replace({"1+*": 1, "*": "0", "*+1": 1}) # Assume all *'s are 0 (as per the rules)
    df['f_pow'] = ((df['f_pow'].astype(int) + 1) / 9) - 1 # [-1,16] -> [-1, 1]
    df['f_tough'] = df['toughness'].replace({"1+*": 1, "*": "0", "*+1": 1}) # Assume all *'s are 0 (as per the rules)
    df['f_tough'] = ((df['f_tough'].astype(int) + 1) / 9) - 1 # [-1,16] -> [-1, 1]

    df["label_identity"] = df["colorIdentity"].apply(lambda x: color_dict_long[tuple(x)]) # Could use 'colors' instead, but Kenrith should be classified as a 5C card, and Tasigur as Sultai.
    df["label_white"] = df["colorIdentity"].apply(lambda x: 1 if "W" in x else 0)
    df["label_blue"] = df["colorIdentity"].apply(lambda x: 1 if "U" in x else 0)
    df["label_black"] = df["colorIdentity"].apply(lambda x: 1 if "B" in x else 0)
    df["label_red"] = df["colorIdentity"].apply(lambda x: 1 if "R" in x else 0)
    df["label_green"] = df["colorIdentity"].apply(lambda x: 1 if "G" in x else 0)
    df["label_colorless"] = df["colorIdentity"].apply(lambda x: 1 if len(x) == 0 else 0)

    # Binary columns for types and keywords
    for kw in all_kws:
        feature = to_feature_name(kw)
        df[feature] = df["text"].str.contains(kw, case=False, na=0).astype(int)  # This works, but Death's Shadow counts as a creature with Shadow. Could look into using reminder text?
    for typ in common_types:
        feature = to_feature_name(typ, True)
        df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)
    df = df.set_index("name")
    return df

df = prep_df(load_atomic("ModernAtomic"), monocolor=True, creatures=True, modern=True)




  df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)
  df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)


In [82]:
# SKLearn MLP on mono-colored creatures
sk_clf = sk_nn.MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 6), max_iter = 20000, verbose= True)

features = [x for x in list(df.columns) if x.startswith("f_")]
labels = [x for x in list(df.columns) if x.startswith("label_")]

ml_df = df[features + labels]

sk_scaler = sk.preprocessing.StandardScaler().fit(ml_df)

train, test = sk.model_selection.train_test_split(ml_df, test_size = 0.15)
train_features, train_labels = train[features], train[labels]
test_features, test_labels = test[features], test[labels]


5    1334
1    1286
3    1275
4    1197
2    1131
0     384
Name: label_identity, dtype: int64

In [83]:
sk_clf.fit(train_features, train_labels["label_identity"])

Iteration 1, loss = 1.77405959
Iteration 2, loss = 1.76346578
Iteration 3, loss = 1.75443874
Iteration 4, loss = 1.74729801
Iteration 5, loss = 1.74135065
Iteration 6, loss = 1.73608441
Iteration 7, loss = 1.73168572
Iteration 8, loss = 1.72762597
Iteration 9, loss = 1.72405248
Iteration 10, loss = 1.72066738
Iteration 11, loss = 1.71759387
Iteration 12, loss = 1.71467044
Iteration 13, loss = 1.71199366
Iteration 14, loss = 1.70954449
Iteration 15, loss = 1.70709184
Iteration 16, loss = 1.70463146
Iteration 17, loss = 1.70213973
Iteration 18, loss = 1.69961360
Iteration 19, loss = 1.69711150
Iteration 20, loss = 1.69459598
Iteration 21, loss = 1.69200684
Iteration 22, loss = 1.68928240
Iteration 23, loss = 1.68664537
Iteration 24, loss = 1.68388133
Iteration 25, loss = 1.68097745
Iteration 26, loss = 1.67799631
Iteration 27, loss = 1.67499286
Iteration 28, loss = 1.67194259
Iteration 29, loss = 1.66871439
Iteration 30, loss = 1.66531299
Iteration 31, loss = 1.66181611
Iteration 32, los

In [84]:
sk_clf.score(test_features, test_labels["label_identity"])

0.6392459297343616

MAIN GOAL: Determine a creature's color identity based on: (number of features)
- CMC (1)
- Power (1)
- Toughness (1)
- Artifact / Enchantment Supertype (2)
- Type (boolean cols for each of the top 200 tribes) (200)
- Keywords (see keywords.json and list of evergreen keywords on https://mtg.fandom.com/wiki/Evergreen) (20-200)
- Name? (Would need a way to break this down (https://web.stanford.edu/group/pdplab/pdphandbook/handbookch8.html))


In [80]:


evergreen_keywords = ["Activate", "Attach", "Cast", "Counter", "Create", "Destroy", "Discard", "Exchange", "Exile", "Fight",
                       "Mill", "Play", "Reveal", "Sacrifice", "Scry", "Search", "Shuffle", "Tap", "Untap"]
my_common_words = ["Enchantment", "Artifact", "+1/+1", "Token", "Draw" "Land", "Nonland", "Spell", "Creature",]
evergreen_abilities = ["Deathtouch", "Defender", "Double Strike", "Enchant", "Equip", "First Strike", "Flash", "Flying",
                        "Haste", "Hexproof", "Indestructible", "Lifelink", "Menace", "Protection", "Reach", "Trample",
                          "Vigilance", "Ward", "Regenerate", "Shroud", "Intimidate", "Prowess"]
all_keywords = evergreen_keywords + my_common_words + evergreen_abilities

# Issue #1: Multi-faced cards from the Atomic dataset.
The more robust atmoic dataset contains split entries for DFC's, fuse cards, etc. How do we count these cards?
A. Remove them from the dataset.
    # By far the easiest approach.
B. Look at just the front.
    # Cleanest, will cause some outliers, namely on meld/fuse/transform cards
C. Add them as an additional row.
    # More accurate, but will likely be outliers
D. Add extra columns
    # Most accurate, but will mess with any ML algo if not weighted properly.

# Issue #2: Keywords Overlap with Creature Names (Death's Shadow, Flying Men)
This is mostly fine. For one thing, many of these cards match color identity with their mechanic (Flying men are blue, DS is black, etc.)
We could look into some kind of way to differentiate based on regex or pattern matching, but let's leave that for now.   

# Issue 3: The word Counter
So, despite being able to name things whatever they want, and repeated oracle changes to simplify and clarify wording, MTG still uses the word 'counter' to mean two different things: a keyword action meaning "Remove this spell or ability from the stack", and a board object placed on permanents i.e. +1/+1 counters, loyalty counters. Countering things (first interpretation) is a blue-coded mechanic, while counters (second interpretation) are a fairly universal mechanic, maybe leaning white and green but with no real identity. Again, we could look into differentiating these by string matching ("counter target" vs "+1/+1 counter"), but since counters don't really have an identity, we're changing "counter" in keyword abilities to "counter target". There are also way too many strings about the other kind of counter to simply process (counter vs counters, etc.). This does remove Baral, the bluest creature ever, from the counter keyword column, but whatever.

# Issue 4: Parsing Card Names
This might be an entirely different ML task. There are packages to determine semantic vectors of words. Use the names of each card (sum of all word vectors) as a feature set. A card like Death's Shadow would be easy peasy. Brushwagg, on the other hand, maybe not. Proper names like Olivia Voldaren, or worse, Drizz't Dourden, would be all but impossible.

# Issue 5: Improving Performance
Current accuracy for testing dataset sits at about 64%- not great for a binary classifier, but pretty good for a classifier with 6 classes (chance rate of 17%). Let's try running a binary classifier on enemy-colors cards, say blue and green (probably the most disparate colors in terms of creatures).

In [94]:
simic_df = df.loc[df["colorIdentity"].isin([["G"], ["U"]])]
simic_df.head(10)

# SKLearn MLP on U and G creatures
sk_clf = sk_nn.MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 6), max_iter = 10000, verbose= True)

features = [x for x in list(simic_df.columns) if x.startswith("f_")]
labels = [x for x in list(simic_df.columns) if x.startswith("label_")]

ml_df = simic_df[features + labels]

sk_scaler = sk.preprocessing.StandardScaler().fit(ml_df)

train, test = sk.model_selection.train_test_split(ml_df, test_size = 0.15)
train_features, train_labels = train[features], train[labels]
test_features, test_labels = test[features], test[labels]

sk_clf.fit(train_features, train_labels["label_identity"])

Iteration 1, loss = 0.72561082
Iteration 2, loss = 0.72075372
Iteration 3, loss = 0.71520901
Iteration 4, loss = 0.71000193
Iteration 5, loss = 0.70551453
Iteration 6, loss = 0.70166607
Iteration 7, loss = 0.69846347
Iteration 8, loss = 0.69585517
Iteration 9, loss = 0.69333768
Iteration 10, loss = 0.69129368
Iteration 11, loss = 0.68932792
Iteration 12, loss = 0.68754364
Iteration 13, loss = 0.68600142
Iteration 14, loss = 0.68448579
Iteration 15, loss = 0.68302996
Iteration 16, loss = 0.68163238
Iteration 17, loss = 0.68022221
Iteration 18, loss = 0.67886135
Iteration 19, loss = 0.67753846
Iteration 20, loss = 0.67620866
Iteration 21, loss = 0.67487575
Iteration 22, loss = 0.67357118
Iteration 23, loss = 0.67228299
Iteration 24, loss = 0.67099756
Iteration 25, loss = 0.66966173
Iteration 26, loss = 0.66836876
Iteration 27, loss = 0.66702959
Iteration 28, loss = 0.66570774
Iteration 29, loss = 0.66438754
Iteration 30, loss = 0.66303815
Iteration 31, loss = 0.66169114
Iteration 32, los

In [95]:
sk_clf.score(test_features, test_labels["label_identity"])

# Hell yeah, brother.

0.8498845265588915