In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.neural_network as sk_nn
import sklearn.tree as sk_tree
from sklearn.decomposition import PCA
import tensorflow as tf
from collections import Counter
import json
import pickle

In [2]:
color_dict_long = { # We want to make colorless its own color. This labelling system makes all color combos distinct, which will be more difficult for the classifier
    (): 0, # Colorless
    ("W",): 1, # White
    ("U",): 2, # Blue
    ("B",): 3, # Black
    ("R",): 4, # Red
    ("G",): 5, # Green
    ("G", "W"): 6, # Selesnya
    ("U", "W"): 7, # Azorius
    ("B", "U"): 8, # Dimir
    ("B", "R"): 9, # Rakdos
    ("G", "R"): 10, # Gruul
    ("B", "G"): 11, # Golgari
    ("B", "W"): 12, # Orzhov
    ("G", "U"): 13, # Simic
    ("R", "U"): 14, # Izzet
    ("R", "W"): 15, # Boros
    ("B", "G", "R"): 16,  # Jund
    ("B", "G", "U"): 17,  # Sultai
    ("B", "G", "W"): 18,  # Abzan
    ("B", "R", "U"): 19,  # Grixis
    ("B", "R", "W"): 20,  # Mardu
    ("B", "U", "W"): 21,  # Esper
    ("G", "R", "U"): 22,  # Temur
    ("G", "R", "W"): 23,  # Naya
    ("G", "U", "W"): 24,  # Bant
    ("R", "U", "W"): 25,  # Jeskai
    ("B", "G", "R", "U"): 26, # Whiteless (Yidris)
    ("B", "G", "R", "W"): 27, # Blueless (Saskia)
    ("B", "G", "U", "W"): 28, # Redless (Atraxa)
    ("B", "R", "U", "W"): 29, # Greenless (Breya)
    ("G", "R", "U", "W"): 30, # Blackless (Aragorn)
    ("B", "G", "R", "U", "W"): 31 # 5-Color
}

color_dict_short = {
    "": 0, # Colorless
    "W": 1, # White
    "U": 2, # Blue
    "B": 3, # Black
    "R": 4, # Red
    "G": 5, # Green
}

In [3]:
with open("./common_types.txt", "rb") as f:
       common_types = pickle.load(f)

cols_include_all = ['colorIdentity', 'colors', 'firstPrinting', 'keywords', 'manaCost', 'manaValue',
                     'subtypes', 'supertypes', 'text', 'type', 'types', 'power', 'toughness',  'colorIndicator',
                     'name', 'hasAlternativeDeckLimit']
cols_include_noncreature = ['loyalty',]
cols_edhrec = ["edhrecRank", "edhrecSaltiness"]
cols_legality = ['legalities.commander', 'legalities.duel', 'legalities.explorer',
       'legalities.historic', 'legalities.historicbrawl', 'legalities.legacy',
       'legalities.modern', 'legalities.oathbreaker', 'legalities.pauper',
       'legalities.paupercommander', 'legalities.penny', 'legalities.pioneer',
       'legalities.vintage', 'legalities.gladiator','legalities.alchemy',
       'legalities.brawl', 'legalities.future', 'legalities.standard', 'legalities.predh',
       'legalities.premodern', 'legalities.oldschool',]
cols_leadership = ['leadershipSkills.brawl',
       'leadershipSkills.commander', 'leadershipSkills.oathbreaker',]

def to_feature_name(s: str, typ: bool = False) -> str:
    if typ:
        return "f_ct_" + s.lower().replace(" ", "_")
    return "f_kw_" + s.lower().replace(" ", "_")

def get_kw_list(filename: str):
    """Get list of keywords from file"""
    with open(filename+".json") as f:
        json_data = json.load(f)
    data = json_data["data"]
    ability_words = data["abilityWords"]
    kw_abilities = data["keywordAbilities"]
    kw_actions = data["keywordActions"]
    all_kws = ability_words + kw_abilities + kw_actions
    return all_kws, ability_words, kw_abilities, kw_actions

def make_types_list(df: pd.DataFrame, n: int) -> None:
    """From a complete dataset, write a list of the 200 most common creature types to a file called common_types.txt"""
    all_types = []
    df["subtypes"].apply(all_types.extend)
    all_types = Counter(all_types)
    common_types = [x for x,y in all_types.most_common(n)]
    with open("./common_types.txt", "wb") as f:
        pickle.dump(common_types, f)

# make_types_list(df, 50)

# all_kws, _, _, _ = get_kw_list("../data/mtg/Keywords")
all_kws, _, _, _ = get_kw_list("./short_keywords")




In [48]:
def load_atomic(filename: str) -> pd.DataFrame:
    """Load from the Atomic standard files into a dataframe resembling the old data standard."""
    with open("../data/mtg/"+filename+".json") as f:
        json_data = json.load(f)  # Load from file
    json_data = json_data["data"]
    cards = [x[0] for x in json_data.values() if len(x) == 1] # Pull only cards with 1 face (no transform, fuse, split, flip cards, sorry Delver)
    df = pd.json_normalize(cards)
    return df

def prep_df(df: pd.DataFrame, monocolor: bool, creatures: bool, modern: bool, small: bool = False) -> pd.DataFrame:
    """
    Preprocesses card DF
    @param df: Input DataFrame
    @param monocolor: If true, return only cards with 1 or less color
    @param creatures: If true, return only creatures
    @param modern: If true, filter by modern legality (excludes Uro :'( )
    """
    df["num_colors"] = df["colorIdentity"].map(len)
    if creatures:
        df = df.loc[df['type'].str.contains('Creature')]
    if monocolor:
        df = df.loc[df['num_colors'] <= 1]
    if modern:
        df = df.loc[df["legalities.modern"] == "Legal"]
    df = df[cols_include_all]
    df["f_is_artifact"] = df["supertypes"].apply(lambda x: 1 if "Artifact" in x else 0)
    df["f_is_enchantment"] = df["supertypes"].apply(lambda x: 1 if "Enchantment" in x else 0)
    df["f_cmc"] = (df["manaValue"] / 7.5) - 1  # [0,15] -> [-1, 1]
    df['f_pow'] = df['power'].replace({"1+*": 1, "*": "0", "*+1": 1}) # Assume all *'s are 0 (as per the rules)
    df['f_pow'] = ((df['f_pow'].astype(int) + 1) / 9) - 1 # [-1,16] -> [-1, 1]
    df['f_tough'] = df['toughness'].replace({"1+*": 1, "*": "0", "*+1": 1}) # Assume all *'s are 0 (as per the rules)
    df['f_tough'] = ((df['f_tough'].astype(int) + 1) / 9) - 1 # [-1,16] -> [-1, 1]

    df["label_identity"] = df["colorIdentity"].apply(lambda x: color_dict_long[tuple(x)]) # Could use 'colors' instead, but Kenrith should be classified as a 5C card, and Tasigur as Sultai.
    df["label_white"] = df["colorIdentity"].apply(lambda x: 1 if "W" in x else 0)
    df["label_blue"] = df["colorIdentity"].apply(lambda x: 1 if "U" in x else 0)
    df["label_black"] = df["colorIdentity"].apply(lambda x: 1 if "B" in x else 0)
    df["label_red"] = df["colorIdentity"].apply(lambda x: 1 if "R" in x else 0)
    df["label_green"] = df["colorIdentity"].apply(lambda x: 1 if "G" in x else 0)
    df["label_colorless"] = df["colorIdentity"].apply(lambda x: 1 if len(x) == 0 else 0)

    # Binary columns for types and keywords
    for kw in all_kws:
        feature = to_feature_name(kw)
        df[feature] = df["text"].str.contains(kw, case=False, na=0).astype(int)  # This works, but Death's Shadow counts as a creature with Shadow. Could look into using reminder text?
    for typ in common_types:
        feature = to_feature_name(typ, True)
        df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)
    df = df.set_index("name")
    return df

df = prep_df(load_atomic("ModernAtomic"), monocolor=True, creatures=True, modern=True)




  df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)
  df[feature] = df["subtypes"].apply(lambda x: 1 if typ in x else 0)


In [49]:
#ML Preprocessing

features = [x for x in list(df.columns) if x.startswith("f_")]
labels = [x for x in list(df.columns) if x.startswith("label_")]

ml_df = df[features + labels]

sk_scaler = sk.preprocessing.StandardScaler().fit(ml_df)

train, test = sk.model_selection.train_test_split(ml_df, test_size = 0.15)
train_features, train_labels = train[features], train[labels]
test_features, test_labels = test[features], test[labels]


In [25]:
# SKLearn MLP on mono-colored creatures
sk_clf = sk_nn.MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 6), max_iter = 20000, verbose= True)

sk_clf.fit(train_features, train_labels["label_identity"])

Iteration 1, loss = 1.79947357
Iteration 2, loss = 1.79372601
Iteration 3, loss = 1.78804907
Iteration 4, loss = 1.78333189
Iteration 5, loss = 1.77921896
Iteration 6, loss = 1.77542920
Iteration 7, loss = 1.77216664
Iteration 8, loss = 1.76909050
Iteration 9, loss = 1.76634313
Iteration 10, loss = 1.76380152
Iteration 11, loss = 1.76140850
Iteration 12, loss = 1.75918918
Iteration 13, loss = 1.75717101
Iteration 14, loss = 1.75552121
Iteration 15, loss = 1.75399338
Iteration 16, loss = 1.75253265
Iteration 17, loss = 1.75116176
Iteration 18, loss = 1.74987166
Iteration 19, loss = 1.74869793
Iteration 20, loss = 1.74775343
Iteration 21, loss = 1.74671670
Iteration 22, loss = 1.74583379
Iteration 23, loss = 1.74500564
Iteration 24, loss = 1.74434854
Iteration 25, loss = 1.74378105
Iteration 26, loss = 1.74315970
Iteration 27, loss = 1.74260566
Iteration 28, loss = 1.74207702
Iteration 29, loss = 1.74164854
Iteration 30, loss = 1.74121297
Iteration 31, loss = 1.74076728
Iteration 32, los

In [26]:
sk_clf.score(test_features, test_labels["label_identity"])

0.20308483290488433

MAIN GOAL: Determine a creature's color identity based on: (number of features)
- CMC (1)
- Power (1)
- Toughness (1)
- Artifact / Enchantment Supertype (2)
- Type (boolean cols for each of the top 200 tribes) (200)
- Keywords (see keywords.json and list of evergreen keywords on https://mtg.fandom.com/wiki/Evergreen) (20-200)
- Name? (Would need a way to break this down (https://web.stanford.edu/group/pdplab/pdphandbook/handbookch8.html))


In [80]:
evergreen_keywords = ["Activate", "Attach", "Cast", "Counter", "Create", "Destroy", "Discard", "Exchange", "Exile", "Fight",
                       "Mill", "Play", "Reveal", "Sacrifice", "Scry", "Search", "Shuffle", "Tap", "Untap"]
my_common_words = ["Enchantment", "Artifact", "+1/+1", "Token", "Draw" "Land", "Nonland", "Spell", "Creature",]
evergreen_abilities = ["Deathtouch", "Defender", "Double Strike", "Enchant", "Equip", "First Strike", "Flash", "Flying",
                        "Haste", "Hexproof", "Indestructible", "Lifelink", "Menace", "Protection", "Reach", "Trample",
                          "Vigilance", "Ward", "Regenerate", "Shroud", "Intimidate", "Prowess"]
all_keywords = evergreen_keywords + my_common_words + evergreen_abilities

# Issue #1: Multi-faced cards from the Atomic dataset.
The more robust atmoic dataset contains split entries for DFC's, fuse cards, etc. How do we count these cards?
A. Remove them from the dataset.
    # By far the easiest approach.
B. Look at just the front.
    # Cleanest, will cause some outliers, namely on meld/fuse/transform cards
C. Add them as an additional row.
    # More accurate, but will likely be outliers
D. Add extra columns
    # Most accurate, but will mess with any ML algo if not weighted properly.

# Issue #2: Keywords Overlap with Creature Names (Death's Shadow, Flying Men)
This is mostly fine. For one thing, many of these cards match color identity with their mechanic (Flying men are blue, DS is black, etc.)
We could look into some kind of way to differentiate based on regex or pattern matching, but let's leave that for now.   

# Issue 3: The word Counter
So, despite being able to name things whatever they want, and repeated oracle changes to simplify and clarify wording, MTG still uses the word 'counter' to mean two different things: a keyword action meaning "Remove this spell or ability from the stack", and a board object placed on permanents i.e. +1/+1 counters, loyalty counters. Countering things (first interpretation) is a blue-coded mechanic, while counters (second interpretation) are a fairly universal mechanic, maybe leaning white and green but with no real identity. Again, we could look into differentiating these by string matching ("counter target" vs "+1/+1 counter"), but since counters don't really have an identity, we're changing "counter" in keyword abilities to "counter target". There are also way too many strings about the other kind of counter to simply process (counter vs counters, etc.). This does remove Baral, the bluest creature ever, from the counter keyword column, but whatever.

# Issue 4: Parsing Card Names
This might be an entirely different ML task. There are packages to determine semantic vectors of words. Use the names of each card (sum of all word vectors) as a feature set. A card like Death's Shadow would be easy peasy. Brushwagg, on the other hand, maybe not. Proper names like Olivia Voldaren, or worse, Drizz't Dourden, would be all but impossible.

# Issue 5: Improving Performance
Current accuracy for testing dataset sits at about 64%- not great for a binary classifier, but pretty good for a classifier with 6 classes (chance rate of 17%). Let's try running a binary classifier on enemy-colors cards, say blue and green (probably the most disparate colors in terms of creatures).

# Issue 6: PCA
PCA didn't improve the performance of the neural net, as expected (59.1% w/ PCA and 64% without). It also reduced performance on the decision tree (45%  with and 54% without). This makes sense, since we lose information with PCA.

In [27]:
simic_df = df.loc[df["colorIdentity"].isin([["G"], ["U"]])]
simic_df.head(10)

# SKLearn MLP on U and G creatures
sk_clf = sk_nn.MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 6), max_iter = 10000, verbose= True)

features = [x for x in list(simic_df.columns) if x.startswith("f_")]
labels = [x for x in list(simic_df.columns) if x.startswith("label_")]

ml_df = simic_df[features + labels]

sk_scaler = sk.preprocessing.StandardScaler().fit(ml_df)

train, test = sk.model_selection.train_test_split(ml_df, test_size = 0.15)
train_features, train_labels = train[features], train[labels]
test_features, test_labels = test[features], test[labels]

sk_clf.fit(train_features, train_labels["label_identity"])

Iteration 1, loss = 0.68844145
Iteration 2, loss = 0.68782144
Iteration 3, loss = 0.68724528
Iteration 4, loss = 0.68658266
Iteration 5, loss = 0.68610602
Iteration 6, loss = 0.68576749
Iteration 7, loss = 0.68549381
Iteration 8, loss = 0.68528961
Iteration 9, loss = 0.68513522
Iteration 10, loss = 0.68499342
Iteration 11, loss = 0.68490886
Iteration 12, loss = 0.68480974
Iteration 13, loss = 0.68477507
Iteration 14, loss = 0.68473257
Iteration 15, loss = 0.68464794
Iteration 16, loss = 0.68458985
Iteration 17, loss = 0.68453594
Iteration 18, loss = 0.68448767
Iteration 19, loss = 0.68444375
Iteration 20, loss = 0.68439648
Iteration 21, loss = 0.68435626
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [28]:
sk_clf.score(test_features, test_labels["label_identity"])

# Hell yeah, brother. 85% on a binary

0.5242494226327945

In [33]:
tree_clf = sk_tree.DecisionTreeClassifier()
tree_clf.fit(train_features, train_labels["label_identity"])

In [30]:
train_features.sample(1).transpose()

name,Inexorable Blob
f_is_artifact,0.0
f_is_enchantment,0.0
f_cmc,-0.6
f_pow,-0.555556
f_tough,-0.555556


In [34]:
tree_clf.score(test_features, test_labels["label_identity"])

# 54.5% accuracy on a tree. Worse than a neural net. I worry the massive amount of type and keyword columns is messing with it.
# Removing types and keywords leaves us with 26% accuracy. Not good, but better than neural net with this shrunk feature space (20%)

0.2639245929734362

In [66]:
# I forgot about dimensionality reduction!
# Let's do some basic-ass PCA on the larger dataset.


pca = PCA(n_components = 25)
pca_df = pd.DataFrame(pca.fit_transform(df[features]))
pca_df.index = df.index
pca_df["label_identity"] = df["label_identity"]

train, test = sk.model_selection.train_test_split(pca_df, test_size = 0.15)
train_features, train_labels = train[range(0,25)], train["label_identity"]
test_features, test_labels = test[range(0,25)], test["label_identity"]


In [69]:
# SKLearn MLP on mono-colored creatures
sk_clf = sk_tree.DecisionTreeClassifier()

sk_clf.fit(train_features, train_labels)

sk_clf.score(test_features, test_labels)

# 59% accuracy after PCA.


# Next time: Random forest decision tree, then exploring the best tree to find insights into the color pie
# Also next time: using larger and smaller datasets (AtomicCards, LegacyAtomic, StandardAtomic)

0.46358183376178236