In [1]:
import ipdb
import numpy as np
from collections import defaultdict, Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [2]:
def select_vectorizer(vectorizer: str, **kwargs):
    """
    Identify vectorizer used and return it to be used.

    :param vectorizer: Vectorizer to be used.
    :return v: Vectorizer function.
    """
    if not any(vec in vectorizer for vec in ['dict', 'count', 'hash', 'tfidf']):
        print("You need to select from the options: dict, count, hash, tfidf. Defaulting to Dict.")
        return DictVectorizer

    vect = vectorizer.lower(**kwargs)
    if 'dict' in vect:
        v = DictVectorizer(**kwargs)
    elif 'tfidf' in vect:
        v = TfidfVectorizer(**kwargs)
    elif 'hash' in vect:
        v = HashingVectorizer(**kwargs)
    elif 'count' in vect:
        v = CountVectorizer(**kwargs)
    setattr(v, 'fitted', False)

    return v


def vectorize(data, dataset, vect):
    """
    Vectorise documents.

    :dataset (data.GeneralDataset): Dataset object.
    :data (base.DataType): Dataset to vectorize.
    :vect (base.VectType): Vectorizer to use.
    :returns vectorized (base.DataType): Return vectorized dataset.
    """
    #data = [getattr(doc, getattr(f, 'name')) for f in dataset.train_fields for doc in data]

    if vect.fitted:
        vectorized = vect.transform(data)
    else:
        vect.fit(data)
        vectorized = vect.transform(data)
        vect.fitted = True
    return vectorized

In [6]:
def top_sklearn_features(model, dataset, vect):
    """
    Identify top features for scikit-learn model.

    :model (base.ModelType): Trained model to identify features for.
    :dataset (GeneralDataset): Dataset holding the label information.
    :vect (base.VectType): Fitted vectorizer.
    """
    if dataset == 2:
        coefs = binary_sklearn_features(model, dataset, vect)
    elif dataset > 2:
        coefs = multinomial_sklearn_features(model, dataset, vect)
    return coefs


def binary_sklearn_features(model, dataset, vect) -> dict:
    """
    Identify top features for binary scikit-learn model.

    :model (base.ModelType): Trained model to identify features for.
    :dataset (GeneralDataset): Dataset holding the label information.
    :vect (base.VectType): Fitted vectorizer.
    :return coefs (dict): Returns coefficient dictionary.
    """
    coefs = defaultdict(Counter())

    if 'RandomForest' in model.name:
        coefs[0].update({vect.feature_names_[f]: model.feature_importances_[f]
                         for f in np.argsort(model.feature_importances_)})
    elif 'SVM' in model.name:
        coefs[0].update({vect.feature_names_[v]: model.coef_[0, v] for v in range(model.coef_.shape[1])})
    elif 'LogisticRegression' in model.name:
        coefs[0].update({vect.feature_names_[f]: model.coef_[0, f] for f in np.argsort(model.coef_[0])})
    return coefs


def multinomial_sklearn_features(model, dataset, vect) -> dict:
    """
    Identify top features for multiclass scikit-learn model.

    :model (base.ModelType): Trained model to identify features for.
    :dataset (GeneralDataset): Dataset holding the label information.
    :vect (base.VectType): Fitted vectorizer.
    """

    coefs = defaultdict(Counter)
    if 'RandomForest' in model.name:
        ipdb.set_trace()
        for i, c in enumerate(range(dataset)):
            coefs[i].update({vect.feature_names_[f]: model.feature_importances_[f]
                             for f in np.argsort(model.feature_importances_)})
    elif 'SVM' in model.name:
        for i, c in enumerate(range(dataset)):
            coefs[i].update({vect.feature_names_[v]: model.coef_[i, v] for v in np.argsort(model.coef_[i])})
    elif 'LogisticRegression' in model.name:
        for i, c in enumerate(range(dataset)):
            coefs[i].update({vect.feature_names_[f]: model.coef_[i, f] for f in np.argsort(model.coef_[i])})

    return coefs

In [7]:
labels = ['noHate', 'noHate', 'someHate', 'someHate', 'noHate', 'noHate', 'someHate', 'someHate', 'someHate', 'noHate', 'someHate', 'noHate', 'someHate', 'noHate', 'someHate', 'noHate', 'noHate', 'noHate', 'noHate', 'noHate', 'hate', 'hate', 'hate', 'hate', 'hate', 'noHate', 'hate', 'noHate', 'hate', 'hate', 'hate', 'hate', 'hate']
text = ["In order to help increase the booklets downloads , it would be great if all Stormfronters who had YouTube accounts , could display the following text in the description boxes of their uploaded YouTube videos .", "Click below for a FREE download of a colorfully illustrated 132 page e-book on the Zionist-engineered INTENTIONAL destruction of Western civilization .", "She may or may not be a Jew but she 's certainly stupid , she seems to think the Blacks wo n't kill her alongside every other White they can get their dirty hands on , what a muppet !", "I think you should write a book as well .", "I guess I 'll just stick to homeschooling my kids , when and if I have them ...", "Their gods often had pale skin and blue eyes but I suppose literacy is n't valued highly by their cultural intelligencia .", "Greece need a man like Adolf Hitler and now is the perfect time to start a national political movement to free Your country from the zionist bankers", "He is a pathetic little chimp .", "com/watch ? v = kBD6Gm6VoDM Then watch 8 min. Obama at AIPAC @ youtube .", "Try El Paso Texas Soo much mudd invasion that the river is the same color as those playing/crossing in it .. year round !", "Welcome to the front lines ...",  "Here is a video of the event that is worth watching just to see a beautiful White lady screaming `` scum , scum , scum , f * * * ing scum !!! '' at the muslim filth .", "Nice to know that are still a few men out there who `` have a pair '' .", "I suppose the concept of a male cow and a female cow producing MORE cows , and the repeating process is too much for them to understand .", "May they all get mad cow disease .", "* Unsubscribed * Off to the SA threads .", "I'm sad to find this out right now but I 'm also sorry to inform you that Kevin was killed several years ago .", "Hundreds of people , but no blacks !", "Over and over , you hear about home-schooled kids beating out public school kids for spelling bees .", "The Rest Of The Story", "they act as if negros with aids should have the right to reproduce when they will die off and have no way to take care of the kid I cant believe that they are doing this by the millions its crazy", "I hope they all starve .", "No white person should send a single penny or lift a single finger to help them .", "He walks upright , and has human arms and legs .", "I 've seen negroes who look ( and undoubtedly act ) like him .", "Again, every time I will see filthy comment about Georgia without providing any facts , I will respond .", "I will respond until Armenian scum will stop", "The reason large white families were encouraged those days was to build up the population of white Germans , which were ravaged by the jewish attacks at them during WW1 and in hyperinflation 1920 's .", "Where a white woman can go about in public and not fear assault by muds .", "I retract - always .... Always because an animal lacks the skills to properly raise a ( half ) human child .", "When most whites today have so much white guilt laid upon them that they feel they have to act black or have sexual relations and children with blacks just to be accepted there is a problem with the white race memo .", "There should not be problems between the different classes inside the race white since we are all whites they should be joined to fight against non-whites they are our enemies .", "Black ``women''on London transport ..."]
print(len(labels), len(text))

33 33


In [8]:
unigrams = [tok for doc in text for tok in doc.split()]
labs = {lab: ix for ix, lab in enumerate(set(labels))}
ix_to_labs = {ix: lab for lab, ix in labs.items()}
print(labs)

{'someHate': 0, 'hate': 1, 'noHate': 2}


In [9]:
v = select_vectorizer('dict')
trainx = vectorize([Counter(doc.split()) for doc in text], None, v)
trainy = [labs[lab] for lab in labels]
print(trainy)

[2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1]


In [10]:
svm = SVC(kernel = 'linear')
svm.name = 'SVM'

In [11]:
svm.fit(trainx, trainy)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
w = top_sklearn_features(svm, len(labs), v)
print(w)

defaultdict(<class 'collections.Counter'>, {0: Counter({'!': 0.01608917019176184}), 1: Counter({'!': -0.11687256958277643}), 2: Counter({'!': -0.146313892265023})})


In [13]:
rfc = RandomForestClassifier()
rfc.fit(trainx, trainy)
rfc.name = "RandomForest"



In [14]:
w_clf = top_sklearn_features(rfc, len(labs), v)

> [0;32m<ipython-input-6-3a6b513d1d30>[0m(49)[0;36mmultinomial_sklearn_features[0;34m()[0m
[0;32m     48 [0;31m        [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 49 [0;31m        [0;32mfor[0m [0mi[0m[0;34m,[0m [0mc[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mrange[0m[0;34m([0m[0mdataset[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     50 [0;31m            coefs[i].update({vect.feature_names_[f]: model.feature_importances_[f]
[0m
ipdb> n
> [0;32m<ipython-input-6-3a6b513d1d30>[0m(50)[0;36mmultinomial_sklearn_features[0;34m()[0m
[0;32m     49 [0;31m        [0;32mfor[0m [0mi[0m[0;34m,[0m [0mc[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mrange[0m[0;34m([0m[0mdataset[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 50 [0;31m            coefs[i].update({vect.feature_names_[f]: model.feature_importances_[f]
[0m

In [15]:
w_clf

defaultdict(collections.Counter,
            {0: Counter({'!': 0.0,
                      'movement': 0.0,
                      'most': 0.0,
                      'min.': 0.0,
                      'millions': 0.0,
                      'men': 0.0,
                      'memo': 0.0,
                      'man': 0.0,
                      'male': 0.0,
                      'mad': 0.0,
                      'literacy': 0.0,
                      'like': 0.0,
                      'legs': 0.0,
                      'laid': 0.0,
                      'lady': 0.0,
                      'killed': 0.0,
                      'mudd': 0.0,
                      'muds': 0.0,
                      'muppet': 0.0,
                      'muslim': 0.0,
                      'pair': 0.0,
                      'page': 0.0,
                      'our': 0.0,
                      'other': 0.0,
                      'order': 0.0,
                      'on': 0.0,
                      'often': 0.0,
       