In [2]:
import csv
import re
import numpy as np
import random

from collections import Counter

from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def longest_run_of_capitol_letters_feature(char, text):
    """Return the length of the longest unbroken run of `char` in `text`.

    Despite its name, this function does not look at capital letters: it
    measures consecutive repetitions of the given `char` (for example
    ``'~'``, ``'.'`` or ``'-'``).

    Args:
        char: The character (or substring) whose repetitions are measured.
            Any value is accepted; it is escaped before being used in a
            regular expression, so regex metacharacters are matched
            literally.
        text: The string to search.

    Returns:
        The number of characters in the longest run of back-to-back
        repetitions of `char`, or 0 if `char` is empty or does not occur
        in `text`.
    """
    # An empty needle has no meaningful "run"; bail out before building a
    # pattern that would repeat an empty group.
    if not char:
        return 0

    # Escaping lets one pattern serve every character instead of keeping a
    # hand-written branch per supported symbol.
    pattern = "(?:" + re.escape(char) + ")+"
    run_lengths = [len(run) for run in re.findall(pattern, text)]
    return max(run_lengths) if run_lengths else 0

def longest_run_of_character_feature(text):
    """Return the length of the longest run of consecutive A-Z letters.

    Only the ASCII uppercase range ``A``-``Z`` counts toward a run; any
    other character ends it.

    Args:
        text: The string to scan.

    Returns:
        The length of the longest uppercase run, or 0 when `text` contains
        no uppercase letters.
    """
    run_lengths = [len(match) for match in re.findall(r"[A-Z]+", text)]
    return max(run_lengths) if run_lengths else 0
    
def percent_character_feature(char, text=None):
    """Build a feature function measuring how much of a text is `char`.

    Args:
        char: The character (or substring) to count.
        text: Unused. The returned function takes the text to analyse as
            its own argument; this parameter is kept only so existing
            two-argument calls keep working.

    Returns:
        A one-argument function ``feature_fn(text)`` that returns the
        number of occurrences of `char` in `text` divided by the length of
        `text` (a fraction, not a value scaled to 100). For empty `text`
        it returns 0.0.
    """
    def feature_fn(text):
        total = len(text)
        # An empty text would divide by zero; treat it as containing none
        # of the character so one empty sample cannot abort a whole batch.
        if total == 0:
            return 0.0
        return text.count(char) / total
    return feature_fn

class FunctionFeaturizer(TransformerMixin):
    """Transformer that turns each datum into a vector of function outputs.

    Each featurizer passed to the constructor is a callable taking a single
    datum; `transform` applies all of them, in the order given, to every
    datum.
    """

    def __init__(self, *featurizers):
        """Store the featurizer callables in the order they were passed."""
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return this same object.

        There is nothing to learn from the data; the method exists so the
        object offers the usual `fit` / `transform` pair.
        """
        return self

    def transform(self, X):
        """Return an array with one row per datum in `X`.

        Each row holds the result of every featurizer applied to that
        datum, in featurizer order.
        """
        return np.array(
            [[featurize(item) for featurize in self.featurizers] for item in X]
        )