In [11]:
import csv
import re

In [12]:
# Parallel lists: sms_data[i] is the message text, sms_results[i] is True
# when that message is labelled 'spam'.
sms_data = []
sms_results = []

# newline="" is what the csv module asks for when it is handed a file object.
# encoding="utf-8" avoids depending on the platform default encoding
# (NOTE(review): assumes the dataset file is UTF-8 -- confirm).
with open("SMSSpamCollection", newline="", encoding="utf-8") as file:
    # QUOTE_NONE: the file is plain tab-separated text, so a double quote
    # inside a message is ordinary content. With the default quoting rules an
    # unbalanced quote makes the reader swallow tabs and newlines, merging
    # several records into one field.
    reader = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if not row:
            # Blank lines carry no record.
            continue
        sms_data.append(row[1])
        sms_results.append(row[0] == 'spam')

In [13]:
# Peek at the first label: True means the row was tagged 'spam', False otherwise.
sms_results[0]

False

In [14]:
# Peek at the first message text (second column of the input file).
sms_data[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [19]:
def longest_run_of_capitol_letters_feature(text):
    """Return the length of the longest run of consecutive capital letters.

    Only ASCII uppercase letters (``A``-``Z``) count towards a run.

    Args:
        text: The message to inspect.

    Returns:
        A one-element list ``[n]`` where ``n`` is the length of the longest
        run, or ``[0]`` when the text contains no capital letters at all.
    """
    runs = re.findall(r"[A-Z]+", text)
    # default=0 covers text without any capitals, where `runs` is empty and
    # there is no "longest" element to pick.
    return [max(map(len, runs), default=0)]

# Spot-check the capital-run feature on the first ten messages.
for sms in sms_data[:10]:
    capital_run_feature = longest_run_of_capitol_letters_feature(sms)
    print(capital_run_feature)


[1]
[1]
[2]
[1]
[1]
[1]
[1]
[1]
[6]
[4]


In [23]:
def percent_periods_feature(text):
    """Return the share of characters in ``text`` that are periods.

    Despite the name, the value is a fraction between 0.0 and 1.0 (period
    count divided by total length), not a number scaled to 100.

    Args:
        text: The message to inspect.

    Returns:
        A one-element list ``[ratio]``; ``[0.0]`` for empty text.
    """
    if not text:
        # An empty message has no periods; avoid dividing by a zero length.
        return [0.0]
    periods = text.count(".")
    return [periods / len(text)]
    
# Spot-check the period-ratio feature on the first ten messages.
for sms in sms_data[:10]:
    period_feature = percent_periods_feature(sms)
    print(period_feature)

[0.07207207207207207]
[0.20689655172413793]
[0.0064516129032258064]
[0.12244897959183673]
[0.0]
[0.006802721088435374]
[0.025974025974025976]
[0.00625]
[0.01910828025477707]
[0.0]


In [24]:
def feature_vector(text):
    """Build the combined feature list for one message.

    The capital-run feature comes first, followed by the period-ratio
    feature, in a single flat list.
    """
    combined = []
    combined.extend(longest_run_of_capitol_letters_feature(text))
    combined.extend(percent_periods_feature(text))
    return combined

# Spot-check the combined feature vector on the first ten messages.
for sms in sms_data[:10]:
    combined_features = feature_vector(sms)
    print(combined_features)

[1, 0.07207207207207207]
[1, 0.20689655172413793]
[2, 0.0064516129032258064]
[1, 0.12244897959183673]
[1, 0.0]
[1, 0.006802721088435374]
[1, 0.025974025974025976]
[1, 0.00625]
[6, 0.01910828025477707]
[4, 0.0]


In [42]:
import numpy as np

class CustomFeaturizer:
    """Build a feature matrix from raw items using plain functions.

    Each featurizer is a callable that takes one datum and returns its
    feature values (a sequence, or a single number). For every datum the
    outputs of all featurizers are flattened and joined, in the order the
    featurizers were given, to form one row of the result.
    """

    def __init__(self, *featurizers):
        """Store the featurizer callables, in the order they should be applied."""
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        There is nothing to learn from the data; the method exists so the
        object offers the usual ``fit``/``transform`` pair.
        """
        return self

    def transform(self, X):
        """Given an iterable of original data, return an array of feature vectors.

        Args:
            X: Iterable of data items, each passed unchanged to every
                featurizer.

        Returns:
            A NumPy array with one row per item in ``X``. Featurizers may
            return different numbers of values; each row is the
            concatenation of all of them.
        """
        fvs = []
        for datum in X:
            # ravel + concatenate flattens each featurizer's output on its
            # own, so outputs of different lengths still line up into one
            # flat row instead of a ragged nested list.
            parts = [np.ravel(f(datum)) for f in self.featurizers]
            # np.concatenate rejects an empty list, so with no featurizers
            # the row is simply empty.
            fvs.append(np.concatenate(parts) if parts else np.array([]))
        return np.array(fvs)
    
# Wire both hand-written features into a single featurizer.
sms_featurizer = CustomFeaturizer(
    longest_run_of_capitol_letters_feature,
    percent_periods_feature,
)

# Show the resulting feature matrix for the first ten messages.
sample_features = sms_featurizer.transform(sms_data[:10])
print(sample_features)

[[ 1.          0.07207207]
 [ 1.          0.20689655]
 [ 2.          0.00645161]
 [ 1.          0.12244898]
 [ 1.          0.        ]
 [ 1.          0.00680272]
 [ 1.          0.02597403]
 [ 1.          0.00625   ]
 [ 6.          0.01910828]
 [ 4.          0.        ]]
