# Exploring binned continuous features for fastText

By @wejradford

## Installation

* Python bindings for fastText: https://github.com/facebookresearch/fastText/
* Extra Python packages: `pip install scipy scikit-learn jupyter`
* YouTube comments dataset: https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection

## Read in the data from the YouTube Spam Collection

In [1]:
from collections import Counter
import csv
import datetime
import glob
import pprint
import random
import re
import statistics

import fastText
from sklearn.model_selection import KFold

# Be repeatable: seed the RNG *and* read the files in a fixed order.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so
# without sorting the seeded shuffle below would start from a different
# sequence on different machines.
random.seed(1)

rows = []
for fname in sorted(glob.glob('YouTube-Spam-Collection-v1/*csv')):
    # newline='' is what the csv module asks for, so quoted fields that
    # contain line breaks are parsed correctly.  The encoding is made explicit
    # instead of relying on the platform default (assumes the dataset files
    # are UTF-8).
    with open(fname, newline='', encoding='utf-8') as f:
        # Each row is a mapping from column name to string value.
        rows.extend(csv.DictReader(f))
random.shuffle(rows)
print('Read {} rows'.format(len(rows)))

Read 1956 rows


The data has a few columns:

In [2]:
# Pretty-print the first row to show which columns each record carries.
pprint.pprint(rows[0])

OrderedDict([('COMMENT_ID', '_2viQ_Qnc68YBtosyTVGLy_Fs4YYXoiWd5-wKXnaIw4'),
             ('AUTHOR', 'celebsongspoof'),
             ('DATE', '2013-10-27T19:00:17.015000'),
             ('CONTENT',
              'Hey guys whats up? I found this app that lets you get free gift '
              'card vouchers like psn cards,X-box live cards and even amazon '
              'gift cards. For free! All you have to do is  simply just '
              'download the app from the app store. It is called juno wallet. '
              'All you have to do is just sign up for the app and then '
              'complete a few surveys or just download some other free apps '
              'and you get money like 10 cents. Also, if you type in the code '
              'IM2458444. You will also start off with $0.25 free!! '),
             ('CLASS', '1')])


We're going to try two ways to represent the data:
* Text only
* Text plus binned temporal features

In [3]:
# Simple whitespace tokeniser.  Raw string so that ``\s`` reaches the regex
# engine as-is instead of being an invalid string escape.
EXP = re.compile(r'\s+')


def write_data(data, fname, binned_features=False):
    """Write rows to ``fname`` in fastText supervised format.

    One line is written per row::

        __label__<CLASS> <lower-cased tokens> _ _ <binned time tokens>

    The trailing space after ``_ _`` is kept even when there are no binned
    tokens, so the output is identical with and without a parseable date.

    Args:
        data: Iterable of mappings with ``'CLASS'``, ``'CONTENT'`` and
            ``'DATE'`` keys (``'DATE'`` is only read when
            ``binned_features`` is true).
        fname: Path of the file to (over)write.
        binned_features: When true, append ``m<month> d<day> h<hour>`` tokens
            derived from the row's ``'DATE'`` value.

    Returns:
        ``fname``, unchanged.

    Raises:
        ValueError: If ``'CLASS'`` is not an integer string, or a non-empty
            date does not match ``%Y-%m-%dT%H:%M:%S``.
    """
    # Written as UTF-8 explicitly rather than with the platform default.
    with open(fname, 'w', encoding='utf-8') as f:
        for row in data:
            label = int(row['CLASS'])

            # Base features are lower-cased, whitespace-separated tokens.
            text = ' '.join(EXP.split(row['CONTENT'].lower().strip()))

            # Optional binned time features.
            binned = ''
            if binned_features:
                # Drop fractional seconds; an empty date yields no time tokens.
                raw = row['DATE'].split('.')[0]
                if raw:
                    dt = datetime.datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S')
                    # Represent the time by $type$value.
                    # Separate "tokens" means n-grams will also trigger here
                    # (i.e. month+day, day+hour).
                    # One caveat is that sub-token embeddings may also trigger.
                    binned = 'm{} d{} h{}'.format(dt.month, dt.day, dt.hour)

            # The "_ _" padding sits between the text and the time tokens.
            f.write('__label__{} {} _ _ {}\n'.format(label, text, binned))
    return fname

Each experiment runs k-fold cross-validation over the dataset and reports the mean precision and recall at 1 (P@1, R@1) across folds.

In [4]:
def run_experiment(rows, n=10, binned_features=False):
    """Run an n-fold cross-validation experiment and print precision/recall.

    For every fold the train and test partitions are written to ``train.txt``
    and ``test.txt`` in the working directory (overwritten each fold), a
    fastText supervised model is trained on the former and tested on the
    latter.

    Args:
        rows: Sequence of dataset rows, as accepted by ``write_data``.
        n: Number of folds passed to ``KFold``.
        binned_features: Whether to include the binned time tokens.

    Prints one tab-separated line: the ``binned_features`` flag, then
    ``mean ±2*stdev`` of precision, then ``mean ±2*stdev`` of recall, each
    computed across the folds.  Returns ``None``.
    """
    p_sample, r_sample = [], []
    for train_indices, test_indices in KFold(n_splits=n).split(rows):
        train = [rows[i] for i in train_indices]
        test = [rows[i] for i in test_indices]
        train_fname = write_data(train, 'train.txt', binned_features=binned_features)
        test_fname = write_data(test, 'test.txt', binned_features=binned_features)
        model = fastText.train_supervised(
            input=train_fname,
            epoch=25,
            lr=1.0,
            wordNgrams=2,
            verbose=2,
            minCount=1,
        )
        # The first element of the test result is not needed here.
        _, p, r = model.test(test_fname)
        p_sample.append(p)
        r_sample.append(r)
    metrics = {
        'binned': binned_features,
        'p_mean': statistics.mean(p_sample),
        # Two sample standard deviations across folds, shown as the "±" value.
        'p_95': 2 * statistics.stdev(p_sample),
        'r_mean': statistics.mean(r_sample),
        'r_95': 2 * statistics.stdev(r_sample),
    }
    # The second column must report recall (r_*), not repeat precision.
    print('{binned}\t{p_mean:.2f} ±{p_95:.2f}\t{r_mean:.2f} ±{r_95:.2f}'.format(**metrics))
    
# Header row, then one result line per configuration:
# text only first, then text plus binned temporal features.
print('Binned\tP\t\tR')
for use_binned in (False, True):
    run_experiment(rows, 10, use_binned)

Binned	P		R
False	0.90 ±0.04	0.90 ±0.04
True	0.92 ±0.04	0.92 ±0.04


## Conclusion

Using binned temporal features seems to give a performance boost on this task, but it is not clear how statistically significant the difference is. YMMV...