In [1]:
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import RegexpTokenizer

In [29]:
def fetch_raw_data(categories, remove):
    '''
    Load the 'train' and 'test' subsets of 20 Newsgroups from './data/'.

    `categories` and `remove` are forwarded unchanged to
    `fetch_20newsgroups` for both subsets.

    Returns the tuple (X_train, y_train, X_test, y_test), where the X values
    are the `.data` attributes and the y values are the `.target` attributes
    of the fetched subsets.
    '''
    # Same fetch for each subset; 'train' is requested before 'test'.
    bunches = {
        subset: fetch_20newsgroups(data_home='./data/',
                                   subset=subset,
                                   categories=categories,
                                   remove=remove)
        for subset in ('train', 'test')
    }
    train_bunch, test_bunch = bunches['train'], bunches['test']

    return (train_bunch.data, train_bunch.target,
            test_bunch.data, test_bunch.target)

def clean_head_foot(docs):
    '''
    Remove header lines except 'Subject' and 'Organization', and cut the
    trailing footer off each document.

    Each document is split at its first blank line ('\\n\\n') into a header
    part and a body part.

    Header: only lines whose text before the first ':' is exactly 'Subject'
    or 'Organization' are kept, and each kept line contributes only the text
    after its *last* ':' (so anything before a later colon, e.g. a 'Re:'
    prefix, is dropped as well).

    Footer: the body is scanned from its last line upwards for the first
    "separator" line - an empty line, or a line made up solely of one
    repeated character among '-', '=', '#', '*'. That line and everything
    after it are discarded. If the body contains no separator line at all,
    the body is kept in full.

    Returns a new list with one cleaned string per input document, built as
    cleaned header + '\\n' + cleaned body.
    '''
    clean_docs = []
    for doc in docs:
        head, _, tail = doc.partition('\n\n')

        # clean head
        clean_head = '\n'.join(line.strip().split(':')[-1]
                               for line in head.strip().split('\n')
                               if line.strip().split(':')[0]
                               in ('Subject', 'Organization'))

        # remove foot
        body_lines = tail.strip().split('\n')
        # Default to keeping every line: without this, a body that has no
        # separator line would be sliced as [:0] and lost entirely.
        cut = len(body_lines)
        for i in range(len(body_lines) - 1, -1, -1):
            # str.strip(ch) == '' holds for an empty line and for a line
            # consisting only of repetitions of ch.
            if any(body_lines[i].strip(ch) == '' for ch in '-=#*'):
                cut = i
                break
        clean_tail = '\n'.join(body_lines[:cut])

        clean_docs.append(clean_head + '\n' + clean_tail)

    return clean_docs

In [4]:
# No category filter and nothing stripped: documents keep their headers,
# which clean_head_foot relies on.
X_train, y_train, X_test, y_test = fetch_raw_data(categories=None, remove=())

In [30]:
# NOTE: this rebinds X_train to the cleaned text, so re-running the cell feeds
# already-cleaned documents back into clean_head_foot. Re-run the fetch cell
# first if this cell needs to be executed again.
# NOTE(review): X_test is not cleaned in the cells shown here - confirm.
X_train = clean_head_foot(X_train)

In [33]:
# Take the last cleaned training document as a sample; print() is used so the
# embedded newlines are rendered instead of shown as '\n' in a repr.
doc = X_train[-1]
print(doc)

 stolen CBR900RR
 California Institute of Technology, Pasadena
Stolen from Pasadena between 4:30 and 6:30 pm on 4/15.

Blue and white Honda CBR900RR california plate KG CBR.   Serial number
JH2SC281XPM100187, engine number 2101240.

No turn signals or mirrors, lights taped over for track riders session
at Willow Springs tomorrow.  Guess I'll miss it.  :-(((

Help me find my baby!!!


In [41]:
# A token is a run of word characters and apostrophes, so contractions such
# as "I'll" stay in one piece. The pattern is a raw string: in a normal string
# literal '\w' is an invalid escape sequence that newer Python versions warn
# about. The compiled regex is unchanged.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(doc)
tokens

['stolen',
 'CBR900RR',
 'California',
 'Institute',
 'of',
 'Technology',
 'Pasadena',
 'Stolen',
 'from',
 'Pasadena',
 'between',
 '4',
 '30',
 'and',
 '6',
 '30',
 'pm',
 'on',
 '4',
 '15',
 'Blue',
 'and',
 'white',
 'Honda',
 'CBR900RR',
 'california',
 'plate',
 'KG',
 'CBR',
 'Serial',
 'number',
 'JH2SC281XPM100187',
 'engine',
 'number',
 '2101240',
 'No',
 'turn',
 'signals',
 'or',
 'mirrors',
 'lights',
 'taped',
 'over',
 'for',
 'track',
 'riders',
 'session',
 'at',
 'Willow',
 'Springs',
 'tomorrow',
 'Guess',
 "I'll",
 'miss',
 'it',
 'Help',
 'me',
 'find',
 'my',
 'baby']