In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

## Loading data

In [2]:
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups(subset="train")

print("Number of text samples {}".format(len(data.data)))

Number of text samples 11314


In [3]:
data.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [4]:
import pandas as pd

df = pd.DataFrame()
df = df.assign(text=data["data"]).assign(target=data["target"])

In [5]:
df.head()

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


## Defining Cleaning Functions

### Remove Email Addresses

In [6]:
import re

test_text = data.data[0]

def remove_emails(text):
    '''
    Remove any email address in text with
    
    The `regex` can catch any number of email addresses in the text. Regex can be tried here: https://regex101.com/r/ZjgyLc/2

    '''
    regex =  r'\S*@\S*\s?'
    return re.sub(regex, '', text)

test_text = remove_emails(data.data[0])
print(test_text)

From: (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







### Remove Newline Characters

In [7]:
def remove_newlinechars(text):
    '''
    Substitute any newline chars with a whitespach
    
    The `regex` can be tried at: https://regex101.com/r/2fImPz/1/
    '''
    regex = r'\s+'
    return re.sub(regex, ' ', text)

test_text = remove_newlinechars(test_text)
print(test_text)

From: (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- 


### Tokenization

In [8]:
import nltk

def tokenize(text):
    '''
    Tokenize text
    '''
    tokens = nltk.word_tokenize(text)
    
    return list(
        filter(lambda word: word.isalnum(), tokens)
    )

test_text = tokenize(test_text)
print(test_text)

['From', 'where', 'my', 'thing', 'Subject', 'WHAT', 'car', 'is', 'this', 'Organization', 'University', 'of', 'Maryland', 'College', 'Park', 'Lines', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'I', 'saw', 'the', 'other', 'day', 'It', 'was', 'a', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', '70s', 'It', 'was', 'called', 'a', 'Bricklin', 'The', 'doors', 'were', 'really', 'small', 'In', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'This', 'is', 'all', 'I', 'know', 'If', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'Thanks', 'IL', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'Lerxst']


In [9]:
test_text = [e.lower() for e in test_text]

### Removing stopwords

In [10]:
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

## Add some common words from text
stop_words.extend(["from","subject","summary","keywords", "article"])

def remove_stopwords(words):
    '''
    Remove stop words from the list of words
    '''
    
    filtered = filter(lambda word: word not in stop_words, words)
    
    return list(filtered)

test_text = remove_stopwords(test_text)
print(test_text)

['thing', 'car', 'organization', 'university', 'maryland', 'college', 'park', 'lines', '15', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'sports', 'car', 'looked', 'late', 'early', '70s', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']


### Lemmatization

In [11]:
import spacy

nlp = spacy.load("en_core_web_sm")

def lemmatize(text, nlp=nlp):
    
    doc = nlp(" ".join(text))
    
    lemmatized = [token.lemma_ for token in doc]
    
    return lemmatized

test_text = lemmatize(test_text,nlp)
print(test_text)

['th', 'car', 'organization', 'university', 'maryland', 'college', 'park', 'line', '15', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'look', 'late', 'early', '70', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'whatev', 'info', 'funky', 'look', 'car', 'please', 'thank', 'il', 'bring', 'neighborhood', 'lerxst']


## Processing text in Dataframe

In [12]:
import time

t0 = time.time()
def clean_text(df):
    '''
    Take in a Dataframe, and process it
    '''
    df["cleaned_text"] = df.text.map(lambda text:text.lower()).map(remove_emails).map(remove_newlinechars).map(remove_stopwords).map(lemmatize)
    return df

df = clean_text(df)
t1 = time.time()
print("Time to process without Dask {}".format(t1-t0))

Time to process without Dask 258.16533493995667


## Processing text with Dask

### Creating dask dataframe

In [14]:
import dask.dataframe as ddf

dask_dataframe = ddf.from_pandas(df, npartitions=6)

In [15]:
t0 = time.time()
result = dask_dataframe.map_partitions(clean_text, meta=df)
df = result.compute()
t1 = time.time()
print("Time to process with Dask {}".format(t1-t0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Time to process with Dask 136.15019989013672
