Data source (Kaggle, New York Times comments): https://www.kaggle.com/aashita/nyt-comments#CommentsApril2017.csv

# Prepare Data

In [1]:
import pandas as pd
# Load the full April 2017 comments file from the Kaggle dataset into memory.
comments_csv = 'data/nyt-comments/CommentsApril2017.csv'
df_all = pd.read_csv(comments_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
len(df_all)

243832

In [3]:
# Work on a small sample: the first five rows of the two columns used below.
# Take an explicit copy so that the later `df['commentBodyNew'] = ...`
# assignments operate on an independent frame instead of a slice derived from
# `df_all` (which is what triggers pandas' SettingWithCopyWarning).
df = df_all[['commentBody', 'commentID']].head().copy()

In [4]:
df

Unnamed: 0,commentBody,commentID
0,This project makes me happy to be a 30+ year T...,22022598.0
1,Stunning photos and reportage. Infuriating tha...,22017350.0
2,Brilliant work from conception to execution. I...,22017334.0
3,NYT reporters should provide a contributor's l...,22015913.0
4,Could only have been done in print. Stunning.,22015466.0


# Chariot

In [5]:
import chariot.transformer as ct

## Text preprocessor

In [6]:
# Text-level preprocessing: run SymbolFilter over the raw comment text and
# store the result in a new column, leaving `commentBody` untouched so the two
# can be compared side by side.
symbol_filter = ct.text.SymbolFilter()
df['commentBodyNew'] = symbol_filter.transform(df['commentBody'])

In [7]:
df

Unnamed: 0,commentBody,commentID,commentBodyNew
0,This project makes me happy to be a 30+ year T...,22022598.0,This project makes me happy to be a 30 year T...
1,Stunning photos and reportage. Infuriating tha...,22017350.0,Stunning photos and reportage Infuriating tha...
2,Brilliant work from conception to execution. I...,22017334.0,Brilliant work from conception to execution I...
3,NYT reporters should provide a contributor's l...,22015913.0,NYT reporters should provide a contributor's l...
4,Could only have been done in print. Stunning.,22015466.0,Could only have been done in print Stunning


In [8]:
df['commentBody'][0]

'This project makes me happy to be a 30+ year Times subscriber... continue to innovate across all platforms, please.'

In [9]:
df['commentBodyNew'][0]

'This project makes me happy to be a 30  year Times subscriber    continue to innovate across all platforms  please '

## Tokenizer

If you encounter the following warning:
```
    Warning: no model found for 'en'

    Only loading the 'en' tokenizer.
```

running the following command to download the spaCy English model may help:
```
python -m spacy.en.download all
```
https://stackoverflow.com/questions/43459437/spacy-link-error

In [10]:
# Tokenize the filtered text with the English tokenizer. The column is
# overwritten: each entry goes from a string to a list of tokens.
tokenizer = ct.Tokenizer(lang="en")
df['commentBodyNew'] = tokenizer.transform(df['commentBodyNew'])

In [11]:
df

Unnamed: 0,commentBody,commentID,commentBodyNew
0,This project makes me happy to be a 30+ year T...,22022598.0,"[<This:DET>, <project:NOUN>, <makes:VERB>, <me..."
1,Stunning photos and reportage. Infuriating tha...,22017350.0,"[<Stunning:VERB>, <photos:NOUN>, <and:CCONJ>, ..."
2,Brilliant work from conception to execution. I...,22017334.0,"[<Brilliant:ADJ>, <work:NOUN>, <from:ADP>, <co..."
3,NYT reporters should provide a contributor's l...,22015913.0,"[<NYT:PROPN>, <reporters:NOUN>, <should:VERB>,..."
4,Could only have been done in print. Stunning.,22015466.0,"[<Could:VERB>, <only:ADV>, <have:VERB>, <been:..."


In [12]:
df['commentBodyNew'][0]

[<This:DET>,
 <project:NOUN>,
 <makes:VERB>,
 <me:PRON>,
 <happy:ADJ>,
 <to:PART>,
 <be:VERB>,
 <a:DET>,
 <30:NUM>,
 < :SPACE>,
 <year:NOUN>,
 <Times:PROPN>,
 <subscriber:NOUN>,
 <   :SPACE>,
 <continue:VERB>,
 <to:PART>,
 <innovate:VERB>,
 <across:ADV>,
 <all:DET>,
 <platforms:NOUN>,
 < :SPACE>,
 <please:INTJ>]

## Token preprocessor

In [13]:
# Token-level preprocessing: drop English stop words from each token list.
# The column is overwritten again with the filtered lists.
stopword_filter = ct.token.StopwordFilter(lang="en")
df['commentBodyNew'] = stopword_filter.transform(df['commentBodyNew'])

In [14]:
df

Unnamed: 0,commentBody,commentID,commentBodyNew
0,This project makes me happy to be a 30+ year T...,22022598.0,"[<project:NOUN>, <makes:VERB>, <happy:ADJ>, <3..."
1,Stunning photos and reportage. Infuriating tha...,22017350.0,"[<Stunning:VERB>, <photos:NOUN>, <reportage:NO..."
2,Brilliant work from conception to execution. I...,22017334.0,"[<Brilliant:ADJ>, <work:NOUN>, <conception:NOU..."
3,NYT reporters should provide a contributor's l...,22015913.0,"[<NYT:PROPN>, <reporters:NOUN>, <provide:VERB>..."
4,Could only have been done in print. Stunning.,22015466.0,"[<print:NOUN>, < :SPACE>, <Stunning:VERB>, < :..."


In [15]:
df['commentBodyNew'][0]

[<project:NOUN>,
 <makes:VERB>,
 <happy:ADJ>,
 <30:NUM>,
 < :SPACE>,
 <year:NOUN>,
 <Times:PROPN>,
 <subscriber:NOUN>,
 <   :SPACE>,
 <continue:VERB>,
 <innovate:VERB>,
 <platforms:NOUN>,
 < :SPACE>]

## Define a Pipeline

In [16]:
import chariot.transformer as ct
from chariot.preprocessor import Preprocessor


# Chain the preprocessing steps into a single Preprocessor instead of applying
# them one at a time. Note that this pipeline starts with UnicodeNormalizer
# rather than SymbolFilter, so punctuation is kept and shows up as PUNCT tokens
# in the result, unlike the SPACE tokens in the step-by-step column above.
preprocessor = Preprocessor()
raw_comments = df['commentBody']

# Register the steps in execution order, then fit the pipeline on the raw text.
(
    preprocessor
    .stack(ct.text.UnicodeNormalizer())
    .stack(ct.Tokenizer("en"))
    .stack(ct.token.StopwordFilter("en"))
    .fit(raw_comments)
)

# Apply the fitted pipeline; `df` itself is not modified by this cell.
preprocessed = preprocessor.transform(raw_comments)

In [17]:
preprocessed

0    [<project:NOUN>, <makes:VERB>, <happy:ADJ>, <3...
1    [<Stunning:VERB>, <photos:NOUN>, <reportage:NO...
2    [<Brilliant:ADJ>, <work:NOUN>, <conception:NOU...
3    [<NYT:PROPN>, <reporters:NOUN>, <provide:VERB>...
4    [<print:NOUN>, <.:PUNCT>, <Stunning:VERB>, <.:...
dtype: object

In [18]:
df

Unnamed: 0,commentBody,commentID,commentBodyNew
0,This project makes me happy to be a 30+ year T...,22022598.0,"[<project:NOUN>, <makes:VERB>, <happy:ADJ>, <3..."
1,Stunning photos and reportage. Infuriating tha...,22017350.0,"[<Stunning:VERB>, <photos:NOUN>, <reportage:NO..."
2,Brilliant work from conception to execution. I...,22017334.0,"[<Brilliant:ADJ>, <work:NOUN>, <conception:NOU..."
3,NYT reporters should provide a contributor's l...,22015913.0,"[<NYT:PROPN>, <reporters:NOUN>, <provide:VERB>..."
4,Could only have been done in print. Stunning.,22015466.0,"[<print:NOUN>, < :SPACE>, <Stunning:VERB>, < :..."
