In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
# Shared word tokenizer used by wordCount and unigramCount: the pattern \w+
# matches runs of word characters, so punctuation and whitespace are not tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
def wordCount(review):
    """Return the number of word tokens in a review's text.

    Args:
        review: Mapping-like row (e.g. a DataFrame row) exposing a
            "reviewText" entry.

    Returns:
        The number of tokens the module-level ``tokenizer`` finds in the
        text, or 0 when the text is missing or cannot be tokenized.
    """
    try:
        return len(tokenizer.tokenize(review["reviewText"]))
    except Exception:
        # Best effort: any ordinary failure (missing key, non-string text)
        # counts as zero words. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt/SystemExit propagate, so a long
        # row-wise apply can still be interrupted.
        return 0
    
def posTag(review):
    """Count universal part-of-speech tags in a review's lower-cased text.

    Args:
        review: Mapping-like row (e.g. a DataFrame row) exposing a
            "reviewText" entry.

    Returns:
        An ``nltk.FreqDist`` mapping each universal POS tag to the number of
        tokens that received it. An empty ``FreqDist`` is returned when the
        text is missing or is not a string, which mirrors how the other
        feature helpers fall back to 0 for such rows.
    """
    try:
        text = review["reviewText"].lower()
    except (AttributeError, KeyError, TypeError):
        # Missing or non-string text (e.g. NaN read from the CSV) has no tags.
        # Only the text extraction is guarded, so real tagging problems such
        # as a missing tagger model still surface instead of being hidden.
        return nltk.FreqDist()
    tokens = nltk.word_tokenize(text)
    return nltk.FreqDist(tag for (word, tag) in nltk.pos_tag(tokens, tagset="universal"))

def adjectives(review):
    """Return the adjective count stored in a review's POS-tag distribution.

    Args:
        review: Mapping-like row exposing a "pos_tag" entry that maps tag
            names to counts.

    Returns:
        The value stored under "ADJ", or 0 when "pos_tag" or "ADJ" is
        missing or the lookup otherwise fails.
    """
    try:
        return review["pos_tag"]["ADJ"]
    except Exception:
        # Best effort: treat any ordinary lookup failure as zero adjectives.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt/SystemExit propagate.
        return 0
    
def noun(review):
    """Return the noun count stored in a review's POS-tag distribution.

    Args:
        review: Mapping-like row exposing a "pos_tag" entry that maps tag
            names to counts.

    Returns:
        The value stored under "NOUN", or 0 when "pos_tag" or "NOUN" is
        missing or the lookup otherwise fails.
    """
    try:
        return review["pos_tag"]["NOUN"]
    except Exception:
        # Best effort: treat any ordinary lookup failure as zero nouns.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt/SystemExit propagate.
        return 0

def sentenceCount(review):
    """Return the number of sentences in a review's text.

    Args:
        review: Mapping-like row (e.g. a DataFrame row) exposing a
            "reviewText" entry.

    Returns:
        The number of sentences ``nltk.sent_tokenize`` finds in the text, or
        0 when the text is missing or cannot be tokenized.
    """
    try:
        return len(nltk.sent_tokenize(review["reviewText"]))
    except Exception:
        # Best effort: any ordinary failure counts as zero sentences.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt/SystemExit propagate, so a long row-wise apply
        # can still be interrupted.
        return 0
    
def unigramCount(review):
    """Return the number of distinct word tokens in a review's text.

    Args:
        review: Mapping-like row (e.g. a DataFrame row) exposing a
            "reviewText" entry.

    Returns:
        The count of unique tokens the module-level ``tokenizer`` finds in
        the text, or 0 when the text is missing or cannot be tokenized.
    """
    try:
        # The number of distinct 1-grams equals the number of distinct
        # tokens, so a set gives the same result as sizing a frequency
        # distribution built over 1-gram tuples.
        return len(set(tokenizer.tokenize(review["reviewText"])))
    except Exception:
        # Best effort: any ordinary failure counts as zero unigrams.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt/SystemExit propagate.
        return 0

In [6]:
# Load the sampled book reviews, then add one text-derived feature column at a
# time. Each helper is applied row-wise (axis=1) and receives the whole row.
reviews_features = pd.read_csv('data/book_sample.csv.gz')

# The printed labels are progress markers for the slow row-wise passes. The
# parenthesized single-argument form prints the same text on Python 2 and 3.
reviews_features['word_count'] = reviews_features.apply(wordCount, axis=1)
print('word count')
reviews_features['sentence_count'] = reviews_features.apply(sentenceCount, axis=1)
print('sentence_count')
reviews_features['unigram_count'] = reviews_features.apply(unigramCount, axis=1)
print('unigram_count')
reviews_features['pos_tag'] = reviews_features.apply(posTag, axis=1)
print('pos_tag')

# adjectives/noun read the 'pos_tag' column, so they must run after it exists.
reviews_features['adj'] = reviews_features.apply(adjectives, axis=1)
reviews_features['noun'] = reviews_features.apply(noun, axis=1)

# Last expression of the cell: display (rows, columns) as a sanity check.
reviews_features.shape

word count
sentence_count
unigram_count
pos_tag


(24234, 29)

In [7]:
# Keep a reference to the frame's column index; the bare name on the last
# line makes the notebook display it so the added feature columns can be checked.
df_columns = reviews_features.columns
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1.1',
       u'Unnamed: 0.1.1', u'asin', u'helpful', u'overall', u'reviewText',
       u'reviewTime', u'reviewerID', u'reviewerName', u'summary',
       u'unixReviewTime', u'helpfulness', u'tot', u'pageRank', u'hits',
       u'powerWithStar', u'word_count', u'pr_hs', u'pr_len', u'hs_len',
       u'pr_hs_len', u'sentence_count', u'unigram_count', u'pos_tag', u'adj',
       u'noun'],
      dtype='object')

In [8]:
# Persist the enriched frame. index=False keeps the row index out of the file:
# writing it is what produces an extra 'Unnamed: 0...' column every time the
# CSV is saved and read back (several such columns are already present).
reviews_features.to_csv('data/book_sample_counts.csv.gz', compression='gzip', index=False)