In [1]:
import numpy as np
import io
from collections import defaultdict
import pandas as pd

In [2]:
# Seed for the fold assignment. The original drew from the unseeded global
# NumPy generator, so the "split" column changed on every run.
SPLIT_SEED = 0


def _load_reviews(path, label, vocab, rng, n_splits=10):
    """Read one review per line from ``path`` and return a list of records.

    Every line is stripped and lower-cased. ``vocab`` is updated in place:
    each *distinct* whitespace-separated token of a line adds 1 to its entry,
    so the counts are "number of lines containing the token", not raw
    occurrence counts.

    Parameters
    ----------
    path : str
        File to read (decoded as latin-1).
    label : int
        Value stored under the ``"y"`` key of every record.
    vocab : dict-like
        Token -> count mapping, mutated in place.
    rng : numpy.random.RandomState
        Generator used to draw the fold id.
    n_splits : int, optional
        Fold ids are drawn uniformly from ``0 .. n_splits - 1``.

    Returns
    -------
    list of dict
        One dict per line with keys ``"y"``, ``"text"``, ``"num_words"``
        and ``"split"``.
    """
    records = []
    with io.open(path, "r", encoding="latin-1") as f:
        for line in f:
            text = line.strip().lower()
            tokens = text.split()
            for word in set(tokens):
                vocab[word] += 1
            records.append({"y": label,
                            "text": text,
                            "num_words": len(tokens),
                            "split": rng.randint(0, n_splits)})
    return records


revs = []
vocab = defaultdict(float)
# A local generator keeps the global np.random state untouched.
_split_rng = np.random.RandomState(SPLIT_SEED)
# Positive reviews first, then negative ones, same order as before.
revs.extend(_load_reviews("data/rt-polarity.pos", 1, vocab, _split_rng))
revs.extend(_load_reviews("data/rt-polarity.neg", 0, vocab, _split_rng))

In [3]:
# Peek at the first ten parsed records.
revs[:10]

[{'y': 1,
  'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'num_words': 34,
  'split': 9},
 {'y': 1,
  'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
  'num_words': 39,
  'split': 8},
 {'y': 1,
  'text': 'effective but too-tepid biopic',
  'num_words': 4,
  'split': 7},
 {'y': 1,
  'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .',
  'num_words': 21,
  'split': 1},
 {'y': 1,
  'text': "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",
  'num_words': 21,
  'split': 3},
 {'y': 1,
  'text': 'the film provides some great insight into the neuroti

In [4]:
# One row per review record; the dict keys become the columns.
original_data = pd.DataFrame(data=revs)

In [5]:
# Snapshot of the untouched records, written before any feature or cleaning step.
original_data.to_csv(path_or_buf="data/raw_data.csv")

In [6]:
# First five rows of the raw frame.
original_data.head(n=5)

Unnamed: 0,num_words,split,text,y
0,34,9,the rock is destined to be the 21st century's ...,1
1,39,8,"the gorgeously elaborate continuation of "" the...",1
2,4,7,effective but too-tepid biopic,1
3,21,1,if you sometimes like to go to the movies to h...,1
4,21,3,"emerges as something rare , an issue movie tha...",1


In [7]:
# Length of each review in characters (spaces and punctuation included).
text_lengths = original_data['text'].str.len()
original_data['char_count'] = text_lengths
original_data.loc[:, ['text', 'char_count']].head()

Unnamed: 0,text,char_count
0,the rock is destined to be the 21st century's ...,177
1,"the gorgeously elaborate continuation of "" the...",226
2,effective but too-tepid biopic,30
3,if you sometimes like to go to the movies to h...,89
4,"emerges as something rare , an issue movie tha...",111


In [8]:
# Number of stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is kept as returned by NLTK; the set is only a fast lookup so each
# token is checked in O(1) instead of scanning the whole stop-word sequence.
stop_lookup = set(stop)

# Count the whitespace-separated tokens of each review that are stop words.
original_data['stopwords'] = original_data['text'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_lookup))
original_data[['text','stopwords']].head()


Unnamed: 0,text,stopwords
0,the rock is destined to be the 21st century's ...,11
1,"the gorgeously elaborate continuation of "" the...",11
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,10
4,"emerges as something rare , an issue movie tha...",7


In [9]:
# Number of numerics
def _count_digit_tokens(text):
    """Return how many whitespace-separated tokens of ``text`` are all digits."""
    return sum(1 for token in text.split() if token.isdigit())

original_data['numerics'] = original_data['text'].apply(_count_digit_tokens)
original_data[['text','numerics']].head()


Unnamed: 0,text,numerics
0,the rock is destined to be the 21st century's ...,0
1,"the gorgeously elaborate continuation of "" the...",0
2,effective but too-tepid biopic,0
3,if you sometimes like to go to the movies to h...,0
4,"emerges as something rare , an issue movie tha...",0


## Basic Pre-processing

### Lower case

In [10]:
# Lower-case every token. Splitting and re-joining also collapses any run of
# whitespace into a single space.
def _lowercase_tokens(text):
    """Return ``text`` with each whitespace-separated token lower-cased."""
    lowered = [token.lower() for token in text.split()]
    return " ".join(lowered)

original_data['text'] = original_data['text'].apply(_lowercase_tokens)
original_data['text'].head()

0    the rock is destined to be the 21st century's ...
1    the gorgeously elaborate continuation of " the...
2                       effective but too-tepid biopic
3    if you sometimes like to go to the movies to h...
4    emerges as something rare , an issue movie tha...
Name: text, dtype: object

### Removing Punctuation

In [11]:
# Delete every character that is neither a word character nor whitespace.
# - The pattern is a raw string so `\w` / `\s` reach the regex engine as
#   written instead of being treated as (invalid) string escapes.
# - regex=True is explicit: pandas >= 2.0 defaults Series.str.replace to
#   literal matching, under which this pattern would remove nothing.
original_data['text'] = original_data['text'].str.replace(r'[^\w\s]', '', regex=True)
original_data['text'].head()

0    the rock is destined to be the 21st centurys n...
1    the gorgeously elaborate continuation of  the ...
2                        effective but tootepid biopic
3    if you sometimes like to go to the movies to h...
4    emerges as something rare  an issue movie that...
Name: text, dtype: object

### Removal of Stop words

In [12]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy for O(1) membership tests inside the per-token filter; `stop`
# itself is left exactly as NLTK returns it.
stop_lookup = set(stop)

# Keep only tokens that are not stop words and re-join them with single spaces.
# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# now look like "thats" / "hes" and survive this filter (see the output below).
original_data['text'] = original_data['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_lookup))
original_data['text'].head()

0    rock destined 21st centurys new conan hes goin...
1    gorgeously elaborate continuation lord rings t...
2                            effective tootepid biopic
3    sometimes like go movies fun wasabi good place...
4    emerges something rare issue movie thats hones...
Name: text, dtype: object

### Common word removal

In [13]:
# Ten most frequent tokens in the current `text` column.
corpus_tokens = ' '.join(original_data['text']).split()
token_counts = pd.Series(corpus_tokens).value_counts()
freq = token_counts.iloc[:10]
freq

film      1452
movie     1271
one        727
like       721
story      477
much       386
even       382
good       377
comedy     359
time       341
dtype: int64

In [14]:
# Hand-picked tokens judged uninformative for this corpus.
not_useful_word = ['.',',','film','movie','one']

def _drop_not_useful(text):
    """Return ``text`` without the tokens listed in ``not_useful_word``."""
    kept = [token for token in text.split() if token not in not_useful_word]
    return " ".join(kept)

original_data['text'] = original_data['text'].apply(_drop_not_useful)
original_data['text'].head()

0    rock destined 21st centurys new conan hes goin...
1    gorgeously elaborate continuation lord rings t...
2                            effective tootepid biopic
3    sometimes like go movies fun wasabi good place...
4    emerges something rare issue thats honest keen...
Name: text, dtype: object

### Rare words removal

In [15]:
# The fifty entries at the tail of the frequency table, i.e. the rarest tokens.
corpus_tokens = ' '.join(original_data['text']).split()
token_counts = pd.Series(corpus_tokens).value_counts()
freq = token_counts.iloc[-50:]
freq

inhospitability         1
icons                   1
affirmation             1
orquídeas               1
lamer                   1
jolted                  1
worldlywise             1
hymn                    1
stringpulling           1
drivers                 1
inbetween               1
playoff                 1
malkovichs              1
hoofing                 1
chais                   1
bebop                   1
breast                  1
thrillernoir            1
benefitted              1
chanting                1
highpowered             1
predictableadventure    1
1954                    1
meatballs               1
morgen                  1
midlevel                1
novice                  1
jiang                   1
eavesdropping           1
antidarwinian           1
lowdown                 1
gadzooks                1
juggling                1
triedandtrue            1
continuação             1
depravity               1
scalds                  1
melts                   1
milking     

In [16]:
# NOTE: `freq` is rebound from a Series to a plain list here, so this cell
# cannot be re-run on its own; re-run the previous cell first to rebuild
# the Series.
freq = list(freq.index)
# Set copy so each token is checked in O(1) rather than scanning the list.
rare_lookup = set(freq)

# Drop the rare tokens from every review and re-join with single spaces.
original_data['text'] = original_data['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in rare_lookup))
original_data.head()

Unnamed: 0,num_words,split,text,y,char_count,stopwords,numerics
0,34,9,rock destined 21st centurys new conan hes goin...,1,177,11,0
1,39,8,gorgeously elaborate continuation lord rings t...,1,226,11,0
2,4,7,effective tootepid biopic,1,30,1,0
3,21,1,sometimes like go movies fun wasabi good place...,1,89,10,0
4,21,3,emerges something rare issue thats honest keen...,1,111,7,0


## Lemmatization

In [17]:
from textblob import Word

def _lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` with TextBlob's
    ``Word.lemmatize`` and re-join the results with single spaces."""
    lemmas = []
    for token in text.split():
        lemmas.append(Word(token).lemmatize())
    return " ".join(lemmas)

original_data['text'] = original_data['text'].apply(_lemmatize_text)
original_data['text'].head()

0    rock destined 21st century new conan he going ...
1    gorgeously elaborate continuation lord ring tr...
2                            effective tootepid biopic
3    sometimes like go movie fun wasabi good place ...
4    emerges something rare issue thats honest keen...
Name: text, dtype: object

In [18]:
# Persist only the cleaned text, the label and the fold id.
processed_columns = ['text', 'y', 'split']
original_data[processed_columns].to_csv("data/processed_data.csv")