In [1]:
import re
import pandas as pd
import nltk
from itertools import islice
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/doitclap/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load into environment as Pandas DF

In [2]:
reviews_as_table = pd.read_csv('IMDB Dataset.csv')
reviews_as_table.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Sampling

In [3]:
reviews_as_table = reviews_as_table.iloc[:1000,:]
reviews_as_table.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [4]:
def foo(x):
    x = re.sub(r'[^\x00-\x7f]', r'', x) # remove unwanted ascii
    x = x.lower() # set to lower
    x = "<s> " + x # add start token at start

    x = " ".join(x.split()) # remove consecutive spaces

    END_TOKENS = [".", "!", "?"]
    for char in END_TOKENS:
        x = x.replace(char + " ", char + " <s>") # add start token at the end of every end token
    x = x.replace("<br /><br />", "<br /><br /><s>") # add start token at the end of double line breaks
    x = x.replace("<s><br /><br /><s>", "<br /><br /><s>") # remove start tokens at the start of double line breaks
    x = x.replace("<s> <br /><br /><s>", "<br /><br /><s>") # remove unlikely case just for safety
    x = x.replace("<br />", " LINE_BREAK ").replace("<s>", " START_TOKEN ")

    return x

reviews_as_table["cleaned"] = reviews_as_table["review"].apply(foo)

reviews_as_table["cleaned"][1]

' START_TOKEN  a wonderful little production.  LINE_BREAK  LINE_BREAK  START_TOKEN the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.  LINE_BREAK  LINE_BREAK  START_TOKEN the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too!  START_TOKEN you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece.  START_TOKEN a masterful production about one of the great master\'s of comedy and his life.  LINE_BREAK  LINE_BREAK  START_TOKEN the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears.  START_TOKEN it plays on our knowledge and our senses, particularly with the scenes concerning ort

### Tokenization

In [5]:
reviews_as_table["tokenized"] = reviews_as_table["cleaned"].apply(lambda x: word_tokenize(x))
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

### obtaining `max_length`

In [6]:
counts = reviews_as_table["tokenized"].apply(lambda x: len(x))
max_length = counts.max()
max_length

1522

In [7]:
def foo(x):
    for _ in range(max_length - len(x)):
        x.append("NULL_TOKEN")
    return x

reviews_as_table["tokenized"] = reviews_as_table["tokenized"].apply(foo)
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

In [8]:
len(reviews_as_table["tokenized"][0])

1522

In [9]:
# curious to see what the longest review is, and if the start tokens were implemented properly

for index, row in reviews_as_table.iterrows():
    if 'NULL_TOKEN' not in row['tokenized']:
        longest = row['tokenized']
longest

['START_TOKEN',
 'i',
 'thought',
 'that',
 'rotj',
 'was',
 'clearly',
 'the',
 'best',
 'out',
 'of',
 'the',
 'three',
 'star',
 'wars',
 'movies',
 '.',
 'START_TOKEN',
 'i',
 'find',
 'it',
 'surprising',
 'that',
 'rotj',
 'is',
 'considered',
 'the',
 'weakest',
 'installment',
 'in',
 'the',
 'trilogy',
 'by',
 'many',
 'who',
 'have',
 'voted',
 '.',
 'START_TOKEN',
 'to',
 'me',
 'it',
 'seemed',
 'like',
 'rotj',
 'was',
 'the',
 'best',
 'because',
 'it',
 'had',
 'the',
 'most',
 'profound',
 'plot',
 ',',
 'the',
 'most',
 'suspense',
 ',',
 'surprises',
 ',',
 'most',
 'emotional',
 ',',
 '(',
 'especially',
 'the',
 'ending',
 ')',
 'and',
 'definitely',
 'the',
 'most',
 'episodic',
 'movie',
 '.',
 'START_TOKEN',
 'i',
 'personally',
 'like',
 'the',
 'empire',
 'strikes',
 'back',
 'a',
 'lot',
 'also',
 'but',
 'i',
 'think',
 'it',
 'is',
 'slightly',
 'less',
 'good',
 'than',
 'than',
 'rotj',
 'since',
 'it',
 'was',
 'slower-moving',
 ',',
 'was',
 'not',
 'as'

In [10]:
len(longest)

1522

In [11]:
longest[1000:]

['annakin',
 'skywalker',
 '.',
 'START_TOKEN',
 'it',
 'would',
 'have',
 'been',
 'disappointing',
 'if',
 'the',
 'movie',
 'had',
 'ended',
 'without',
 'luke',
 'getting',
 'to',
 'see',
 'his',
 'father',
 "'s",
 'face',
 'because',
 'it',
 'made',
 'it',
 'complete',
 '.',
 'START_TOKEN',
 'by',
 'annakin',
 "'s",
 'revelation',
 'it',
 'symbolized',
 'the',
 'transition',
 'darth',
 'vader',
 'underwent',
 'from',
 'being',
 'possessed',
 'by',
 'the',
 'dark',
 'side',
 '(',
 'in',
 'his',
 'helmet',
 ')',
 'and',
 'to',
 'the',
 'good',
 'person',
 'he',
 'was',
 'annakin',
 'skywalker',
 '(',
 'by',
 'removing',
 'the',
 'helmet',
 ')',
 '.',
 'START_TOKEN',
 'the',
 'point',
 'is',
 'that',
 'annakin',
 'died',
 'converted',
 'to',
 'the',
 'light',
 'side',
 'again',
 'and',
 'that',
 'is',
 'what',
 'the',
 'meaning',
 'of',
 'the',
 'helmet',
 'removal',
 'scene',
 'was',
 'about',
 '.',
 'START_TOKEN',
 'in',
 'fact',
 ',',
 'that',
 "'s",
 'is',
 'what',
 'i',
 'would'

### Vocab

In [12]:
vocab = set(reviews_as_table['tokenized'].explode().tolist())
for i, val in enumerate(islice(vocab, 10)):
    print(val)
    # do not print vocab in its entirety as it blows up the file size

bullying
premier
mystery
assure
conceivable
unflinching
turned
particular
looking
interpreters


In [13]:
idx_to_tkn = dict()
tkn_to_idx = dict()

for i, val in enumerate(vocab):
    idx_to_tkn[i] = val
    tkn_to_idx[val] = i

print(list(islice(idx_to_tkn.items(), 10)))
print(list(islice(tkn_to_idx.items(), 10)))

[(0, 'bullying'), (1, 'premier'), (2, 'mystery'), (3, 'assure'), (4, 'conceivable'), (5, 'unflinching'), (6, 'turned'), (7, 'particular'), (8, 'looking'), (9, 'interpreters')]
[('bullying', 0), ('premier', 1), ('mystery', 2), ('assure', 3), ('conceivable', 4), ('unflinching', 5), ('turned', 6), ('particular', 7), ('looking', 8), ('interpreters', 9)]


In [14]:
def foo():
    l = []

    def closure(x):
        nonlocal l
        if x['sentiment'] == 'positive':
            t = [1,0]
        else:
            t = [0,1]
        l2 = [tkn_to_idx[tkn] for tkn in x['tokenized']]
        l.append(l2 + t)
    return closure, l

bar, table = foo()
reviews_as_table.apply(bar, axis=1)

df = pd.DataFrame(table, columns=['x' + str(i) for i in range(max_length)] + ['y0', 'y1'])
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,16873,4461,1358,16459,17069,15270,17018,17850,2645,11639,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
1,16873,6836,13769,14426,10141,17968,10774,10774,16873,16459,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
2,16873,4575,16785,14295,1042,6836,13769,1191,6913,8524,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
3,16873,5934,11631,19305,6836,2485,7785,6836,14426,6151,...,6951,6951,6951,6951,6951,6951,6951,6951,0,1
4,16873,17219,10203,19305,6372,66,4279,16459,19112,1358,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0


In [15]:
df.to_csv("dataset.csv")