In [1]:
import nltk
import re
from collections import Counter
import pandas as pd
#nltk.download("movie_reviews")
from nltk.corpus import movie_reviews as mr
from nltk.metrics.distance import edit_distance
import numpy as np

In [2]:
print(len(mr.words()), len(mr.sents()), len(mr.paras()))

1583820 65258 2000


In [3]:
mr.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [4]:
# put the sentences in a dataframe
list_of_sentences = []
for list_of_words in mr.sents():
    sentence = ' '.join(list_of_words)
    list_of_sentences.append({'sentence':re.sub('\W+', ' ', sentence).strip(), 'sentence_no_spaces':re.sub('\W+', '', sentence)})

df = pd.DataFrame(list_of_sentences)

In [5]:
df

Unnamed: 0,sentence,sentence_no_spaces
0,plot two teen couples go to a church party dri...,plottwoteencouplesgotoachurchpartydrinkandthen...
1,they get into an accident,theygetintoanaccident
2,one of the guys dies but his girlfriend contin...,oneoftheguysdiesbuthisgirlfriendcontinuestosee...
3,what s the deal,whatsthedeal
4,watch the movie and sorta find out,watchthemovieandsortafindout
...,...,...
65253,it s a quick straight shot to the movie s end,itsaquickstraightshottothemoviesend
65254,in terms of overall quality i would compare th...,intermsofoverallqualityiwouldcomparethetrumans...
65255,both films are well made with interesting stor...,bothfilmsarewellmadewithinterestingstoriesseti...
65256,but neither film really felt like it capitaliz...,butneitherfilmreallyfeltlikeitcapitalizedonall...


In [6]:
%%time
# build a dictionay

count_words = Counter()

# count everything
for sentence in df.sentence:
    for word in sentence.split(' '):
        count_words[word] += 1
                               

CPU times: total: 406 ms
Wall time: 409 ms


In [7]:
len(count_words)

39697

In [8]:
count_words.most_common(20)

[('the', 76529),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('it', 16107),
 ('that', 15924),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961),
 ('his', 9587),
 ('this', 9578),
 ('film', 9517),
 ('i', 8889),
 ('he', 8864),
 ('but', 8634),
 ('on', 7385)]

In [9]:
%%time
# make a list of frequent enough words
bag_of_words = set()
for item in count_words.elements():
    if count_words[item] >= 5:
        bag_of_words.add(item)

CPU times: total: 203 ms
Wall time: 201 ms


In [10]:
len(bag_of_words)

14759

# Implementing the maximum matching algorithm

In [11]:
#go over sentences without spaces - look for the longest prefix
MAX_WORD_LEN = 15

def sentence_segmentation(sentence):
    new_sentence = ''
    from_index = 0
    prefix_len = MAX_WORD_LEN
    while from_index < len(sentence):
        to_index = min(from_index + prefix_len, len(sentence))
        if sentence[from_index:to_index] in bag_of_words:
            new_sentence += sentence[from_index:to_index] + ' '
            from_index = to_index
            prefix_len = MAX_WORD_LEN
        else:
            prefix_len -= 1
            if prefix_len == 0:
                prefix_len = MAX_WORD_LEN
                from_index += 1
        
    return new_sentence.strip()



In [12]:
print(df.sentence[10])
print(df.sentence_no_spaces[10])
sentence_segmentation(df.sentence_no_spaces[10])

it starts off normal but then downshifts into this fantasy world in which you as an audience member have no idea what s going on
itstartsoffnormalbutthendownshiftsintothisfantasyworldinwhichyouasanaudiencememberhavenoideawhatsgoingon


'its t arts off normal butt hen downs hi f t sin to this fantasy world in which you as an audience member haven o idea what s going on'

In [13]:
%%time
# add reconstructed sentences to the df
new_sentences = []
for sentence in df.sentence_no_spaces:
    new_sentences.append(sentence_segmentation(sentence))
   
    

CPU times: total: 6.83 s
Wall time: 6.84 s


In [14]:
df['sentenced_restored'] = new_sentences

In [15]:
df.sample(20)

Unnamed: 0,sentence,sentence_no_spaces,sentenced_restored
17383,a huge success,ahugesuccess,ah u g e success
50351,the character development in this movie was no...,thecharacterdevelopmentinthismoviewasnotalltha...,the character development in this movie wasn o...
15945,the aliens are tall pasty faced and bald and w...,thealiensaretallpastyfacedandbaldandwearlongbl...,the aliens are tall past y faced and bald and ...
59064,in december of 1979 a comedian named andy kauf...,indecemberof1979acomediannamedandykaufmanputon...,in december of 1979 a comedian named andy kauf...
34246,an acerbic sing song metered voice that evokes...,anacerbicsingsongmeteredvoicethatevokesthestar...,an acerbic sings on g meter ed voice that evok...
885,with 102 dalmatians the disney studios have pr...,with102dalmatiansthedisneystudioshaveproventha...,with 102 dalmatians the disney studios have pr...
60112,,,
64709,strangely enough i think that fans of the seri...,strangelyenoughithinkthatfansoftheserieswillha...,strangely enough it hi n k that fans of these ...
30545,ebert s re election slogan is not surprisingly...,ebertsreelectionsloganisnotsurprisinglythumbsu...,ebert s reel e c t i on slo g an isn o t surpr...
14977,super mario bros street fighter and mortal kom...,supermariobrosstreetfighterandmortalkombatwere...,super mario bros street fighter and mortal kom...


In [16]:
# how well did we perform?

In [17]:
df.iloc[26627]

sentence              was i grossed out
sentence_no_spaces       wasigrossedout
sentenced_restored    was i grossed out
Name: 26627, dtype: object

In [18]:
# how many correct resconstructions?

In [19]:
sum(df['sentenced_restored'] == df['sentence'])

11427

In [20]:
len(df)

65258

In [21]:
sum(df['sentenced_restored'] == df['sentence']) / len(df)

0.1751049679732753

In [22]:
# assign a normalized edit distance

In [23]:
df.iloc[26622]


sentence              sooner or later people start to tell themselve...
sentence_no_spaces    soonerorlaterpeoplestarttotellthemselvesthatit...
sentenced_restored    sooner or later peoples t art to tell themselv...
Name: 26622, dtype: object

In [24]:
edit_distance(df.iloc[26622].sentence, df.iloc[26622].sentenced_restored)

7

In [25]:
%%time
df['edit_distance'] = df.apply(lambda row: edit_distance(row.sentence.split(' '), row.sentenced_restored.split(' ')), axis=1)

CPU times: total: 40.5 s
Wall time: 40.5 s


In [26]:
%%time
df['normalized_edit_distance'] = df.apply(lambda row: row.edit_distance / len(row.sentence.split(' ')), axis=1)

CPU times: total: 1 s
Wall time: 1 s


In [27]:
df

Unnamed: 0,sentence,sentence_no_spaces,sentenced_restored,edit_distance,normalized_edit_distance
0,plot two teen couples go to a church party dri...,plottwoteencouplesgotoachurchpartydrinkandthen...,plot two teen couples got o a church party dri...,2,0.153846
1,they get into an accident,theygetintoanaccident,they get into an accident,0,0.000000
2,one of the guys dies but his girlfriend contin...,oneoftheguysdiesbuthisgirlfriendcontinuestosee...,one of the guys dies but his girlfriend contin...,8,0.444444
3,what s the deal,whatsthedeal,what st he deal,2,0.500000
4,watch the movie and sorta find out,watchthemovieandsortafindout,watch them o v ie and sorta find out,4,0.571429
...,...,...,...,...,...
65253,it s a quick straight shot to the movie s end,itsaquickstraightshottothemoviesend,its a quick straight shot to them o v ie send,7,0.636364
65254,in terms of overall quality i would compare th...,intermsofoverallqualityiwouldcomparethetrumans...,inter ms of overall quality i would compare th...,5,0.333333
65255,both films are well made with interesting stor...,bothfilmsarewellmadewithinterestingstoriesseti...,both films are well made within t er e sting s...,7,0.583333
65256,but neither film really felt like it capitaliz...,butneitherfilmreallyfeltlikeitcapitalizedonall...,but neither film really felt like it capitaliz...,4,0.190476


In [28]:
np.mean(df['normalized_edit_distance'])

0.3794860799599877