In [1]:
import nltk
import re
from collections import Counter
import pandas as pd
#nltk.download("movie_reviews")
from nltk.corpus import movie_reviews as mr
from nltk.metrics.distance import edit_distance
import numpy as np

In [2]:
print(len(mr.words()), len(mr.sents()), len(mr.paras()))

1583820 65258 2000


In [3]:
mr.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [4]:
# Put the sentences in a dataframe with two views of each sentence:
#   'sentence'           - runs of non-word characters collapsed to one space
#                          (the ground truth for the segmentation task)
#   'sentence_no_spaces' - the same text with those runs removed entirely
#                          (the input to the segmentation task)
# The pattern is a raw string: '\W' inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about.
non_word_run = re.compile(r'\W+')

list_of_sentences = []
for list_of_words in mr.sents():
    sentence = ' '.join(list_of_words)
    list_of_sentences.append({
        'sentence': non_word_run.sub(' ', sentence).strip(),
        'sentence_no_spaces': non_word_run.sub('', sentence),
    })

df = pd.DataFrame(list_of_sentences)

In [5]:
df

Unnamed: 0,sentence,sentence_no_spaces
0,plot two teen couples go to a church party dri...,plottwoteencouplesgotoachurchpartydrinkandthen...
1,they get into an accident,theygetintoanaccident
2,one of the guys dies but his girlfriend contin...,oneoftheguysdiesbuthisgirlfriendcontinuestosee...
3,what s the deal,whatsthedeal
4,watch the movie and sorta find out,watchthemovieandsortafindout
...,...,...
65253,it s a quick straight shot to the movie s end,itsaquickstraightshottothemoviesend
65254,in terms of overall quality i would compare th...,intermsofoverallqualityiwouldcomparethetrumans...
65255,both films are well made with interesting stor...,bothfilmsarewellmadewithinterestingstoriesseti...
65256,but neither film really felt like it capitaliz...,butneitherfilmreallyfeltlikeitcapitalizedonall...


In [6]:
%%time
# build a dictionary of word frequencies
MAX_WORD_LEN = 15  # longest candidate word the segmenter will try to match
count_segments = Counter()

# count every space-separated token
for sentence in df.sentence:
    # split() with no argument yields no tokens for an empty sentence, whereas
    # split(' ') would yield [''] and count the empty string as a "word".
    count_segments.update(sentence.split())
                               

CPU times: total: 531 ms
Wall time: 520 ms


In [7]:
len(count_segments)

39697

In [8]:
count_segments.most_common(20)

[('the', 76529),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('it', 16107),
 ('that', 15924),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961),
 ('his', 9587),
 ('this', 9578),
 ('film', 9517),
 ('i', 8889),
 ('he', 8864),
 ('but', 8634),
 ('on', 7385)]

In [9]:
%%time
# Build the set of frequent-enough words (the segmenter's vocabulary).
# Iterate over the distinct words via items(): Counter.elements() repeats every
# word once per occurrence, so looping over it visits each word `count` times.
MIN_WORD_COUNT = 5  # a word must occur at least this often to be kept
bag_of_words = {
    word
    for word, count in count_segments.items()
    # `if word` excludes the empty string, which is not a usable word
    if word and count >= MIN_WORD_COUNT
}

CPU times: total: 266 ms
Wall time: 260 ms


In [10]:
len(bag_of_words)

14759

In [11]:
def sentence_segmentation(sentence, vocabulary=None, max_word_len=None):
    """Re-insert spaces into a space-free sentence by greedy longest-prefix matching.

    Starting at the left, the longest prefix (at most ``max_word_len``
    characters) that is present in ``vocabulary`` is taken as the next word,
    and matching resumes right after it. The choice is greedy: an early long
    match is never revisited, even if it forces poor splits later on.

    A character at which no prefix matches is kept as a one-character token,
    so the output with its spaces removed always equals ``sentence``.

    Parameters
    ----------
    sentence : str
        Text without word separators.
    vocabulary : container of str, optional
        Known words; anything supporting ``in``. Defaults to the notebook-level
        ``bag_of_words``.
    max_word_len : int, optional
        Longest prefix to try. Defaults to the notebook-level ``MAX_WORD_LEN``.

    Returns
    -------
    str
        The recovered words joined by single spaces ('' for empty input).

    Raises
    ------
    ValueError
        If ``max_word_len`` is smaller than 1.
    """
    # Resolve the defaults at call time so the notebook globals may be
    # (re)defined after this cell has run.
    if vocabulary is None:
        vocabulary = bag_of_words
    if max_word_len is None:
        max_word_len = MAX_WORD_LEN
    if max_word_len < 1:
        raise ValueError("max_word_len must be at least 1")

    tokens = []
    start = 0
    total = len(sentence)
    while start < total:
        # Never look past the end of the sentence; this also avoids testing
        # the same (truncated) slice several times near the end.
        longest = min(max_word_len, total - start)
        for length in range(longest, 0, -1):
            candidate = sentence[start:start + length]
            if candidate in vocabulary:
                break
        else:
            # No prefix is a known word: keep the character instead of
            # dropping it from the output.
            candidate = sentence[start]
        tokens.append(candidate)
        start += len(candidate)

    return ' '.join(tokens)



In [12]:
# Spot-check one sentence: the spaced original, the space-free input, and
# (as the cell's displayed value) the segmenter's reconstruction of that input.
print(df.sentence[10])
print(df.sentence_no_spaces[10])
sentence_segmentation(df.sentence_no_spaces[10])

it starts off normal but then downshifts into this fantasy world in which you as an audience member have no idea what s going on
itstartsoffnormalbutthendownshiftsintothisfantasyworldinwhichyouasanaudiencememberhavenoideawhatsgoingon


'its t arts off normal butt hen downs hi f t sin to this fantasy world in which you as an audience member haven o idea what s going on'

In [13]:
%%time
# Run the segmenter over every space-free sentence, keeping row order.
new_sentences = [sentence_segmentation(joined) for joined in df.sentence_no_spaces]
   
    

CPU times: total: 11 s
Wall time: 11 s


In [14]:
df['sentenced_restored'] = new_sentences

In [15]:
# Fixed seed so the same rows are shown on every Restart & Run All.
df.sample(20, random_state=0)

Unnamed: 0,sentence,sentence_no_spaces,sentenced_restored
2718,liam neeson seems bored and embarrassed and se...,liamneesonseemsboredandembarrassedandseemsdesp...,liam neeson seems bored and embarrassed and se...
49667,it s a definite case of male bonding for the f...,itsadefinitecaseofmalebondingforthefivecrimina...,its ad e fi n it e case of male bonding forth ...
13042,in an effort to make this the biggest sociolog...,inanefforttomakethisthebiggestsociologicaleven...,inane f fort tom a k et his the biggest sociol...
38380,he returns the favour when he steps on stage o...,hereturnsthefavourwhenhestepsonstageonenightan...,here turns the favour when he steps on stage o...
5143,march may be attractive but at least judging b...,marchmaybeattractivebutatleastjudgingbythisper...,march maybe attractive but at least judging by...
64314,his building blocks are lots of small notes ti...,hisbuildingblocksarelotsofsmallnotestinygrains...,his building blocks are lots of small notes ti...
10608,it rips off a lot of good movies that are wort...,itripsoffalotofgoodmoviesthatareworthseeing,it rips off alot of good movies that are worth...
11183,nearly every scene between chris rock and morg...,nearlyeveryscenebetweenchrisrockandmorganfreem...,nearly every scene between chris rock and morg...
11189,freeman is effective as always though his perf...,freemaniseffectiveasalwaysthoughhisperformance...,freeman is effective as always though his perf...
65218,truman is really suspicious now,trumanisreallysuspiciousnow,truman is really suspicious now


In [16]:
# how well did we perform?

In [17]:
df.iloc[26627]

sentence              was i grossed out
sentence_no_spaces       wasigrossedout
sentenced_restored    was i grossed out
Name: 26627, dtype: object

In [18]:
# how many reconstructions are exactly correct?

In [19]:
# Number of rows whose reconstruction equals the original sentence exactly.
(df['sentenced_restored'] == df['sentence']).sum()

11427

In [20]:
len(df)

65258

In [21]:
# Fraction of rows reconstructed exactly (mean of the boolean match mask).
(df['sentenced_restored'] == df['sentence']).mean()

0.1751049679732753

In [22]:
# assign a normalized edit distance

In [23]:
df.iloc[26622]


sentence              sooner or later people start to tell themselve...
sentence_no_spaces    soonerorlaterpeoplestarttotellthemselvesthatit...
sentenced_restored    sooner or later peoples t art to tell themselv...
Name: 26622, dtype: object

In [24]:
# NOTE: both arguments here are plain strings, so this distance is measured over
# characters. The 'edit_distance' column computed later passes the
# space-split word lists instead, so those values count word-level edits.
edit_distance(df.iloc[26622].sentence, df.iloc[26622].sentenced_restored)

7

In [34]:
%%time
# Word-level edit distance between each original sentence and its reconstruction.
df['edit_distance'] = [
    edit_distance(original.split(' '), restored.split(' '))
    for original, restored in zip(df['sentence'], df['sentenced_restored'])
]

CPU times: total: 42.2 s
Wall time: 42.3 s


In [35]:
%%time
# Normalize by the number of space-separated tokens in the original sentence.
# split(' ') returns at least one element even for an empty string, so the
# divisor is never zero.
df['normalized_edit_distance'] = df['edit_distance'] / df['sentence'].str.split(' ').str.len()

CPU times: total: 1.02 s
Wall time: 1.01 s


In [36]:
df

Unnamed: 0,sentence,sentence_no_spaces,sentenced_restored,normalized_edit_distance,edit_distance
0,plot two teen couples go to a church party dri...,plottwoteencouplesgotoachurchpartydrinkandthen...,plot two teen couples got o a church party dri...,0.153846,2
1,they get into an accident,theygetintoanaccident,they get into an accident,0.000000,0
2,one of the guys dies but his girlfriend contin...,oneoftheguysdiesbuthisgirlfriendcontinuestosee...,one of the guys dies but his girlfriend contin...,0.444444,8
3,what s the deal,whatsthedeal,what st he deal,0.500000,2
4,watch the movie and sorta find out,watchthemovieandsortafindout,watch them o v ie and sorta find out,0.571429,4
...,...,...,...,...,...
65253,it s a quick straight shot to the movie s end,itsaquickstraightshottothemoviesend,its a quick straight shot to them o v ie send,0.636364,7
65254,in terms of overall quality i would compare th...,intermsofoverallqualityiwouldcomparethetrumans...,inter ms of overall quality i would compare th...,0.333333,5
65255,both films are well made with interesting stor...,bothfilmsarewellmadewithinterestingstoriesseti...,both films are well made within t er e sting s...,0.583333,7
65256,but neither film really felt like it capitaliz...,butneitherfilmreallyfeltlikeitcapitalizedonall...,but neither film really felt like it capitaliz...,0.190476,4


In [41]:
np.mean(df['normalized_edit_distance'])

0.3794860799599877