# 102034038 湯忠憲 NLP Lab2

In [1]:
#improt API
from NetSpeakAPI import NetSpeak
from LinggleAPI import Linggle

In [2]:
#Lab1 correction functions
import re, collections

def words(text): return re.findall('[a-z]+', text.lower())

def words_and_bigrams(text):
    all_words = words(text)
    bigrams = [ ' '.join(all_words[i:i+2]) for i in range(len(all_words)-1) ]
    return all_words + bigrams

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words_and_bigrams(open('big.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts    = [a + c + b     for a, b in splits for c in alphabet+' ' ]
    return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

# list all possible fusion error candidate
def fusions_edit(word):
    word        = word.replace(' ', '')
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    fusions    = [(a+' '+b) for a, b in splits if a in NWORDS and b in NWORDS]
    if word in NWORDS:
        fusions += [word]
    return fusions

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    fusions = fusions_edit(word)
    # correct with fusion error first
    if fusions:
        return sorted(fusions, key=NWORDS.get)[:10]
    # correct with edit candidate if no fusion candidate found
    return sorted(candidates, key=NWORDS.get)[:10]

#function to generate candidates for correcting
def generate_candidates(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word] or fusions_edit(word)
    return candidates


In [3]:
generate_candidates("weanter")

{'banter',
 'canter',
 'center',
 'chanter',
 'decanter',
 'easter',
 'enter',
 'heater',
 'planter',
 'renter',
 'shanter',
 'sweater',
 'venter',
 'waiter',
 'walter',
 'wander',
 'wanted',
 'water',
 'we enter',
 'weaker',
 'weaned',
 'wearer',
 'weather',
 'weaver',
 'webster',
 'welter',
 'werner',
 'wetter',
 'winter'}

### Load testing data here!!

In [4]:
count = 0
error = []
correct = []
for i in open("lab2.test.1.txt").readlines():
    count += 1
    error.append(i.split("\t")[0])
    correct.append(i.split("\t")[1])
print "number of lines to correct:",count

number of lines to correct: 198


### Load confusables text

In [5]:
confusable_dict = {}
for line in open("lab2.confusables.txt").readlines():
    line_array = line.split("\t")
    confusable_dict[line_array[0]] = line_array[1].strip()
confusable_dict

{'accede': 'exceed',
 'accept': 'except',
 'adept': 'adopt',
 'adverse': 'averse',
 'advice': 'advise',
 'affect': 'effect',
 'aid': 'aide',
 'airs': 'heirs',
 'allude': 'elude',
 'allusion': 'illusion',
 'almost': 'most',
 'aloud': 'allowed',
 'alternately': 'alternatively',
 'ambiguous': 'ambivalent',
 'amiable': 'amicable',
 'among': 'between',
 'amoral': 'immoral',
 'amount': 'number',
 'amused': 'bemused',
 'an': 'and',
 'annual': 'annul',
 'anywheres': 'nowheres',
 'apart': 'a part',
 'appraise': 'apprise',
 'arcane': 'archaic',
 'as': 'like',
 'ascent': 'assent',
 'ascetic': 'aesthetic',
 'ascribe': 'describe',
 'aspersion': 'dispersion',
 'assent': 'ascent',
 'assistance': 'assistants',
 'auger': 'augur',
 'bad': 'badly',
 'baited': 'bated',
 'bare': 'bear',
 'bazaar': 'bizarre',
 'belief': 'believe',
 'beside': 'besides',
 'better': 'had better',
 'between': 'among',
 'biannual': 'biennial',
 'bimonthly': 'semimonthly',
 'blithe': 'lithe',
 'blonde': 'blond',
 'boar': 'boor',


In [6]:
confusable_dict["amused"]

'bemused'

### normalize text

In [7]:
clean_error = [string.strip().lower().replace(".","") for string in error]
clean_error

['i felt very strang',
 'at brake time',
 'when the brack was finished',
 'in the weanter when it was snowing',
 'i thought it was a gost',
 'everything expect the houses',
 'when i first steped',
 'and saw streagh colow people',
 'and saw streagh colow people',
 'i was on an exclation',
 'i noicey that i was on this thing',
 'through the fance',
 'the hunters kille them',
 'they kill birds with their nerrow',
 'make a depe hole',
 'to tidy up his gardon',
 'the wind belu the leaves',
 'mr j was very angray',
 'garden full of leavs',
 'talk to the manger',
 'they throw a aero',
 'an ansion method of hunting',
 'after the dear',
 'the birds flu up',
 'making any noice',
 'bring it stright to the hunters',
 'this menes a man waits',
 'a man waight for the animal',
 'an arow in his hand',
 'they dick a hole',
 'a pice of bait',
 'this tarpp looks like a cave',
 'they make nose',
 'the animals are skeard',
 'the ribbetes will run',
 'they throw stons',
 'they chaching the animals',
 'three

### My Correction function !

In [8]:
def ngram_correction(test, ngram = 3): #input a list of word and n-gram
    for i in range(len(test) - ngram+1):
        candidate_count = []
        for position, word in enumerate(test[i:i + ngram]):
            candidates = list(generate_candidates(word))
            if word in confusable_dict:
                candidates.append(confusable_dict[word])            
            for candidate in candidates:
                #print (position,candidate)
                input_string = test[i:i + ngram]
                input_string[position] = candidate
                #print "search string:",' '.join(input_string)
                res = SE.search(' '.join(input_string))
                if res:
                    #print res
                    candidate_count.append(res[0])
                else:
                    print('not found')
        try:
            
            max_count_bigram_output = sorted(candidate_count,key=itemgetter(1),reverse= True)[0][0]
            print max_count_bigram_output
            if i ==0:
                #print "cccccandate:", max_count_bigram_output.split()
                correct_sentence.append(max_count_bigram_output)
            else:
                #print "cccccandate:", max_count_bigram_output.split()[1]
                correct_sentence.append(max_count_bigram_output.split()[ngram-1])
        # If Linggle can not find at least one new word combination, just concatenate the original words
        except:
            if i ==0:
                correct_sentence.append(" ".join(test[i:i + ngram]))
            else:
                correct_sentence.append(test[i+ngram-1])

In [9]:
from operator import itemgetter
#SE = NetSpeak()
SE = Linggle()
final_output = []
count = 0
for sentence in error:
    test = sentence.lower().split()
    correct_sentence  = []
    
    # if error string has more than two words, use trigram
    if len(test)>2:     
        ngram_correction(test, ngram = 3)
        
    # use bigram if only two words
    else:
        ngram_correction(test, ngram = 2)
    print " ".join(correct_sentence)
    final_output.append(" ".join(correct_sentence))

i felt very
not found
not found
not found
i felt very strang
not found
not found
not found
at break time
at break time
not found
not found
not found
not found
when the black
not found
not found
not found
not found
the track was
not found
not found
not found
not found
not found
not found
not found
not found
not found
track was finished
when the black was finished
not found
not found
not found
not found
not found
in the center
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
the water when
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
water when it
when it was
it was snowing
in the center when it was snowing
i thought it
thought it was
it was a
not found
not found
not found
not found
not found
not found
not found
not found
was 

Basically, I compute overlapping 3-gram count of candidates to select the best correction. However, some of the sentenses have only two word. Therefore, I use bigram instead of 3-gram and compute count of candidates. If **Linggle** can not find at least one new words combination, I just concatenate the origin sub-string to the correction sentence. In terms of candidates, I extend the big.txt and include confusable words as well.

### correction output

In [10]:
final_output

[u'i felt very strang',
 u'at break time',
 u'when the black was finished',
 u'in the center when it was snowing',
 u'i thought it was a most',
 u'everything expect the houses',
 u'when i first stepped',
 u'and saw streaks colow people',
 u'and saw streaks colow people',
 u'i was on an exception',
 u'i noticed that i was on this thing',
 u'through the fence',
 u'the hunters killed them',
 u'they kill birds with their narrow',
 u'make a deep hole',
 u'to tidy up his garden',
 u'the wind belt the leaves',
 u'mr j. was very angry',
 u'garden full of leaves',
 u'talk to the manger',
 u'they throw a hero',
 u'an ansion method of hunting',
 u'after the dear',
 'the birds flu up',
 u'making any noise',
 u'bring it straight to the hunters',
 u'this menes a man waits',
 u'a man waits for the animal',
 u'an arrow in his hand',
 u'they dig a hole',
 u'a piece of bait',
 u'this tarpp looks like a cave',
 'they make nose',
 u'the animals are heard',
 u'the diabetes will run',
 u'they throw stones',

### normalize groundtruth text for comparison

In [11]:
cleaned_correct = []
for string in correct:
    cleaned_correct.append(string.strip().lower().replace(".",""))
cleaned_correct

['i felt very strange',
 'at break time',
 'when the break was finished',
 'in the winter when it was snowing',
 'i thought it was a ghost',
 'everything except the houses',
 'when i first stepped',
 'and saw strange colow people',
 'and saw streagh coloured people',
 'i was on an escalator',
 'i noticed that i was on this thing',
 'through the fence',
 'the hunters kill them',
 'they kill birds with their arrow',
 'make a deep hole',
 'to tidy up his garden',
 'the wind blew the leaves',
 'mr j was very angry',
 'garden full of leaves',
 'talk to the manager',
 'they throw a arrow',
 'an ancient method of hunting',
 'after the deer',
 'the birds flew up',
 'making any noise',
 'bring it straight to the hunters',
 'this means a man waits',
 'a man waits for the animal',
 'an arrow in his hand',
 'they dig a hole',
 'a piece of bait',
 'this trap looks like a cave',
 'they make noise',
 'the animals are scared',
 'the rabbits will run',
 'they throw stones',
 'they chasing the animals',

### calculate hits 

In [12]:
same_count = 0
hits = 0
corrections = 0
final_output_wordList = []
for string_output, truth,error in zip(final_output,cleaned_correct,clean_error):
    for word_output, word_truth,word_error in zip(string_output.split(),truth.split(),error.split()):
        if word_output == word_truth and word_output != word_error:
            hits +=1 
            corrections += 1
        elif word_output != word_error:
            corrections += 1

In [13]:
print "hit:",hits
print "correcitons:",corrections

hit: 100
correcitons: 141


In [15]:
precision = float(hits)/corrections
recall = float(hits)/183
falsealarm = float(corrections-hits)/corrections
print "Precision:", precision
print "Recall:", recall#we have total 183 errors
print "FalseAlarm:",falsealarm

 Precision: 0.709219858156
Recall: 0.546448087432
FalseAlarm: 0.290780141844


In [16]:
print "F1-score:" ,2 *precision*recall / (precision+recall)

F1-score: 0.617283950617


## Challenges

1. In the begining, it tooks lots of time to correct the sentences since Netspeak was slow. After Teacher released Linggle API, I felt comfortable correcting the sentences!
2. Some words were not in the big.text. So even if Linggle is powerful, it couldn't be fed by some other potential candidates. Therefore, I added text from brown corpus. The performance was a little bit improved.
3. Considering confusable words is useful.