In [1]:
from collections import defaultdict

In [2]:
import difflib

In [3]:
import pandas as pd
from collections import Counter
import numpy as np
from io import open

In [4]:
from sklearn.cross_validation import train_test_split



In [5]:
class Pmf(Counter):
    """A Counter whose values are read as probabilities.

    Keys are the outcomes and values are their (possibly not yet normalized)
    weights.  Instances hash and compare by identity, so a Pmf can itself be
    used as a dictionary key or set member.
    """

    def normalize(self):
        """Rescales the values in place so that they add up to 1.

        An empty Pmf is left untouched.  A non-empty Pmf of plain Python
        numbers whose values add up to zero raises ZeroDivisionError.
        """
        total = float(sum(self.values()))
        for key in self:
            self[key] /= total

    def __add__(self, other):
        """Adds two distributions.

        The result is the distribution of sums of values from the
        two distributions.

        other: Pmf

        returns: new Pmf
        """
        pmf = Pmf()
        for key1, prob1 in self.items():
            for key2, prob2 in other.items():
                pmf[key1 + key2] += prob1 * prob2
        return pmf

    def __hash__(self):
        """Returns an integer hash value (the object's identity)."""
        return id(self)

    def __eq__(self, other):
        """Two Pmfs are equal only if they are the same object."""
        return self is other

    def render(self):
        """Returns values and their probabilities, suitable for plotting.

        The pairs are sorted by key and then transposed, giving one sequence
        of keys and one sequence of probabilities.
        """
        return zip(*sorted(self.items()))

    def sample(self):
        """Draws one key at random, weighted by its probability.

        The values are handed to numpy as the probability vector, so they
        must already add up to 1 (see normalize).

        returns: one of the keys, exactly as stored in the Pmf
        raises: ValueError if the Pmf is empty
        """
        if not self:
            raise ValueError('cannot sample from an empty Pmf')
        keys, vals = zip(*self.items())
        # Draw an index instead of a key: np.random.choice would otherwise
        # coerce the keys into an array, which fails for tuple keys and
        # returns numpy scalars rather than the stored key objects.
        return keys[np.random.choice(len(keys), p=vals)]

In [6]:
def is_shm(p1, p2):
    """Tells whether the second half starts with 'schm' or 'shm'.

    p1: first half of the reduplicative (not inspected)
    p2: second half of the reduplicative

    returns: True if p2 begins with either prefix, False otherwise
    """
    return p2.startswith(('schm', 'shm'))

In [7]:
def is_duplicate(p1, p2):
    """Tells whether the two halves of a reduplicative are the same.

    returns: the result of comparing p1 and p2 for equality
    """
    same = (p1 == p2)
    return same

In [8]:
def is_change_consonant(p1, p2):
    """Checks whether p2 is p1 with exactly one consonant replaced by another.

    The two strings are compared character by character with difflib.ndiff.
    The pair qualifies only when the diff consists of a single deletion and a
    single insertion and neither character is one of a/e/i/o/u.

    This is stricter than the previous implementation, which accepted any
    inserted character (including vowels) and ignored deleted vowels; every
    pair accepted here was also accepted before, with the same return value.

    p1: first half of the reduplicative
    p2: second half of the reduplicative

    returns: (True, deleted_consonant, added_consonant) on a match,
             (False, None, None) otherwise
    """
    vowel_set = set(['a', 'e', 'i', 'o', 'u'])
    deleted = []
    added = []
    for c in difflib.ndiff(p1, p2):
        # ndiff entries look like '<marker> <char>'.  Test the marker itself
        # so a literal '-' or '+' inside a word is not mistaken for an edit.
        if c.startswith('- '):
            deleted.append(c[2:])
        elif c.startswith('+ '):
            added.append(c[2:])

    if len(deleted) != 1 or len(added) != 1:
        return False, None, None
    if deleted[0] in vowel_set or added[0] in vowel_set:
        return False, None, None

    if deleted[0] == added[0]:
        # The same consonant was removed and re-inserted elsewhere; report the
        # pair for inspection.  Written so the output is identical under
        # Python 2 and Python 3.
        print('%s %s' % (p1, p2))
    return True, deleted[0], added[0]

In [9]:
def is_change_vowel(p1, p2):
    """Checks whether p2 is p1 with exactly one vowel replaced by another.

    The two strings are compared character by character with difflib.ndiff.
    The pair qualifies only when the diff consists of a single deletion and a
    single insertion and both characters are among a/e/i/o/u.

    This is stricter than the previous implementation, which accepted any
    inserted character (including consonants) and ignored deleted consonants;
    every pair accepted here was also accepted before, with the same return
    value.

    p1: first half of the reduplicative
    p2: second half of the reduplicative

    returns: (True, deleted_vowel, added_vowel) on a match,
             (False, None, None) otherwise
    """
    vowel_set = set(['a', 'e', 'i', 'o', 'u'])
    deleted = []
    added = []
    for c in difflib.ndiff(p1, p2):
        # ndiff entries look like '<marker> <char>'.  Test the marker itself
        # so a literal '-' or '+' inside a word is not mistaken for an edit.
        if c.startswith('- '):
            deleted.append(c[2:])
        elif c.startswith('+ '):
            added.append(c[2:])

    if len(deleted) != 1 or len(added) != 1:
        return False, None, None
    if deleted[0] not in vowel_set or added[0] not in vowel_set:
        return False, None, None
    return True, deleted[0], added[0]

In [10]:
def is_added_vowel(p1, p2):
    """Checks whether p2 is p1 with exactly one vowel inserted.

    The strings are compared with difflib.ndiff.  Any deleted character
    disqualifies the pair.  Only inserted vowels (a/e/i/o/u) are counted;
    inserted consonants are ignored, as before.

    p1: first half of the reduplicative
    p2: second half of the reduplicative

    returns: (True, added_vowel, first_letter_of_p1) on a match,
             (False, None, None) otherwise, including when p1 is empty
             and therefore has no first letter to report
    """
    if not p1:
        # Previously an empty p1 could reach p1[0] and raise IndexError.
        return False, None, None

    vowel_set = set(['a', 'e', 'i', 'o', 'u'])
    added_vowels = []
    for c in difflib.ndiff(p1, p2):
        # Test the ndiff marker itself so a literal '-' or '+' inside a word
        # is not mistaken for an edit.
        if c.startswith('- '):
            return False, None, None
        if c.startswith('+ ') and c[2:] in vowel_set:
            added_vowels.append(c[2:])

    if len(added_vowels) == 1:
        return True, added_vowels[0], p1[0]
    return False, None, None

In [11]:
def is_added_consonant(p1, p2):
    """Checks whether p2 is p1 with exactly one non-vowel inserted.

    The strings are compared with difflib.ndiff.  Any deleted character
    disqualifies the pair.  Only inserted characters outside a/e/i/o/u are
    counted; inserted vowels are ignored, as before.

    p1: first half of the reduplicative
    p2: second half of the reduplicative

    returns: (True, added_consonant, first_letter_of_p1) on a match,
             (False, None, None) otherwise, including when p1 is empty
             and therefore has no first letter to report
    """
    if not p1:
        # Previously an empty p1 could reach p1[0] and raise IndexError.
        return False, None, None

    vowel_set = set(['a', 'e', 'i', 'o', 'u'])
    added_cons = []
    for c in difflib.ndiff(p1, p2):
        # Test the ndiff marker itself so a literal '-' or '+' inside a word
        # is not mistaken for an edit.
        if c.startswith('- '):
            return False, None, None
        if c.startswith('+ ') and c[2:] not in vowel_set:
            added_cons.append(c[2:])

    if len(added_cons) == 1:
        return True, added_cons[0], p1[0]
    return False, None, None

In [13]:
# Load the tab-separated word list.
df = pd.read_csv(
    './data/matiello_data.csv',
    sep='\t',
    encoding='utf-8',
)

In [14]:
# Keep only the words whose label is 3, as a plain array of strings.
redups = df.loc[df['label'] == 3, 'word'].values

In [15]:
# Split every word into its two halves: first on '-', and if that does not
# give exactly two parts, on a space.  Words that still do not split into
# exactly two parts are left out.
redups_tokens = []
for r in redups:
    tokens = r.strip().split('-')
    if len(tokens) != 2:
        tokens = r.strip().split(' ')
    if len(tokens) == 2:
        redups_tokens.append(tuple(part.lower() for part in tokens))

In [16]:
# Hold out 50 pairs for testing.  The split is seeded so that the train/test
# files written from it are the same on every Restart & Run All.
redups_tokens_train, redups_tokens_test = train_test_split(
    redups_tokens, test_size=50, random_state=0)

In [61]:
# Write the training pairs, one space-separated pair per line.  The `with`
# block closes the file even if a write fails.
with open('./data/redups_train_prob_model.txt','w') as f:
    for l in redups_tokens_train:
        f.write(u' '.join(l)+u'\n')

In [62]:
# Write the held-out pairs, one space-separated pair per line.  The `with`
# block closes the file even if a write fails.
with open('./data/redups_test_prob_model.txt','w') as f:
    for l in redups_tokens_test:
        f.write(u' '.join(l)+u'\n')

In [25]:
def learn_model(redups_tokens):
    """Estimates the reduplication model from (first half, second half) pairs.

    Each pair is assigned to the first category that matches, in this order:
    DUPLICATE (halves are equal), AV (is_added_vowel), AC
    (is_added_consonant), VOWEL (is_change_vowel), CONSONANT
    (is_change_consonant), SHM (is_shm), and otherwise UNKNOWN.

    For AV and AC the per-letter tables are keyed by the first letter of the
    first half and count the added character; their totals count the added
    character.  For VOWEL and CONSONANT the tables are keyed by the replaced
    character and count its replacement; their totals count the replaced
    character.

    Everything returned is normalized.  The added-vowel table and total were
    previously returned as raw counts, unlike the other three pairs.

    redups_tokens: iterable of pairs, indexed as redup[0] and redup[1]

    returns: tuple of
        c                            Pmf over the category names
        consonant_counter            Counter: replaced consonant -> Pmf
        consonant_counter_tot        Pmf over replaced consonants
        vowel_counter                Counter: replaced vowel -> Pmf
        vowel_counter_tot            Pmf over replaced vowels
        added_vowel_counter          Counter: first letter -> Pmf
        added_vowel_counter_tot      Pmf over added vowels
        added_consonant_counter      Counter: first letter -> Pmf
        added_consonant_counter_tot  Pmf over added consonants
    """
    c = Pmf()
    vowel_counter = Counter()
    consonant_counter = Counter()
    added_vowel_counter = Counter()
    added_consonant_counter = Counter()

    vowel_counter_tot = Pmf()
    consonant_counter_tot = Pmf()
    added_vowel_counter_tot = Pmf()
    added_consonant_counter_tot = Pmf()

    for redup in redups_tokens:
        first, second = redup[0], redup[1]

        if first == second:
            c['DUPLICATE'] += 1
            continue

        # Each predicate is evaluated once and only if the earlier ones
        # failed, so the category precedence is unchanged.
        found, av, fl = is_added_vowel(first, second)
        if found:
            c['AV'] += 1
            if fl not in added_vowel_counter:
                added_vowel_counter[fl] = Pmf()
            added_vowel_counter[fl][av] += 1
            added_vowel_counter_tot[av] += 1
            continue

        found, ac, fl = is_added_consonant(first, second)
        if found:
            c['AC'] += 1
            if fl not in added_consonant_counter:
                added_consonant_counter[fl] = Pmf()
            added_consonant_counter[fl][ac] += 1
            added_consonant_counter_tot[ac] += 1
            continue

        found, dv, av = is_change_vowel(first, second)
        if found:
            c['VOWEL'] += 1
            if dv not in vowel_counter:
                vowel_counter[dv] = Pmf()
            vowel_counter[dv][av] += 1
            vowel_counter_tot[dv] += 1
            continue

        found, dv, av = is_change_consonant(first, second)
        if found:
            c['CONSONANT'] += 1
            if dv not in consonant_counter:
                consonant_counter[dv] = Pmf()
            consonant_counter[dv][av] += 1
            consonant_counter_tot[dv] += 1
            continue

        if is_shm(first, second):
            c['SHM'] += 1
        else:
            c['UNKNOWN'] += 1

    c.normalize()

    # Normalize every conditional table and its total, including the
    # added-vowel ones that were previously skipped.
    for table in (vowel_counter, consonant_counter,
                  added_vowel_counter, added_consonant_counter):
        for d in table.values():
            d.normalize()

    vowel_counter_tot.normalize()
    consonant_counter_tot.normalize()
    added_vowel_counter_tot.normalize()
    added_consonant_counter_tot.normalize()

    return (c, consonant_counter, consonant_counter_tot, vowel_counter,
            vowel_counter_tot, added_vowel_counter, added_vowel_counter_tot,
            added_consonant_counter, added_consonant_counter_tot)

In [37]:
# Fit the model on the training pairs and unpack its nine components into
# the module-level names used by the generation functions.
(c,
 consonant_counter, consonant_counter_tot,
 vowel_counter, vowel_counter_tot,
 added_vowel_counter, added_vowel_counter_tot,
 added_consonant_counter, added_consonant_counter_tot) = learn_model(redups_tokens_train)

In [27]:
def generate_add_ac(s,added_consonant_counter):
    """Prepends a sampled character to s, chosen by its first letter.

    s: the word to extend
    added_consonant_counter: mapping from a first letter to an object whose
        sample() returns the character to prepend

    returns: the sampled character followed by s, or s unchanged when its
             first letter is not in the mapping
    """
    first_letter = s[0]
    if first_letter not in added_consonant_counter:
        return s
    return added_consonant_counter[first_letter].sample() + s

In [28]:
def generate_replace(s, total_counter, bicounter):
    """Replaces the first replaceable character of s with a sampled one.

    The first character of s that is a key of total_counter is swapped for
    the result of bicounter[that_character].sample().

    s: the word to modify
    total_counter: mapping whose keys are the replaceable characters
    bicounter: mapping from a replaceable character to an object whose
        sample() returns the replacement

    returns: the modified word, or s unchanged when it contains no
             replaceable character (previously this case crashed)
    """
    # Build the key set once instead of on every loop iteration.
    replaceable = set(total_counter.keys())
    for pos, ch in enumerate(s):
        if ch in replaceable:
            break
    else:
        # Nothing to replace: return the word as is, the same way
        # generate_add_ac does when it cannot act.
        return s

    ch_target = bicounter[ch].sample()
    ret_str = list(s)
    ret_str[pos] = ch_target
    return ''.join(ret_str)

In [29]:
def can_replace_vowels(s):
    """Tells whether s contains a character that is a key of vowel_counter_tot.

    Reads the module-level vowel_counter_tot.

    returns: True if at least one character of s is such a key
    """
    learned = set(vowel_counter_tot.keys())
    return any(ch in learned for ch in s)

In [30]:
def can_replace_consonants(s):
    """Tells whether s contains a character that is a key of consonant_counter_tot.

    Reads the module-level consonant_counter_tot.

    returns: True if at least one character of s is such a key
    """
    learned = set(consonant_counter_tot.keys())
    return any(ch in learned for ch in s)

In [31]:
def generate_reduplicative(s, c):
    """Generates the second half of a reduplicative for the word s.

    The category distribution c is restricted to the operations that can be
    applied to s (DUPLICATE always; VOWEL / CONSONANT when s has a
    replaceable character; AC when the first letter of s is in the
    module-level added_consonant_counter), renormalized, and one operation is
    sampled and applied.

    Reads the module-level vowel_counter, vowel_counter_tot,
    consonant_counter, consonant_counter_tot and added_consonant_counter.

    s: the word to reduplicate
    c: Pmf (or Counter) over the category names

    returns: the generated string.  s itself is returned when s is empty or
             when none of the applicable operations has any probability
             (for example a model with DUPLICATE set to 0 and a word that no
             learned edit applies to); both cases previously raised.
    """
    if not s:
        # No first letter to look up; previously s[0] raised IndexError.
        return s

    newc = Pmf()
    newc['DUPLICATE'] = c['DUPLICATE']
    if can_replace_vowels(s):
        newc['VOWEL'] = c['VOWEL']
    if can_replace_consonants(s):
        newc['CONSONANT'] = c['CONSONANT']
    if s[0] in added_consonant_counter:
        newc['AC'] = c['AC']

    if sum(newc.values()) == 0:
        # Normalizing would divide by zero; fall back to the plain copy.
        return s

    newc.normalize()
    l1 = newc.sample()
    if l1 == 'AC':
        return generate_add_ac(s, added_consonant_counter)
    if l1 == 'VOWEL':
        return generate_replace(s, vowel_counter_tot, vowel_counter)
    if l1 == 'CONSONANT':
        return generate_replace(s, consonant_counter_tot, consonant_counter)
    # Only 'DUPLICATE' is left.
    return s

In [32]:
def random_gen_redup(s):
    """Replaces one randomly chosen character of s with another of its class.

    A position is drawn uniformly.  If the character there is one of
    a/e/i/o/u it is replaced by a different one of those five; otherwise it
    is replaced by one of the 21 listed consonants other than itself.

    returns: the modified copy of s
    """
    pos=np.random.choice(len(s))
    current = s[pos]
    if current in set(['a','e','i','o','u']):
        alphabet = set(['a','e','i','o','u'])
    else:
        alphabet = set(['b','c','d','f','g','h','j','k','l','m','n','p','q','r','s','t','v','w','x','y','z'])
        assert(len(alphabet)==21)
    # Exclude the current character so the result always differs from s.
    candidates = alphabet - set(current)
    replacement = np.random.choice(list(candidates))
    chars = list(s)
    chars[pos] = replacement
    return ''.join(chars)

In [33]:
# Smoke test: generate a second half for 'flip' from the learned model `c`.
# The operation is sampled at random, so the value shown can differ per run.
generate_reduplicative('flip', c)

u'flop'

In [34]:
def _write_generated_pairs(test_data_file, output_path, generate):
    """Writes '<first token> <generate(first token)>' for every input line."""
    # Both files are closed on exit, including when generate() raises.
    with open(output_path, 'w') as g, open(test_data_file) as f:
        for line in f:
            redup_tokens = line.strip().split(' ')
            generated_token = generate(redup_tokens[0])
            g.write(u' '.join([redup_tokens[0], generated_token]))
            g.write(u'\n')


def generate_on_test_data(test_data_file, output_directory):
    """Generates 10 output files per model for the words in test_data_file.

    For each line of the input, the first space-separated token is taken as
    the word and written next to its generated counterpart.  Four models are
    run, each 10 times, in this order:

        redups_prob_model_<run>.txt              generate_reduplicative with c
        redups_prob_nodups_model_<run>.txt       same, with DUPLICATE set to 0
        redups_prob_random_model_<run>.txt       random_gen_redup
        redups_prob_random_char_model_<run>.txt  random_gen_char

    Reads the module-level model `c`.

    test_data_file: path of the input file
    output_directory: directory the output files are written into
    """
    n_runs = 10

    for run in range(n_runs):
        _write_generated_pairs(
            test_data_file,
            '{}/redups_prob_model_{}.txt'.format(output_directory, run),
            lambda word: generate_reduplicative(word, c))

    for run in range(n_runs):
        nodupc = c.copy()
        nodupc['DUPLICATE'] = 0.0
        _write_generated_pairs(
            test_data_file,
            '{}/redups_prob_nodups_model_{}.txt'.format(output_directory, run),
            lambda word: generate_reduplicative(word, nodupc))

    for run in range(n_runs):
        _write_generated_pairs(
            test_data_file,
            '{}/redups_prob_random_model_{}.txt'.format(output_directory, run),
            random_gen_redup)

    for run in range(n_runs):
        # NOTE(review): random_gen_char is not defined in the visible part of
        # this notebook -- confirm it exists before running this cell.  The
        # lambda keeps the name lookup at call time, as in the original loop.
        _write_generated_pairs(
            test_data_file,
            '{}/redups_prob_random_char_model_{}.txt'.format(output_directory, run),
            lambda word: random_gen_char(word))

In [119]:
#generate_on_test_data('./data/redups_test_prob_model.txt', './output/gold_test/')

In [120]:
#generate_on_test_data('./data/redups_test_ud.txt', './output/test_ud/')