In [None]:
import sys
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence

import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

# Stream every article out of the bxrwiki dump and write it to text.txt,
# one article per line, with the article's items joined by single spaces.
wiki = WikiCorpus('data/bxr/bxrwiki-latest-pages-articles.xml.bz2', lemmatize=False)

# `total` ends up holding the number of articles written (0 if there were none).
total = 0
with open('text.txt', 'w') as output:
    for total, text in enumerate(wiki.get_texts(), start=1):
        output.write(" ".join(text) + "\n")

print(total)

In [None]:
def check_bur_srt(word):
    """Return True when every character of ``word`` is an accepted letter.

    A character is accepted when its code point lies in U+0410..U+042F or
    U+0430..U+044F (basic Cyrillic upper/lower case), or when it is one of the
    extra code points below: U+04AE/U+04AF, U+04BA/U+04BB, U+04E8/U+04E9,
    U+0401/U+0451 and the Latin letters ``h``/``H`` (0x68/0x48).

    An empty string contains no rejected character and therefore yields True.
    """
    extra_codes = (0x4ae, 0x4af, 0x4ba, 0x4bb, 0x4e8, 0x4e9, 0x401, 0x451, 0x68, 0x48)

    def accepted(char):
        code = ord(char)
        return 0x410 <= code <= 0x42f or 0x430 <= code <= 0x44f or code in extra_codes

    return all(accepted(char) for char in word)

# Spot-check check_bur_srt on a handful of sample strings; one result is printed per sample.
for sample in (
    'people', 'cinema capital', 'rgb', 'mall', 'сырдајя',
    'харагшадые', 'һүүлэй', 'гоё', 'гэхэ', 'восток', 'гэhэн',
    'һүүлэй', 'мүнөө', 'пионер', 'нэрлэбэл', 'байранууд', 'юм',
    'хонин', 'шүүһэн', 'хубсаһан', 'волга',
):
    print(check_bur_srt(sample))

In [None]:
# from 3 schar
# sort by len

# Suffix inventory, grouped under the original category keys.
# Each list holds every distinct suffix once; the earlier version repeated
# several entries and, because of a missing comma, fused 'ёо' and 'гаа' into
# a bogus suffix 'ёогаа'.
suff = {
    'plural_con': ['ууд', 'үүд'],
    'plural_vow': ['нууд', 'нүүд'],
    'plural_n': ['гууд', 'гүүд'],
    'plural_n1': ['гша', 'гшэ', 'гшо', 'ааша', 'ээшэ', 'оошо', 'өөшэ', 'д'],
    'plural_pep': ['нар', 'нэр', 'нор'],
    'lich_prityj': ['мни', 'ни', 'мнай', 'най', 'шни', 'тнай', 'иинь', 'ынь', 'нь'],
    'bezlich_prityj': ['нгаа', 'нгээ', 'нгоо', 'нгөө', 'гаа', 'гээ', 'гоо', 'гөө', 'аа', 'ээ', 'оо', 'өө',
                       'яа', 'еэ', 'ёо', 'н'],
    'cases': ['ай', 'эй', 'ой', 'гай', 'гой', 'ын', 'иин', 'н', 'да', 'до', 'дэ', 'та', 'тэ', 'то', 'ые', 'ы',
              'иие', 'ии', 'е', 'гые', 'гы', 'аар', 'ээр', 'оор', 'өөр', 'яар', 'еэр', 'ёор', 'гаар', 'гээр',
              'гоор', 'гөөр', 'тай', 'тэй', 'той', 'һаа', 'һээ', 'һоо', 'һөө', 'гһаа', 'гһээ', 'гһоо', 'гһөө'],
    'verbs_present': ['б', 'м', 'бди', 'мди', 'ш', 't'.replace('t', 'т')],
    'past': ['ба', 'бэ', 'бо'],
    'future': ['ха', 'хэ', 'хо'],
    'negation': ['үдүй', 'дуй', 'үгы', 'гүй']
}

# Flat set of all suffixes across categories, for O(1) membership tests.
suffs = {suf for group in suff.values() for suf in group}

def rem_suf(word):
    """Strip one trailing suffix of length 4 or 3 from ``word``.

    Words of four characters or fewer are returned untouched. For longer
    words the last four characters are looked up in ``suffs`` first, then the
    last three; the first match is cut off. Without a match the word is
    returned unchanged.
    """
    if len(word) <= 4:
        return word

    for width in (4, 3):
        if word[-width:] in suffs:
            return word[:-width]

    return word

# Spot-check rem_suf on a few sample words; one result is printed per sample.
for sample in ('тойрогһоо', 'ябадалтнай', 'сахижа', 'хэ'):
    print(rem_suf(sample))

In [None]:
# Filter out short words and words containing non-Buryat characters

def post_sen(sen):
    """Normalise one space-separated line and return it as a single string.

    The line is split on single spaces. Each piece is right-stripped, then:
    pieces shorter than three characters, or rejected by ``check_bur_srt``,
    become the placeholder ``'UNK'``; every other piece is passed through
    ``rem_suf``. The results are joined back with single spaces.
    """
    def normalise(token):
        token = token.rstrip()
        # The length test comes first so check_bur_srt never sees short tokens.
        if len(token) >= 3 and check_bur_srt(token):
            return rem_suf(token)
        return 'UNK'

    return ' '.join(normalise(token) for token in sen.split(" "))

In [None]:
# Run every line of text.txt through post_sen and keep the results in memory.
with open("./text.txt") as file:
    new_text = [post_sen(l) for l in file]

print('number of articles:', len(new_text))

# Write the processed lines out, one per line.
with open('./bur_text_no_suf.txt', 'w') as f:
    f.writelines(text + '\n' for text in new_text)

In [None]:
# Build per-letter vocabularies and pairwise distance files for clustering.
# NOTE(review): `d3` is defined in a later cell of this notebook; that cell
# has to be executed before this one.
import os
import sys
from itertools import combinations

vocab = []

with open("./bur_text_no_suf.txt") as file:
# with open("./bur_text.txt") as file:
    for l in file:
        # extend() grows the list in place; `vocab = vocab + ...` copied the
        # whole list again for every line (quadratic in corpus size).
        vocab.extend(l.rstrip().split(" "))

print('the number of words:', len(vocab))

uniq = set(vocab)
print('the number of unique words:', len(uniq))
# discard() does not raise when the placeholder never occurs in the corpus.
uniq.discard('UNK')
# An empty token (blank line / doubled space) would break the word[0] lookup below.
uniq.discard('')
print('the number of unique words without UNK:', len(uniq))

vocab = sorted(uniq)

# Group the sorted vocabulary by first character.
vocab_by_letter = {}
for word in vocab:
    vocab_by_letter.setdefault(word[0], []).append(word)

# The output files below are written into this folder; create it if missing.
os.makedirs('./let_no_suf', exist_ok=True)

# `idx` numbers only the groups that are actually written (more than 100 words).
idx = 0
for let, words in vocab_by_letter.items():
    print('letters index', idx, len(words))
    if len(words)>100:
        # Distances for all pairs (i < j), in the same order as the nested
        # loop over words[i] and words[i+1:].
        dists = [d3(word1, word2) for word1, word2 in combinations(words, 2)]

        with open('./let_no_suf/vocab.'+str(idx), 'w') as f:
            for word in words:
                f.write(word+'\n')

        with open('./let_no_suf/dists.'+str(idx), 'w') as f:
            for dist in dists:
                f.write(str(dist)+'\n')

        idx += 1

In [None]:
import math
import sys

def padd(a, b):
    """Right-pad the shorter string with spaces so both have the same length.

    Returns the two (possibly padded) strings as a tuple ``(a, b)``.
    """
    width = len(a) if len(a) > len(b) else len(b)
    return a.ljust(width), b.ljust(width)

def d3(a, b):
    """Prefix-weighted distance between two strings.

    Both strings are space-padded to a common length ``n + 1`` and ``m`` is
    the index of the first position where they differ. The result is::

        (n - m + 1) / m * sum(1 / 2**(i - m) for i in m..n)

    Returns:
        0 when the strings are equal (also when they are equal after padding,
        i.e. differ only by trailing spaces);
        ``sys.float_info.max`` when they already differ at index 0 (no common
        prefix);
        otherwise the float value of the formula above.

    NOTE(review): this looks like the D3 measure from the YASS stemmer paper;
    confirm against the intended reference.
    """
    if a == b:
        return 0

    a, b = padd(a, b)
    n = len(a) - 1

    # Scan ALL positions 0..n. The previous scan stopped at n - 1 and used
    # m == 0 as "not found", so pairs differing only in the last position
    # (e.g. 'abc'/'abd', or a word and the same word plus one letter) were
    # wrongly reported as maximally distant.
    m = next((i for i in range(n + 1) if a[i] != b[i]), None)

    if m is None:
        # Identical once padded: the inputs differed only in trailing spaces.
        return 0
    if m == 0:
        return sys.float_info.max

    summ = sum(1 / (2 ** (i - m)) for i in range(m, n + 1))

    return float((n - m + 1) / m) * summ

# Print d3 for a few sample word pairs, one result per pair.
for left, right in (
    ('astronomer', 'astronomically'),
    ('astronomer', 'astonish'),
    (u'хэлэнэй', u'хэлэниинь'),
    (u'хэлэнүүдэй', u'хэлэниинь'),
    (u'хэлэнэй', u'шэнжэлэл'),
    ('хэлэнэй', 'шэнжэлэл'),
    ('a', 'b'),
):
    print(d3(left, right))

In [None]:
# Cut the linkage `Z` at distance 1 and group the entries of `vocab` by the
# label fcluster assigns to each index.
# NOTE(review): `Z` and `vocab` must already exist in the kernel; `Z` is not
# assigned at top level anywhere in the visible cells.
cl = fcluster(Z, t=1, criterion='distance')
print(max(cl))

clust = {}
for idx, num in enumerate(cl):
    members = clust.setdefault(num, [])
    members.append(vocab[idx])
    # Show a cluster every time it grows beyond a single word.
    if len(members) > 1:
        print(members)

print(clust)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sweep the distance threshold from 0.0 to 9.75 in steps of 0.25 and record
# the largest cluster label returned by fcluster for each threshold.
th, res = [], []

for j in np.arange(0.0, 10.0, 0.25):
    th += [j]
    cl = fcluster(Z, t=j, criterion='distance')
    res += [max(cl)]
    print(j)

import matplotlib.pyplot as plt

# Threshold on the x axis, recorded maximum label on the y axis.
plt.scatter(th, res)
plt.show()

In [None]:
from scipy.cluster.hierarchy import *
import fastcluster

def build_clusters(linkage, th, folder, num_letters=31):
    """Cluster each per-letter vocabulary and keep the multi-word clusters.

    For every index ``i`` in ``range(num_letters)`` the files
    ``./<folder>/vocab.<i>`` (one word per line) and ``./<folder>/dists.<i>``
    (one float per line) are read, a linkage is computed with
    ``fastcluster.linkage(dists, method=linkage)`` and cut with
    ``fcluster(..., t=th, criterion='distance')``.

    Args:
        linkage: linkage method name forwarded to ``fastcluster.linkage``.
        th: distance threshold forwarded to ``fcluster``.
        folder: directory (relative to the working directory) holding the
            ``vocab.<i>`` / ``dists.<i>`` files.
        num_letters: how many ``<i>`` file pairs to process, starting at 0.
            Defaults to 31, the count that used to be hard-coded.

    Returns:
        dict mapping each index ``i`` to a list of clusters, where a cluster
        is a list of words; only clusters with more than one word are kept.

    Raises:
        Whatever ``open`` raises when one of the expected files is missing.
    """
    buffer = {}

    print("load distances from:", folder)

    for i in range(num_letters):
        print('letter index:', i)

        with open('./'+folder+'/vocab.'+str(i), 'r') as f:
            vocab = [l.rstrip() for l in f]

        with open('./'+folder+'/dists.'+str(i), 'r') as f:
            dists = [float(l.rstrip()) for l in f]

        Z = fastcluster.linkage(dists, method=linkage)
        cl = fcluster(Z, t=th, criterion='distance')

        print("calculated")

        # Group words by the cluster label assigned to their index.
        clust = {}
        for idx, num in enumerate(cl):
            clust.setdefault(num, []).append(vocab[idx])

        # Singletons carry no merging information, so they are dropped.
        buffer[i] = [members for members in clust.values() if len(members) > 1]

        print('total clusters:', len(buffer[i]))

    return buffer

In [None]:
# Show the clusters stored under key 21 of `buffer`.
# NOTE(review): `buffer` is assigned at top level only by the linkage/threshold
# loop in a later cell, so that cell has to run before this one.
print(buffer[21])

In [None]:
def replace_stem(word_lemma, filename):
    """Load a text file and replace whole words by their lemma.

    Args:
        word_lemma: dict mapping a word to the lemma that should replace it.
        filename: path of a text file with one article per line, words
            separated by single spaces.

    Returns:
        list of str: one entry per input line (stripped of surrounding
        whitespace), with every space-separated token that is a key of
        ``word_lemma`` replaced by its value.
    """
    print('load text from file:', filename)
    # load filtered text
    with open(filename) as file:
        post = [l.strip() for l in file]

    print('total articles:', len(post))
    if post:  # an empty file has no first element to inspect
        print(type(post[0]))

    post_lemma = []
    for idx, sen in enumerate(post):
        if idx % 100 == 0:
            print(idx)
        # Look tokens up one by one. Calling str.replace on the whole line
        # also rewrote occurrences *inside* longer words (e.g. a key 'abc'
        # turned 'abcd' into '<lemma>d') and cost one full pass per key.
        post_lemma.append(' '.join(word_lemma.get(word, word) for word in sen.split(' ')))

    print('total articles lemmatized:', len(post_lemma))

    return post_lemma

In [None]:
# ONLY FOR NON_CLUSTER PERFORMANCE
# Baseline without clustering: read bur_text.txt as-is and export pair counts
# for each window size under the 'none' / 0 label.
with open("bur_text.txt") as file:
    post_lemma = [l.rstrip().strip() for l in file]

for ng in [2,5,10]:
    word_context = build_pairs(post_lemma, ng)
    c = dict(Counter(word_context))
    save(c, 'none', 0, post_lemma, {}, 'pmi', ng)

In [None]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
# from collections import Counter
# import numpy as np
# import sys
# from collections import Counter

def build_pairs(post_lemma, ng):
    """Collect (word, context word) pairs within a symmetric window.

    Args:
        post_lemma: list of texts, each a string of space-separated words.
        ng: window radius; each word is paired with up to ``ng`` neighbours on
            its left and ``ng`` on its right.

    Returns:
        list of ``(word, context)`` tuples in reading order, left neighbours
        before right neighbours. Context words equal to 'BEG', 'END' or 'UNK'
        are skipped; the centre word itself is not filtered.
    """
    omit = ('BEG', 'END', 'UNK')
    word_context = []

    print('total articles', len(post_lemma))

    for text in post_lemma:
        # Pad both ends so every real word has a full window around it.
        padded = ['BEG'] * ng + text.split(" ") + ['END'] * ng

        for centre in range(ng, len(padded) - ng):
            neighbours = padded[centre - ng:centre] + padded[centre + 1:centre + ng + 1]
            word_context.extend(
                (padded[centre], context) for context in neighbours if context not in omit
            )

    print('build_pairs, ng:', ng, ' num of pairs:', len(word_context))
    return word_context

# Print the pairs build_pairs produces for sample sentences at several window sizes.
for sample, window in (
    (['I like apples and alcohol'], 2),
    (['I like apples and alcohol is this OK'], 3),
    (['I like apples and alcohol'], 1),
):
    print(build_pairs(sample, window))

In [None]:
from nltk.util import ngrams

# Print what nltk's `ngrams` yields for n=3 on a token list padded with two
# 'BEG' and two 'END' markers.
print(list(ngrams(['BEG','BEG','I', 'like', 'apples','END','END'],3)))

In [None]:
# from collections import Counter
# # key: words
# # value: occurrences
# c_w = dict(Counter(words))

In [None]:
# # pairs (word, occurrences)
# tups = [(k,v) for k,v in c_w.items()]
# print('vocab total:', len(tups))

# #  sort by occurrences
# # print(sorted(tups, key = lambda x: x[1]))

# # only tuples that > 4
# tups5 = set([w[0] for w in tups if w[1]>4])
# print('vocab5 number:', len(tups5))

# word_context5 = []
# for tup in word_context:
#     if tup[0] in tups5:
#         word_context5.append(tup)
        
# print('word_context:', len(word_context))
# print('word_context5:', len(word_context5))

In [None]:
# from collections import Counter
# # c = dict(Counter(word_context5))
# c = dict(Counter(word_context))

In [None]:
import numpy as np

def save(c, linkage, th, post_lemma, word_lemma, folder, ng):
    """Write pair counts and related data to five text files.

    All files share the prefix ``./<folder>/<linkage>-<th>-<ng>`` and differ
    in their suffix:

    * ``-cols``: sorted unique second elements of the keys of ``c``
    * ``-rows``: sorted unique first elements of the keys of ``c``
    * ``-data``: one ``"<first> <second> <count>"`` line per entry of ``c``
    * ``-lemma``: for each entry of ``word_lemma``, the key then the value,
      each on its own line
    * ``-text``: the entries of ``post_lemma``, one per line

    Args:
        c: dict mapping ``(word, context)`` string pairs to counts.
        linkage: string used in the file name.
        th, ng: values converted with ``str`` for the file name.
        post_lemma: iterable of text lines.
        word_lemma: dict of word -> lemma strings.
        folder: existing output directory, relative to the working directory.
    """
    entries = list(c.items())
    rows = np.unique([key[0] for key, _ in entries])
    cols = np.unique([key[1] for key, _ in entries])
    data = [key[0] + " " + key[1] + " " + str(value) for key, value in entries]

    prefix = './'+folder+'/'+linkage+'-'+str(th)+'-'+str(ng)

    def dump(suffix, lines):
        # One item per line, newline-terminated.
        with open(prefix + suffix, 'w') as f:
            for line in lines:
                f.write(line + "\n")

    dump('-cols', cols)
    dump('-rows', rows)
    dump('-data', data)
    # Flatten (word, lemma) pairs into alternating lines.
    dump('-lemma', (item for pair in word_lemma.items() for item in pair))
    dump('-text', post_lemma)

In [None]:
from collections import Counter

# FloatingPointError: NaN dissimilarity value in intermediate results.
# centroid
# median
# ward


# method='mcquitty':

# For every linkage method and threshold: cluster the vocabulary, map each
# clustered word to the shortest word of its cluster, rewrite the text with
# that mapping and export pair counts for several window sizes.
linkages = ['average', 'complete']

for linkage in linkages:

    for th in np.arange(4,6,0.1):
        print('LINKAGE:', linkage, 'th:', th)

        buffer = build_clusters(linkage, th, 'let')#let_no_suf

        # Flatten the per-letter cluster lists into one list of clusters.
        buffer_global = [cluster for clusters in buffer.values() for cluster in clusters]

        print('total clusters:',len(buffer_global))

        # Only settings that yield more than 100 clusters are exported.
        if len(buffer_global) <= 100:
            continue

        word_lemma = {}
        for cluster in buffer_global:
            # The shortest word of a cluster stands in for all its members.
            lemma = min(cluster, key=len)
            word_lemma.update((word, lemma) for word in cluster if word != lemma)

        post_lemma =  replace_stem(word_lemma, 'bur_text.txt')#bur_text_no_suf.txt

        for ng in [2,5,10]:
            word_context = build_pairs(post_lemma, ng)
            c = dict(Counter(word_context))
            save(c, linkage, th, post_lemma, word_lemma, 'pmi',ng)

In [None]:
# NO LEMMATIZATION
# The calls below follow the current signatures of replace_stem(word_lemma,
# filename), build_pairs(post_lemma, ng) and save(c, linkage, th, post_lemma,
# word_lemma, folder, ng); the earlier form of this cell passed too few
# arguments and raised TypeError.
# NOTE(review): the input file, window sizes and output folder are taken from
# the other 'none' baseline cell of this notebook - confirm they are the
# intended ones.
post_lemma = replace_stem({}, 'bur_text.txt')
for ng in [2,5,10]:
    word_context = build_pairs(post_lemma, ng)
    c = dict(Counter(word_context))
    save(c, 'none', 0, post_lemma, {}, 'pmi', ng)

In [None]:
# Reload a previously written text file and regenerate the pair-count files
# for each window size under the 'average' / '1.5' label.
with open("./pmi/average-1.5-text") as file:
    post_lemma = [l.rstrip().strip() for l in file]

for ng in [2,5,10]:
    word_context = build_pairs(post_lemma, ng)
    c = dict(Counter(word_context))
    save(c, 'average', '1.5', post_lemma, {}, 'pmi' ,ng)