In [11]:
train_path = "/home/com/burvec/UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
test_path = "/home/com/burvec/UD_Buryat-BDT/bxr_bdt-ud-test.conllu"

In [12]:
def read_conll(in_file, lowercase=True, max_example=None):
    examples = []
    with open(in_file) as f:
        word, pos, head, label = [], [], [], []
        for line in f.readlines():
            sp = line.strip().split('\t')
            if len(sp) == 10:
                if '-' not in sp[0]:
                    word.append(sp[1].lower() if lowercase else sp[1])
                    pos.append(sp[3])
                    head.append(int(sp[6]))
                    label.append(sp[7])
            elif len(word) > 0:
                examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
                word, pos, head, label = [], [], [], []
                if (max_example is not None) and (len(examples) == max_example):
                    break
        if len(word) > 0:
            examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
    return examples 

In [13]:
train = read_conll(train_path)
test = read_conll(test_path)

In [14]:
def get_train(we, vocab):
    words = []
    poss = []
    for sen in train:
        words.extend(sen['word'])
        poss.extend(sen['pos'])
    X_train = [we[w] for w in words if w in vocab]
    y_train = [poss[i] for i, w in enumerate(words) if w in vocab]
    return X_train, y_train

In [15]:
def get_test(we, vocab):
    words = []
    poss = []
    for sen in test:
        words.extend(sen['word'])
        poss.extend(sen['pos'])
    X_test = [we[w] for w in words if w in vocab]
    y_test = [poss[i] for i, w in enumerate(words) if w in vocab]
    return X_test, y_test

In [16]:
def read_emb(path):
    we = {}
    with open(path, 'r') as f:
        next(f) 
        for l in f:
            sp = l.split()
            we[sp[0]] = [float(vec) for vec in sp[1:]]
    return we

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.85, random_state=42)

In [94]:
len(X_train)

457

In [20]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

clf = LogisticRegression(random_state=0)
# clf = SVC(C=100, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [100]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
# f1_score(y_test, y_pred, average='macro')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ADJ       0.53      0.52      0.53       328
         ADP       0.76      0.67      0.71       106
         ADV       0.46      0.58      0.51       281
         AUX       0.70      0.62      0.66        85
       CCONJ       0.70      0.84      0.77        37
         DET       0.75      0.62      0.68        96
        INTJ       0.00      0.00      0.00         2
        NOUN       0.71      0.64      0.67       991
         NUM       0.64      0.67      0.65        81
        PART       1.00      0.71      0.83        68
        PRON       0.71      0.24      0.36        71
       PROPN       0.50      0.50      0.50       141
        VERB       0.38      0.53      0.44       303

    accuracy                           0.59      2590
   macro avg       0.60      0.55      0.56      2590
weighted avg       0.62      0.59      0.60      2590



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import cross_val_score
clf = LogisticRegression(random_state=0)
cross_val_score(clf, X_test, y_test, cv=10, scoring='f1_macro')

In [18]:
mypath = '/home/com/burvec/embeddings/'

from os import listdir
from os.path import isfile, join
embs = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]

In [6]:
len(vocab)

NameError: name 'vocab' is not defined

In [30]:
len(vocab)

0

In [34]:
from sklearn.metrics import f1_score
from sklearn.svm import SVC

for emb in embs:
    print(emb)
    if 'Untitled' in emb:
        continue
    
    we = read_emb(emb)    
    X_train, y_train = get_train(we, vocab)
    X_test, y_test = get_test(we, vocab)
#     clf = LogisticRegression(max_iter=200, random_state=0)
    clf = SVC(C=100, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f1_score(y_test, y_pred, average='macro'))

/home/com/burvec/embeddings/w2v.5.100
0.19513901098916622
/home/com/burvec/embeddings/svd.50
0.14881248675266964
/home/com/burvec/embeddings/w2v.5.500
0.197735222537948
/home/com/burvec/embeddings/glove.50
0.19511088417449907
/home/com/burvec/embeddings/svd.10.50
0.1708918422698234
/home/com/burvec/embeddings/glove.10.100
0.1885979514829305
/home/com/burvec/embeddings/w2v.2.100
0.1968095553258362
/home/com/burvec/embeddings/svd.100
0.16693943288343205
/home/com/burvec/embeddings/svd.2.50
0.1500085821206697
/home/com/burvec/embeddings/glove.100
0.19758098997869325
/home/com/burvec/embeddings/glove.10.50
0.19551654571767157
/home/com/burvec/embeddings/svd.2.100
0.1578127511784426
/home/com/burvec/embeddings/svd.10.500
0.18947394291080652
/home/com/burvec/embeddings/w2v.10.50
0.19551654571767157
/home/com/burvec/embeddings/w2v.2.50
0.19723372541647707
/home/com/burvec/embeddings/svd.5.500
0.18636214399881648
/home/com/burvec/embeddings/svd.2.500
0.18348967752800627
/home/com/burvec/embedd

KeyError: 'минии'

In [119]:
import torch
from torch import nn


In [129]:
loss = nn.BCELoss()
loss(torch.tensor([0.7]),torch.tensor([1.]))

tensor(0.3567)

In [120]:
torch.randn(3, requires_grad=True)

tensor([ 0.5334, -0.2985,  1.5134], requires_grad=True)

In [121]:
torch.empty(3).random_(2)

tensor([0., 1., 1.])

In [122]:
torch.tensor([0,0,1])

tensor([0, 0, 1])

In [139]:
import numpy

numpy.intersect1d(['a'],['a','b'],['c']).tolist()

['a']

In [136]:
len(vocab)

32

In [None]:
total = vocab[0]
for vec in vocab:
    print(len(vec))
    total = numpy.intersect1d(total, vec).tolist()

In [None]:
total

In [None]:
vocab[0]

In [2]:
from pathlib import Path
import json
# a = [[1,2,3],[4,5,6]]
# write
# Path("vocab.json").write_text(json.dumps(toal))
# read
# json.loads(Path("file.json").read_text())

In [31]:
vocab = json.loads(Path("vocab.json").read_text())

In [32]:
vocab[0]

'hадаhад'