In [1]:
import re
import requests
import pandas as pd

In [2]:
ud = "https://raw.githubusercontent.com/UniversalDependencies/UD_Yakut-YKTDT/dev/sah_yktdt-ud-test.conllu"
f = requests.get(ud)
file = f.text.split("\n")
#file

In [3]:
rege = r"^\d+\t"
lst = []

for line in file:
    if re.match(rege, line):
        lst.append(line)

In [4]:
df = pd.DataFrame([sub.split("\t") for sub in lst])
colnames = ["ID", "WORD", "LEMMA", "POS", "SPOS", "FEATURES", "6", "7", "8", "9"]
df.columns = colnames

In [5]:
df1 = df.drop(["ID", "SPOS", "6", "7", "8", "9", ], axis=1)
df1['WORD'] = df1['WORD'].str.lower()
df1['LEMMA'] = df1['LEMMA'].str.lower()

In [6]:
df1.head()

Unnamed: 0,WORD,LEMMA,POS,FEATURES
0,кинилэр,кини,PRON,Case=Nom|Number=Plur|Person=3|PronType=Prs
1,кэллилэр,кэл,VERB,Number=Plur|Person=3|Tense=NearPast
2,кинигэлэри,кинигэ,NOUN,Case=Acc|Number=Plur
3,күн,күн,NOUN,Case=Nom|Number=Sing
4,аайы,аайы,DET,_


In [7]:
print("Number of unique lemmas: " + str(len(df1.WORD.unique())))

Number of unique lemmas: 673


In [8]:
print("Number of unique lemmas: " + str(len(df1.LEMMA.unique())))

Number of unique lemmas: 428


In [9]:
df1 = df1[~df1["LEMMA"].str.contains("_")]
df1
df1.sort_values(by=['POS'])

Unnamed: 0,WORD,LEMMA,POS,FEATURES
51,кыра,кыра,ADJ,_
649,эмис,эмис,ADJ,_
525,элбэх,элбэх,ADJ,_
1636,кыра,кыра,ADJ,_
270,хара,хара,ADJ,_
...,...,...,...,...
1264,маныаҕыҥ,манаа,VERB,Number=Sing|Person=2|Tense=Fut
1260,кэрдиэхтэрэ,кэр,VERB,Number=Plur|Person=3|Tense=Fut
1256,уураллар,уур,VERB,Number=Plur|Person=3|Tense=Pres
1301,үөрэнэр,үөрэн,VERB,Number=Sing|Person=3|Tense=Pres


In [10]:
pos = sorted(df1["POS"].unique())
pos

['ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'VERB']

In [11]:
with open ("lexicon_ud.txt", "a", encoding="utf-8") as w:
    
    w.write("LEXICON Root" + "\n")
    for p in pos:
        w.write("\t" + p + ";" + "\n")
    w.write("\n\n")

    for p in pos:
        w.write("\n")
        w.write("LEXICON " + p + "\n")
        for i in df1[df1["POS"].str.contains(p)].LEMMA.unique():
            w.write(i + "\t" + p.lower() + "first;" + "\n")
        w.write("\n\n")
        w.write("LEXICON " + p.lower() + "first" + "\n")
        w.write("\t" + "#;" + "\n")
        w.write("\n\n")
    

In [12]:
#df_noun = df1[df1["POS"].str.contains("NOUN")]
#df2 = df1.sort_values(by=['POS']).drop_duplicates()
#df2


In [13]:
#alls = df2.groupby('POS').count()
#alls

In [14]:
#df_noun = df1[df1["POS"].str.contains("NOUN")]
df2 = df1.sort_values(by=['POS']).drop_duplicates()
df2

Unnamed: 0,WORD,LEMMA,POS,FEATURES
51,кыра,кыра,ADJ,_
649,эмис,эмис,ADJ,_
525,элбэх,элбэх,ADJ,_
270,хара,хара,ADJ,_
1359,эрдиилээх,эрдии,ADJ,Poss=Yes
...,...,...,...,...
1278,ыйыыбыт,ыйаа,VERB,Number=Plur|Person=1|Tense=Pres
1264,маныаҕыҥ,манаа,VERB,Number=Sing|Person=2|Tense=Fut
1260,кэрдиэхтэрэ,кэр,VERB,Number=Plur|Person=3|Tense=Fut
1256,уураллар,уур,VERB,Number=Plur|Person=3|Tense=Pres


In [15]:
df_noun = df2[df2["POS"].str.contains("VERB")]
len(df_noun["LEMMA"].unique())

82

In [16]:
from collections import Counter
Counter(df1["POS"]).most_common()

[('NOUN', 542),
 ('VERB', 327),
 ('PRON', 204),
 ('PUNCT', 197),
 ('ADV', 120),
 ('ADJ', 91),
 ('NUM', 54),
 ('CCONJ', 44),
 ('AUX', 43),
 ('DET', 41),
 ('PART', 35),
 ('PROPN', 33),
 ('INTJ', 8),
 ('ADP', 1)]

In [17]:
### Removing punctuation

In [18]:
df2 = df2[~df2["POS"].str.contains("PUNCT")]
df2

Unnamed: 0,WORD,LEMMA,POS,FEATURES
51,кыра,кыра,ADJ,_
649,эмис,эмис,ADJ,_
525,элбэх,элбэх,ADJ,_
270,хара,хара,ADJ,_
1359,эрдиилээх,эрдии,ADJ,Poss=Yes
...,...,...,...,...
1278,ыйыыбыт,ыйаа,VERB,Number=Plur|Person=1|Tense=Pres
1264,маныаҕыҥ,манаа,VERB,Number=Sing|Person=2|Tense=Fut
1260,кэрдиэхтэрэ,кэр,VERB,Number=Plur|Person=3|Tense=Fut
1256,уураллар,уур,VERB,Number=Plur|Person=3|Tense=Pres


In [19]:
df2["POS"] = df2["POS"] + "|" + df2["FEATURES"] 
df2["POS"] = df2["POS"].str.replace("|_", "", regex=False)
df2["POS"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["POS"] = df2["POS"] + "|" + df2["FEATURES"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["POS"] = df2["POS"].str.replace("|_", "", regex=False)


51                                              ADJ
649                                             ADJ
525                                             ADJ
270                                             ADJ
1359                                   ADJ|Poss=Yes
                           ...                     
1278           VERB|Number=Plur|Person=1|Tense=Pres
1264            VERB|Number=Sing|Person=2|Tense=Fut
1260            VERB|Number=Plur|Person=3|Tense=Fut
1256           VERB|Number=Plur|Person=3|Tense=Pres
357     VERB|Number=Sing|Person=3|Tense=PastResultI
Name: POS, Length: 669, dtype: object

In [20]:
df2["POS"] = df2["POS"].str.replace("|", ">|<")

  df2["POS"] = df2["POS"].str.replace("|", ">|<")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["POS"] = df2["POS"].str.replace("|", ">|<")


In [21]:
df2["POS"] = df2["POS"].str.replace(r"(^)", "<", regex=True)
df2["POS"] = df2["POS"].str.replace(r"($)", ">", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["POS"] = df2["POS"].str.replace(r"(^)", "<", regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["POS"] = df2["POS"].str.replace(r"($)", ">", regex=True)


In [22]:
df2

Unnamed: 0,WORD,LEMMA,POS,FEATURES
51,кыра,кыра,<ADJ>,_
649,эмис,эмис,<ADJ>,_
525,элбэх,элбэх,<ADJ>,_
270,хара,хара,<ADJ>,_
1359,эрдиилээх,эрдии,<ADJ>|<Poss=Yes>,Poss=Yes
...,...,...,...,...
1278,ыйыыбыт,ыйаа,<VERB>|<Number=Plur>|<Person=1>|<Tense=Pres>,Number=Plur|Person=1|Tense=Pres
1264,маныаҕыҥ,манаа,<VERB>|<Number=Sing>|<Person=2>|<Tense=Fut>,Number=Sing|Person=2|Tense=Fut
1260,кэрдиэхтэрэ,кэр,<VERB>|<Number=Plur>|<Person=3>|<Tense=Fut>,Number=Plur|Person=3|Tense=Fut
1256,уураллар,уур,<VERB>|<Number=Plur>|<Person=3>|<Tense=Pres>,Number=Plur|Person=3|Tense=Pres


In [23]:
df2 = df2.drop("FEATURES", axis=1)

In [24]:
df2 = df2.sort_values(by=['POS'])#.head(10)
df2

Unnamed: 0,WORD,LEMMA,POS
51,кыра,кыра,<ADJ>
481,бөҕө,бөҕө,<ADJ>
1299,үрдүкү,үрдүкү,<ADJ>
104,кылгас,кылгас,<ADJ>
1504,үрдүк,үрдүк,<ADJ>
...,...,...,...
219,ыҥырар,ыҥыр,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
627,гынар,гын,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
689,устар,уст,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
353,күлэ,күл,<VERB>|<Tense=Pres>|<VerbForm=Trans>


In [25]:
df2 = df2[["LEMMA", "WORD", "POS"]]

In [26]:
len(df2.LEMMA.unique())

421

In [27]:
len(df2.POS.unique())

101

In [28]:
df2#.LEMM.unique()

Unnamed: 0,LEMMA,WORD,POS
51,кыра,кыра,<ADJ>
481,бөҕө,бөҕө,<ADJ>
1299,үрдүкү,үрдүкү,<ADJ>
104,кылгас,кылгас,<ADJ>
1504,үрдүк,үрдүк,<ADJ>
...,...,...,...
219,ыҥыр,ыҥырар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
627,гын,гынар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
689,уст,устар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
353,күл,күлэ,<VERB>|<Tense=Pres>|<VerbForm=Trans>


In [29]:
nouns = ["NOUN", "PROPN", "NUM", "PRON", "DET"]

In [30]:
df_nouns = df2[df2["POS"].str.contains('|'.join(nouns))]
df_verbs = df2[~df2["POS"].str.contains('|'.join(nouns))]

In [31]:
import re
df_nouns.head()

Unnamed: 0,LEMMA,WORD,POS
690,бу,бу,<DET>
4,аайы,аайы,<DET>
454,бу,маны,<DET>|<Case=Acc>
1123,яблоко,яблокаттан,<NOUN>|<Case=Abl>|<Number=Sing>
1432,ыстаал,ыстаалтан,<NOUN>|<Case=Abl>|<Number=Sing>


In [32]:
df_nouns['POS'] = (df_nouns['POS']#.add('|')
            .str.replace(r'(<NOUN>)(\|)(.+)(\|)(<Number=.*)', r'\1\5\3', regex=True)
              )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nouns['POS'] = (df_nouns['POS']#.add('|')


In [33]:
df_nouns['POS'] = df_nouns['POS'].str.replace("><", ">|<")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nouns['POS'] = df_nouns['POS'].str.replace("><", ">|<")


In [34]:
df_nouns

Unnamed: 0,LEMMA,WORD,POS
690,бу,бу,<DET>
4,аайы,аайы,<DET>
454,бу,маны,<DET>|<Case=Acc>
1123,яблоко,яблокаттан,<NOUN>|<Number=Sing>|<Case=Abl>
1432,ыстаал,ыстаалтан,<NOUN>|<Number=Sing>|<Case=Abl>
...,...,...,...
145,петров,петровка,<PROPN>|<Case=Dat>
919,айтал,айталга,<PROPN>|<Case=Dat>
150,дьокуускай,дьокуускайга,<PROPN>|<Case=Dat>
154,москва,москваҕа,<PROPN>|<Case=Dat>


In [35]:
df_full = df_nouns.append(df_verbs)

  df_full = df_nouns.append(df_verbs)


In [36]:
df_full

Unnamed: 0,LEMMA,WORD,POS
690,бу,бу,<DET>
4,аайы,аайы,<DET>
454,бу,маны,<DET>|<Case=Acc>
1123,яблоко,яблокаттан,<NOUN>|<Number=Sing>|<Case=Abl>
1432,ыстаал,ыстаалтан,<NOUN>|<Number=Sing>|<Case=Abl>
...,...,...,...
219,ыҥыр,ыҥырар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
627,гын,гынар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
689,уст,устар,<VERB>|<Number=Sing>|<Person=3>|<Tense=Pres>
353,күл,күлэ,<VERB>|<Tense=Pres>|<VerbForm=Trans>


In [37]:
df_full.to_csv("../FST/eval_full/sah_full.dev", sep="\t", encoding="utf-8", index = False, header=False)
df_full.to_csv("../FST/eval_full/sah_full.train", sep="\t", encoding="utf-8", index = False, header=False)
df_nouns.to_csv("../FST/eval_noun/sah_nouns.dev", sep="\t", encoding="utf-8", index = False, header=False)
df_verbs.to_csv("../FST/eval_verb/sah_verbs.dev", sep="\t", encoding="utf-8", index = False, header=False)