# Import

In [1]:
# One-time setup: uncomment to install the `devon` package, which provides the
# FSMStemmer imported below.
# !pip install devon

In [2]:
import json
import re
from typing import Optional

import nltk
import numpy as np
import pandas as pd
from devon.devon import FSMStemmer
from tqdm import tqdm

In [3]:
# Extra tokens that are appended to the stop-word list loaded from JSON.
STOPWORDS_ADD = ["a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa", "aaba"]

In [4]:
# Load the raw articles.
df_wiki_texts = pd.read_csv("TurkmenWikiTexts.csv")

# Build the stop-word list: the JSON list plus the manual additions.
# The context manager guarantees the file handle is closed after reading.
# NOTE(review): the file is opened with the platform default encoding, as before;
# confirm whether an explicit encoding is needed on non-UTF-8 systems.
with open("TurkmenStopWords.json") as stop_words_file:
    stop_words = json.load(stop_words_file) + STOPWORDS_ADD

In [5]:
# Preview the raw articles.
df_wiki_texts

Unnamed: 0,article
0,"Evenklar muxtor okrugiEvenklar muxtor okrugi, ..."
1,EvkaliptEvkalipt (Yeisa1urgi8) — mirtadoshlarg...
2,"EvklazEvklaz (yun. — yaxshi, yengil va — buzil..."
3,EvkommiyaEvkommiya (Yeisotppa) — evkommiyadosh...
4,Evolventa va evolyutaEvolventa va evolyuta (lo...
...,...
35217,Roswell (Georgia)Roswell AQShning Georgia shta...
35218,West Point (Georgia)West Point AQShning Georgi...
35219,Allentown (Georgia)Allentown AQShning Georgia ...
35220,Bayanavul (tuman)Bayanavul tumani — Pavlodar v...


# Clean Texts

In [6]:
def _split_upper(word: str) -> list:
    upper_word_list = re.split("(?=[A-Z])", word)
    if upper_word_list[0] == "":
        return upper_word_list[1:]
    return upper_word_list

def _check_word_len(word: str, min_len: int = 3, max_len: int = 30) -> bool:
    if len(word) >= min_len and len(word) <= max_len:
        return word
    return None

def _delete_apostrof(word: str) -> str:
    apostrofs = ["'", "ʻ", "ʼ"]
    if word in apostrofs:
        return ""
    if word[0] in apostrofs:
        word = word[1:]
    if word[-1] in apostrofs:
        word = word[:-1]
    return word

def _check_stop_words(word: str, stop_words: list) -> str:
    if word in stop_words:
        return ""
    return word

def _clean_pipline(words_list: np.ndarray, custom_stop_words: Optional[list] = None) -> pd.Series:
    """Normalize, filter and stem the words of one sentence.

    Steps, in order: trim outer apostrophes, lowercase, remove stop words,
    stem, keep only words accepted by `_check_word_len`, then remove stems
    that are themselves stop words.

    Args:
        words_list: The raw words of a single sentence.
        custom_stop_words: Stop words to filter out. When None, the
            notebook-level `stop_words` list is used.

    Returns:
        A Series (fresh 0..n-1 index) of the surviving stems; it contains no
        empty strings.
    """
    active_stop_words = stop_words if custom_stop_words is None else custom_stop_words

    def _drop_stop_words(words: pd.Series) -> pd.Series:
        # _check_stop_words marks stop words with "" (not NaN), so dropna()
        # cannot remove them; filter the empty strings explicitly.
        marked = words.apply(lambda word: _check_stop_words(word, active_stop_words))
        return marked[marked != ""]

    words = pd.Series(words_list, dtype=object)
    words = words.apply(_delete_apostrof)
    words = words.apply(lambda word: word.lower())
    words = _drop_stop_words(words)
    # NOTE(review): a new FSMStemmer is built for every word. If the stemmer is
    # stateless it could be created once per call — confirm before hoisting.
    words = words.apply(lambda word: FSMStemmer().stem(words=word)[0])
    words = words.apply(_check_word_len).dropna()
    words = _drop_stop_words(words)
    return words.reset_index(drop=True)


def clean_text(text: str, stop_words: list) -> str:
    """Turn a raw article into cleaned, stemmed sentences.

    Only ASCII letters, spaces, dots and the apostrophe characters "'", "ʻ",
    "ʼ" are kept. The text is split into sentences on ".", every token is
    further split in front of ASCII capital letters, and each sentence is
    passed through `_clean_pipline`.

    Args:
        text: The raw article text.
        stop_words: Stop words to remove; forwarded to the cleaning pipeline.

    Returns:
        The non-empty cleaned sentences, each followed by ". ", concatenated
        into one string ("" when nothing survives).
    """
    raw_sentences = ' '.join(re.findall(r"[A-Za-z 'ʻʼ.]+", text)).split(sep='.')
    cleaned_sentences = []
    for sentence in raw_sentences:
        # Collect tokens in a plain list; growing a NumPy array with np.append
        # re-copies the whole array for every word.
        tokens = []
        for word in sentence.split():
            tokens.extend(_split_upper(word))
        if not tokens:
            continue
        cleaned_words = _clean_pipline(np.array(tokens, dtype=object), stop_words)
        sentence_text = ' '.join(' '.join(cleaned_words.values).split())
        if sentence_text == '':
            continue
        cleaned_sentences.append(sentence_text + '. ')
    return ''.join(cleaned_sentences)

In [7]:
# Drop incomplete rows *before* cleaning: clean_text() runs a regex over each
# article and raises on a missing (NaN) value, so dropping afterwards is too late.
df_wiki_texts = df_wiki_texts.dropna()
df_wiki_texts = df_wiki_texts.assign(
    clean_text=df_wiki_texts['article'].apply(lambda text: clean_text(text, stop_words))
)

In [8]:
# Compare the raw articles with their cleaned text.
df_wiki_texts

Unnamed: 0,article,clean_text
0,"Evenklar muxtor okrugiEvenklar muxtor okrugi, ...",evenk muxtor okrug evenk muxtor okrug evenkiya...
1,EvkaliptEvkalipt (Yeisa1urgi8) — mirtadoshlarg...,evkalipt evkalipt yeisa urg mirtadosh mansub y...
2,"EvklazEvklaz (yun. — yaxshi, yengil va — buzil...",evklaz evklaz yun. yengil buzilish maydalanish...
3,EvkommiyaEvkommiya (Yeisotppa) — evkommiyadosh...,evkommiya evkommiya yeisotppa evkommiyadosh oi...
4,Evolventa va evolyutaEvolventa va evolyuta (lo...,evolventa evolyuta evolventa evolyuta lot. yeu...
...,...,...
35217,Roswell (Georgia)Roswell AQShning Georgia shta...,roswell georgia roswell georgia shtat joylashg...
35218,West Point (Georgia)West Point AQShning Georgi...,west point georgia west point georgia shtat jo...
35219,Allentown (Georgia)Allentown AQShning Georgia ...,allentown georgia allentown georgia shtat joyl...
35220,Bayanavul (tuman)Bayanavul tumani — Pavlodar v...,bayanavul tuman bayanavul tuma pavlodar viloya...


In [9]:
def create_textshape_data(dataframe: pd.DataFrame, text_shape: list, column: str) -> pd.DataFrame:
    """Count, for each threshold, the rows whose text has more words than it.

    Words are whitespace-separated tokens of the text in `column`.

    Args:
        dataframe: Frame holding the texts.
        text_shape: Word-count thresholds to evaluate.
        column: Name of the text column.

    Returns:
        A frame with one row per threshold and the columns 'text_shape'
        (the threshold) and 'pages_amount' (rows with strictly more words).
    """
    # Count the words once instead of re-splitting every text per threshold.
    word_counts = dataframe[column].apply(lambda text: len(text.split()))
    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {'text_shape': shape, 'pages_amount': int((word_counts > shape).sum())}
        for shape in text_shape
    ]
    return pd.DataFrame(records, columns=['text_shape', 'pages_amount'])

In [10]:
# Count how many cleaned articles exceed each candidate word-count threshold.
create_textshape_data(df_wiki_texts, [100, 130, 150, 300, 500, 1000], 'clean_text')

Unnamed: 0,text_shape,pages_amount
0,100,14939
1,130,10533
2,150,8600
3,300,3137
4,500,1385
5,1000,510


Since we need a corpus of roughly 10,000 documents, we keep only the articles whose cleaned text contains more than 130 words; according to the table above, that leaves 10,533 articles.

In [11]:
# Keep only the articles whose cleaned text has more than 130
# whitespace-separated tokens, and renumber the surviving rows from 0.
lambda_ = lambda text: len(text.split()) > 130
is_long_enough = df_wiki_texts['clean_text'].apply(lambda_)
df_corpus = df_wiki_texts[is_long_enough].reset_index(drop=True)

In [12]:
# Keep only the cleaned-text column (the double brackets keep the result a DataFrame).
df_clean_corpus = df_corpus[['clean_text']]

In [13]:
# Preview the filtered corpus.
df_clean_corpus

Unnamed: 0,clean_text
0,evenk muxtor okrug evenk muxtor okrug evenkiya...
1,evolyutsion taʼlimot evolyutsion taʼlimot evol...
2,evristika evristika yun. peshtzko izlayman top...
3,evfemiz evfemiz yun. yeirpegsha yumshoq ifodal...
4,egar egar otulov eshak xachir bugʻu ust urilad...
...,...
10528,favvora favvora biror manba suv biror tushib i...
10529,toʻliq mundsd ttir talaffuz byork gudmundsdott...
10530,maks gorkiy noml umumiy oʻrta maktab maks gork...
10531,bayanavul tuman bayanavul tuma pavlodar viloya...


In [14]:
def reshape_dataframe_by_dot_split(dataframe: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """Explode each text into one row per sentence.

    Every text in `text_column` is split on ".", whitespace inside each piece
    is collapsed to single spaces, and empty pieces are skipped.

    Args:
        dataframe: Frame holding the texts.
        text_column: Name of the column with dot-separated sentences.

    Returns:
        A frame with the columns 'article_index' (0-based position of the
        source row in `dataframe`) and `text_column` (one sentence per row).
    """
    texts = dataframe[text_column]
    records = []
    # Iterate by position so the function does not depend on the frame having
    # a default 0..n-1 index.
    for article_index, text in enumerate(tqdm(texts, total=len(texts))):
        for raw_sentence in text.split(sep='.'):
            sentence = ' '.join(raw_sentence.split())
            if sentence == '':
                continue
            records.append({'article_index': article_index, text_column: sentence})
    # Build the frame once: appending row by row copies the whole frame on
    # every call, and DataFrame.append no longer exists in pandas 2.0.
    return pd.DataFrame(records, columns=['article_index', text_column])

In [15]:
# Split every cleaned article into one row per sentence.
df_corpus_dot_split = reshape_dataframe_by_dot_split(df_clean_corpus, 'clean_text')

100%|█████████████████████████████████████| 10533/10533 [47:35<00:00,  3.69it/s]


In [16]:
# Preview the sentence-level corpus.
df_corpus_dot_split

Unnamed: 0,article_index,clean_text
0,0,evenk muxtor okrug evenk muxtor okrug evenkiya...
1,0,yil dek
2,0,tashkil
3,0,sharqiy sibir joylashgan
4,0,maydo
...,...,...
426434,10532,viete formula albert girard viete avval topgan...
426435,10532,asr yashagan britan matematig charles hutton k...
426436,10532,hutton yozad yaʼ viete tenglama ildiz koeffits...
426437,10532,koʻpchilik viete formula rivojlanish qoʻshgan ...


In [17]:
# Save the sentence-level corpus; index=False leaves the row index out of the file.
df_corpus_dot_split.to_csv("TurkmenCleanCorpusDotSplit.csv", index=False)

In [18]:
# Re-assemble one row per article by joining its sentences with single spaces
# (the sentence-ending dots are not restored).
df_uz_corpus = (
    df_corpus_dot_split
    .groupby(by='article_index')
    .agg({'clean_text': ' '.join})
    .reset_index()
)

In [19]:
# Preview the article-level corpus.
df_uz_corpus

Unnamed: 0,article_index,clean_text
0,0,evenk muxtor okrug evenk muxtor okrug evenkiya...
1,1,evolyutsion taʼlimot evolyutsion taʼlimot evol...
2,2,evristika evristika yun peshtzko izlayman topa...
3,3,evfemiz evfemiz yun yeirpegsha yumshoq ifodala...
4,4,egar egar otulov eshak xachir bugʻu ust urilad...
...,...,...
10528,10528,favvora favvora biror manba suv biror tushib i...
10529,10529,toʻliq mundsd ttir talaffuz byork gudmundsdott...
10530,10530,maks gorkiy noml umumiy oʻrta maktab maks gork...
10531,10531,bayanavul tuman bayanavul tuma pavlodar viloya...


In [20]:
# Save the article-level corpus; index=False leaves the row index out of the file.
df_uz_corpus.to_csv("TurkmenCleanCorpus.csv", index=False)