# Import

In [1]:
import re
import glob
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from uuid import uuid4

In [2]:
# Uncomment and run once if the NLTK 'punkt' tokenizer data is not installed yet:
# nltk.download('punkt')

# Reading texts from Turkmen Wiki

In [3]:
def split_keep_sep(string: str, sep: str) -> list:
    """Split ``string`` on ``sep`` and keep the separator at the start of each chunk.

    Every non-empty chunk that follows an occurrence of ``sep`` is returned
    with ``sep`` prepended. Non-empty text that precedes the first occurrence
    of ``sep`` is returned unchanged: it never had the separator in front of
    it, so none is invented for it.

    Args:
        string: Text to split.
        sep: Literal separator (escaped before being used as a regex).

    Returns:
        List of non-empty chunks in their original order.
    """
    # Splitting without a capture group yields only the text between
    # separators; the first element is whatever came before the first one.
    head, *tail = re.split(re.escape(sep), string)
    chunks = [head] if head else []
    chunks.extend(sep + piece for piece in tail if piece)
    return chunks

def remove_html_tags(text: str) -> str:
    """Return ``text`` with every ``<...>`` tag-like span deleted.

    The match is non-greedy, so each span ends at the first ``>`` after its
    opening ``<``. ``.`` does not match a newline here, so a span must open
    and close on the same line to be removed.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def remove_special_chars(text: str, char_list: list) -> str:
    """Delete each entry of ``char_list`` from ``text`` and normalise NBSPs.

    Entries are removed one after another, in list order, so a later entry is
    matched against the text left over by the earlier removals. Afterwards
    every non-breaking space (U+00A0) is replaced by a regular space.
    """
    stripped = text
    for unwanted in char_list:
        stripped = stripped.replace(unwanted, '')
    # Non-breaking spaces are swapped for plain spaces rather than removed.
    return stripped.replace(u'\xa0', u' ')

def process_wiki_file(wiki_file: str) -> pd.DataFrame:
    """Read one extracted-wiki file and return its articles as a DataFrame.

    The file content is split into articles at every ``<doc id=`` marker.
    Each article has its tag-like ``<...>`` spans and newlines removed and its
    non-breaking spaces replaced by plain spaces; articles with fewer than 50
    whitespace-separated tokens are dropped.

    Args:
        wiki_file: Path of a UTF-8 encoded text file.

    Returns:
        DataFrame with a single ``article`` column, one row per kept article,
        in file order. The ``article`` column is present even when no article
        passes the length filter.
    """
    # Newlines are deleted rather than replaced by a space, so text on
    # adjacent lines ends up joined directly.
    chars = ['\n']
    with open(wiki_file, encoding='utf-8') as f:
        content = f.read()

    # Collect rows in a plain list and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for article in split_keep_sep(content, '<doc id='):
        article = remove_special_chars(remove_html_tags(article), chars)
        if len(article.split()) < 50:
            continue
        records.append({'article': article})
    return pd.DataFrame(records, columns=['article'])

In [4]:
# Collect every path two levels below TurkmenWiki/. glob yields paths in
# arbitrary, OS-dependent order, so sort them to keep the row order of the
# resulting dataset reproducible between runs.
wiki_files = sorted(glob.iglob("TurkmenWiki/*/*"))

In [5]:
# Process every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
frames = [process_wiki_file(file_name) for file_name in tqdm(wiki_files)]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found. ignore_index=True gives the combined frame a fresh 0..n-1
# index.
dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

100%|█████████████████████████████████████████| 133/133 [00:39<00:00,  3.34it/s]


In [6]:
# Persist the collected articles; index=False keeps the row index out of the file.
dataframe.to_csv("TurkmenWikiTexts.csv", index=False)

# Check the saved CSV

In [7]:
# Reload TurkmenWikiTexts.csv from disk, replacing the in-memory frame.
dataframe = pd.read_csv("TurkmenWikiTexts.csv")

In [8]:
# Show the reloaded frame as the cell output.
dataframe

Unnamed: 0,article
0,"Evenklar muxtor okrugiEvenklar muxtor okrugi, ..."
1,EvkaliptEvkalipt (Yeisa1urgi8) — mirtadoshlarg...
2,"EvklazEvklaz (yun. — yaxshi, yengil va — buzil..."
3,EvkommiyaEvkommiya (Yeisotppa) — evkommiyadosh...
4,Evolventa va evolyutaEvolventa va evolyuta (lo...
...,...
35217,Roswell (Georgia)Roswell AQShning Georgia shta...
35218,West Point (Georgia)West Point AQShning Georgi...
35219,Allentown (Georgia)Allentown AQShning Georgia ...
35220,Bayanavul (tuman)Bayanavul tumani — Pavlodar v...
