# Cyrillic Dataset of Turkic languages spoken in Russia and in former USSR

https://www.cambridge.org/core/books/abs/communicating-with-asia/russian-and-turkic-languages-in-central-asia/D6537B03A7F97DF870BBDF21A2536A02

## The Dataset is a part of the Leipzig Corpora (Wikipedia) Collection

D. Goldhahn, T. Eckart & U. Quasthoff: Building Large Monolingual Dictionaries at the Leipzig Corpora Collection: From 100 to 200 Languages.
In: Proceedings of the 8th International Language Resources and Evaluation (LREC'12), 2012 

In [1]:
import glob
import os
import re
import pandas as pd
from tqdm import tqdm

In [2]:
folder_path = "/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/"
file_list = glob.glob(os.path.join(folder_path, "**/*sentences.txt"), recursive=True)
file_list

['/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/bak_wikipedia_2021_100K/bak_wikipedia_2021_100K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/sah_wikipedia_2021_100K/sah_wikipedia_2021_100K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/krc_wikipedia_2016_10K/krc_wikipedia_2016_10K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/tyv_wikipedia_2021_10K/tyv_wikipedia_2021_10K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/kir_wikipedia_2021_100K/kir_wikipedia_2021_100K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/tat_wikipedia_2021_100K/tat_wikipedia_2021_100K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/kaz_wikipedia_2021_100K/kaz_wikipedia_2021_100K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/TurkRusClassification/chv_wikipedia_2021_10K/chv_wikipedia_2021_10K-sentences.txt',
 '/Users/tatiana/Desktop/Codes/NLP/Tur

In [3]:
dfs = []
for file_path in file_list:
    with open(file_path, 'r') as f:
        file_name = file_path.split('/')[-1]
        language = file_name.split("_")[0]

        for i, line in tqdm(enumerate(f), desc=f"Processing {file_name}"):
            sentence = line.strip().split("\t")
            dfs.append(pd.DataFrame({'index': i, 'text': sentence[1], 'label': language}, index=[0]))


Processing bak_wikipedia_2021_100K-sentences.txt: 100000it [00:13, 7232.92it/s]
Processing sah_wikipedia_2021_100K-sentences.txt: 100000it [00:13, 7259.85it/s]
Processing krc_wikipedia_2016_10K-sentences.txt: 10000it [00:01, 7718.51it/s]
Processing tyv_wikipedia_2021_10K-sentences.txt: 10000it [00:01, 7819.67it/s]
Processing kir_wikipedia_2021_100K-sentences.txt: 100000it [00:13, 7149.12it/s]
Processing tat_wikipedia_2021_100K-sentences.txt: 100000it [00:13, 7399.31it/s]
Processing kaz_wikipedia_2021_100K-sentences.txt: 100000it [00:13, 7251.89it/s]
Processing chv_wikipedia_2021_10K-sentences.txt: 10000it [00:01, 7825.48it/s]
Processing rus_wikipedia_2021_100K-sentences.txt: 100000it [00:14, 7071.92it/s]


In [4]:
df = pd.concat(dfs, ignore_index=True)

In [6]:
df['label'].value_counts()

bak    100000
sah    100000
kir    100000
tat    100000
kaz    100000
rus    100000
krc     10000
tyv     10000
chv     10000
Name: label, dtype: int64

In [7]:
df

Unnamed: 0,index,text,label
0,0,"""1001 кисә""нең Ханна һөйләгән хикәйәләре ингән...",bak
1,1,«112-се Башҡорт атлы дивизияһы яугире Шафиҡов ...,bak
2,2,«…11 февралдә 163 һәм 219 уҡсылар полкы Нарва ...,bak
3,3,"«121», Дмитрий Пригов тексына ирҙәр тауышы һәм...",bak
4,4,«12» фильмы өсөн Венеция кинофестиваленең (200...,bak
...,...,...,...
629995,99995,"· 弄啥类，что за вещь, что делать—意为“干什么呢”。",rus
629996,99996,"服部 安信) — друг Ямато, популярный у девушек лове...",rus
629997,99997,"» (カミュなんて知らない, 2005), демонстрировавшийся в ра...",rus
629998,99998,"» (系辞传 xìcí zhuàn), который входит в состав та...",rus


In [8]:
df = df[["text", "label"]]

In [9]:
df.to_csv("cyrillic_turkic_langs.csv", encoding="utf-8", index=False)

## Equally splitting the corpus for train, test, and valid

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("cyrillic_turkic_langs.csv", encoding="utf-8")

df_krc_tyv = df[df["label"].isin(["krc","tyv","chv"])]
df = df[~df["label"].isin(["krc","tyv","chv"])]
df

Unnamed: 0,text,label
0,"""1001 кисә""нең Ханна һөйләгән хикәйәләре ингән...",bak
1,«112-се Башҡорт атлы дивизияһы яугире Шафиҡов ...,bak
2,«…11 февралдә 163 һәм 219 уҡсылар полкы Нарва ...,bak
3,"«121», Дмитрий Пригов тексына ирҙәр тауышы һәм...",bak
4,«12» фильмы өсөн Венеция кинофестиваленең (200...,bak
...,...,...
629995,"· 弄啥类，что за вещь, что делать—意为“干什么呢”。",rus
629996,"服部 安信) — друг Ямато, популярный у девушек лове...",rus
629997,"» (カミュなんて知らない, 2005), демонстрировавшийся в ра...",rus
629998,"» (系辞传 xìcí zhuàn), который входит в состав та...",rus


In [25]:
languages_krc_tyv = df_krc_tyv.groupby('label')
lang_krc_tyv_dfs = [lang_df_kt for _, lang_df_kt in languages_krc_tyv]
#lang_krc_tyv_dfs

In [26]:
languages = df.groupby('label')
lang_full_dfs = [lang_df for _, lang_df in languages]
#lang_full_dfs

In [27]:
train_dfs, test_dfs, valid_dfs = [], [], []

In [28]:
for lang_df in lang_krc_tyv_dfs:
    train_df, test_valid_df = train_test_split(lang_df, test_size=0.2, random_state=42)
    test_df, valid_df = train_test_split(test_valid_df, test_size=0.5, random_state=42)
    train_dfs.append(train_df)
    test_dfs.append(test_df)
    valid_dfs.append(valid_df)

In [29]:
for lang_df in lang_full_dfs:
    lang_df = lang_df.sample(n=10000, random_state=42)
    train_df, test_valid_df = train_test_split(lang_df, test_size=0.2, random_state=42)
    test_df, valid_df = train_test_split(test_valid_df, test_size=0.5, random_state=42)
    #print(train_df)
    
    train_dfs.append(train_df)
    test_dfs.append(test_df)
    valid_dfs.append(valid_df)

In [16]:
train_dfs = pd.concat(train_dfs)
train_dfs = train_dfs.sample(frac=1).reset_index(drop=True)
train_dfs = train_dfs.reset_index(drop=True)

test_dfs = pd.concat(test_dfs)
test_dfs = test_dfs.sample(frac=1).reset_index(drop=True)
test_dfs = test_dfs.reset_index(drop=True)

valid_dfs = pd.concat(valid_dfs)
valid_dfs = valid_dfs.sample(frac=1).reset_index(drop=True)
valid_dfs = valid_dfs.reset_index(drop=True)

In [20]:
train_dfs.to_csv("train.csv", encoding="utf-8", index=False)
test_dfs.to_csv("test.csv", encoding="utf-8", index=False)
valid_dfs.to_csv("valid.csv", encoding="utf-8", index=False)