In [None]:
import random
import numpy as np

def fix_random_seeds(seed):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Args:
        seed: Value handed unchanged to `random.seed` and `np.random.seed`.
    """
    # Same order as before: the standard library first, then NumPy.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Veri kümesini okuma

In [None]:
import re
import pandas as pd

In [None]:
# Load the raw dataset; the code below relies on a `label` and a `text` column.
ttc_df = pd.read_csv("data/ttc_3600.csv")


def filter_unicodes(text):
    """Remove every character that is neither ASCII nor a Turkish-specific letter."""
    return re.sub(r'[^\x00-\x7fışğüöçİĞÜŞÇÖ]', r'', text)


ttc_df["text"] = ttc_df["text"].apply(filter_unicodes)

# Bring the frame into the fastText supervised format: every label gets the
# "__label__" prefix and every text a leading space.
# This is done with whole-column operations rather than a row loop writing via
# `ttc_df["text"][i] = ...`: that chained assignment raises
# SettingWithCopyWarning and is not guaranteed to modify `ttc_df` at all.
ttc_df["text"] = " " + ttc_df["text"]
ttc_df["label"] = "__label__" + ttc_df["label"]

In [None]:
from augmentator import BertAugmentator

# Settings handed to the BERT-based augmenter.
# NOTE(review): `frac` presumably is the share of tokens the augmenter
# replaces per sentence - confirm against BertAugmentator.
augmentation_config = dict(
    model_name="dbmdz/bert-base-turkish-cased",
    frac=0.2,
)

bert_augmentator = BertAugmentator(augmentation_config=augmentation_config)

In [None]:
from sentencize import Sentencizer

# The dataset consists mostly of paragraphs, so sentence segmentation
# has to be applied to the texts.
sentencizer = Sentencizer()

# Eğitim ve test kümesine ayırma

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set, using a fixed random_state.
X_train, X_test = train_test_split(ttc_df, test_size=0.2, random_state=42)

In [None]:
# Randomly sample (with a fixed random_state) half of the training rows;
# augmentation is applied to this subset.
to_augment_x_train = X_train.sample(frac=0.5, random_state=42)

# Veri arttırımını uygulama

In [None]:
from tqdm import tqdm

In [None]:
# Collected (label, augmented_text) pairs, one per sampled training row.
augmented_samples = []

progress = tqdm(to_augment_x_train.iterrows(), total=len(to_augment_x_train))
for _, sample in progress:
    # Columns are read by position: first the label, then the text.
    label, text = sample.values[0], sample.values[1]

    # Only the first element of the sentencizer result is iterated; each item
    # in it is augmented on its own and the results are re-joined with spaces.
    sentences = sentencizer.sentencize(text)[0]
    augmented_text = " ".join(
        bert_augmentator.augment(sent) for sent in sentences
    )

    augmented_samples.append((label, augmented_text))

In [None]:
# Collect the augmented samples into a DataFrame and save them as CSV
# (index=False keeps the row index out of the file).
augmented_df = pd.DataFrame(augmented_samples, columns=["label", "text"])
augmented_df.to_csv("augmented_samples.csv", index=False)

In [None]:
# The augmented samples come out in the form produced by the BERT tokenizer,
# so the original training and test texts go through the same tokenizer.
def bert_tokenize(text):
    """Tokenize `text` with the augmenter's tokenizer and re-join the pieces.

    Tokens are joined with single spaces and every " ##" is then removed.
    """
    tokens = bert_augmentator.pipeline.tokenizer.tokenize(text)
    return " ".join(tokens).replace(" ##", "")


# X_train / X_test are row subsets of ttc_df. `assign` builds new frames
# instead of writing a column into those subsets through attribute access,
# which avoids SettingWithCopyWarning.
X_train = X_train.assign(text=X_train["text"].apply(bert_tokenize))
X_test = X_test.assign(text=X_test["text"].apply(bert_tokenize))

# Combine the original and the augmented data. ignore_index=True gives the
# result a fresh 0..n-1 index; otherwise the row labels of `augmented_df`
# would collide with those carried over from `X_train`.
X_train_augmented = pd.concat([X_train, augmented_df], ignore_index=True)

In [None]:
def save_to_file(filename, df):
    """Write `df` to a text file, one row per line.

    Each line is the row's first column, a space, the row's second column and
    a newline. Columns are taken by position, not by name.

    Args:
        filename: Path of the file to create or overwrite.
        df: DataFrame whose first two columns hold strings.
    """
    # The encoding is set explicitly: the texts contain Turkish characters,
    # and without it the bytes written depend on the platform's default.
    with open(filename, "w", encoding="utf-8") as f:
        # Plain tuples are enough here and cheaper than a Series per row.
        for row in df.itertuples(index=False, name=None):
            f.write(row[0] + ' ' + row[1] + '\n')

In [None]:
# Write the final version of each dataset to its own file.
for out_path, frame in (
    ("X_tr.train", X_train),
    ("X_tr_augmented.train", X_train_augmented),
    ("X_test.test", X_test),
):
    save_to_file(filename=out_path, df=frame)

In [None]:
# Önceden eğitilmiş fastText embeddingleri
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz
# !gunzip cc.tr.300.vec.gz

# Sınıflandırma

In [None]:
import fasttext

In [None]:
# Baseline classifier, trained on the file holding the original training data.
# NOTE(review): fix_random_seeds only seeds Python's `random` and NumPy; it is
# unclear whether that reaches fastText's own training - confirm, and check
# whether train_supervised needs its own seed/thread settings to be repeatable.
fix_random_seeds(seed=42)
baseline_train_args = dict(
    input="X_tr.train",
    lr=1.0,
    epoch=25,
    wordNgrams=2,
    dim=300,
    pretrainedVectors="cc.tr.300.vec",
)
model = fasttext.train_supervised(**baseline_train_args)

In [None]:
# Second classifier, trained on the file holding original plus augmented data.
# NOTE(review): fix_random_seeds only seeds Python's `random` and NumPy; it is
# unclear whether that reaches fastText's own training - confirm, and check
# whether train_supervised needs its own seed/thread settings to be repeatable.
fix_random_seeds(seed=42)
augmented_train_args = dict(
    input="X_tr_augmented.train",
    lr=1.0,
    epoch=25,
    wordNgrams=2,
    dim=300,
    pretrainedVectors="cc.tr.300.vec",
)
model_augmented = fasttext.train_supervised(**augmented_train_args)

# Sonuçlar ve Değerlendirme

In [None]:
from sklearn.metrics import classification_report

In [None]:
def predict(model, test_data):
    """Return the top prediction of `model` for every row of `test_data`.

    Args:
        model: Object whose `predict(text)` result is indexed with `[0][0]`
            to obtain the predicted label.
        test_data: DataFrame whose second column (by position) is the text.

    Returns:
        A list with one predicted label per row, in row order.
    """
    return [
        model.predict(row.values[1])[0][0]
        for _, row in test_data.iterrows()
    ]

In [None]:
# Run both classifiers over the same test set so that their predictions
# can be compared row by row.
model_predictions = predict(model, X_test)
model_augmented_predictions = predict(model_augmented, X_test)

In [None]:
# Ground-truth labels of the test set. `y_true` was not defined anywhere in
# the notebook, so this cell failed with NameError on a fresh kernel.
# The labels already carry the "__label__" prefix, like the predictions, and
# are in the same row order that `predict` iterates over.
y_true = X_test["label"].tolist()

model1_report = classification_report(y_true, model_predictions, digits=3)
model_augmented_report = classification_report(y_true, model_augmented_predictions, digits=3)

In [None]:
# Report for the model trained on the original data only.
print(model1_report)

In [None]:
# Report for the model trained on the original plus augmented data.
print(model_augmented_report)