In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split

In [None]:
# Load the tab-separated token/tag file (no header row, so the two columns are
# named here) and forward-fill empty cells from the row above.
data = (
    pd.read_csv("conll/dataset.conll", sep="\t", names=["Word", "tag"])
    .ffill()
)
data

In [None]:
# How many rows carry each tag.
data["tag"].value_counts()

In [None]:
# Distinct tags present in the data set.
data["tag"].unique()

In [None]:
# Tag frequencies (same query as the earlier value_counts cell).
data["tag"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Split by whole sentences, not by individual token rows.
# train_test_split shuffles its input, so splitting the token-level frame directly
# scrambles word order and the "." based sentence numbering done afterwards would
# group unrelated tokens. Instead: number the sentences (a "." token closes one),
# split the sentence ids, then select rows so their original order is preserved.
_is_sentence_end = data["Word"].astype(str).str.strip().eq(".")
_sentence_of_row = _is_sentence_end.shift(1, fill_value=False).cumsum()
_train_sentence_ids, _val_sentence_ids = train_test_split(
    _sentence_of_row.unique(), test_size=0.10, random_state=2018
)
# .copy() gives independent frames, so adding columns later does not trigger
# SettingWithCopyWarning.
train_df = data[_sentence_of_row.isin(_train_sentence_ids)].copy()
val_df = data[_sentence_of_row.isin(_val_sentence_ids)].copy()

In [None]:
# Number of token rows in the training split.
train_df.shape[0]

In [None]:
import tqdm  # kept: rebinds `tqdm` to the module, which the later `tqdm.tqdm(...)` call relies on

# Give every training row a sentence id. A row gets the number of "." tokens seen
# strictly before it, so the "." itself still belongs to the sentence it closes.
# Vectorised replacement for the row-by-row iterrows()/.at loop: same ids, no
# per-row Python overhead, and assign() returns a new frame instead of writing
# into a slice of `data`.
_train_is_end = train_df["Word"].astype(str).str.strip().eq(".")
train_df = train_df.assign(
    Sent_ID=_train_is_end.shift(1, fill_value=False).cumsum()
)



In [None]:
# Give every validation row a sentence id: the count of "." tokens seen strictly
# before the row, so a "." stays in the sentence it terminates. Vectorised form of
# the original iterrows()/.at loop; assign() returns a new frame rather than
# mutating a slice of `data`.
_val_is_end = val_df["Word"].astype(str).str.strip().eq(".")
val_df = val_df.assign(
    Sent_ID=_val_is_end.shift(1, fill_value=False).cumsum()
)
val_df

In [None]:
class GetSentence(object):
    """Group a token-level frame into sentences of ``(word, tag)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"Word"``, ``"tag"`` and ``"Sent_ID"``.

    Attributes
    ----------
    grouped : pandas.Series
        One entry per ``Sent_ID`` value; each entry is a list of
        ``(word, tag)`` tuples in row order.
    sentences : list
        The entries of ``grouped`` as a plain list.
    n_sent : int
        Label of the sentence that ``get_next`` will return next (starts at 1).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair each word of one sentence with its tag.
            return list(zip(s["Word"].values.tolist(), s["tag"].values.tolist()))

        # Select the two needed columns explicitly so the grouping column is
        # never handed to agg_func.
        self.grouped = self.data.groupby("Sent_ID")[["Word", "tag"]].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled ``n_sent`` and advance, or ``None`` if absent.

        The label is tried first as a string (the original lookup form) and then
        as an integer. Integer ``Sent_ID`` values previously never matched the
        string key, so every call returned ``None``.
        """
        for key in ("{}".format(self.n_sent), self.n_sent):
            try:
                s = self.grouped.loc[key]
            except (KeyError, TypeError):
                # This spelling of the label is not in the index; try the next one.
                continue
            self.n_sent += 1
            return s
        return None

In [None]:
# Sentence groupers for the training and validation frames.
getter, v_getter = GetSentence(train_df), GetSentence(val_df)

In [None]:
# Keep only the word of every (word, tag) pair, sentence by sentence.
sentences = [[w for w, _tag in pairs] for pairs in getter.sentences]
v_sentences = [[w for w, _tag in pairs] for pairs in v_getter.sentences]

In [None]:
# Keep only the tag of every (word, tag) pair, sentence by sentence.
labels = [[t for _word, t in pairs] for pairs in getter.sentences]
v_labels = [[t for _word, t in pairs] for pairs in v_getter.sentences]

# Peek at the tags of the first validation sentence.
v_labels[0]

In [None]:
# Tag vocabulary -> integer class ids.
# * sorted(): iterating a set of strings gives a different order in every kernel
#   session, which would silently change the class ids between runs and make a
#   saved model impossible to decode. Sorting pins the mapping.
# * built from `data` (train + validation): a tag that occurs only in validation
#   would otherwise be missing and `tag2idx.get(...)` would return None for it.
# key=str keeps the sort well-defined even if a non-string value slipped in.
tag_values = sorted(set(data["tag"].values), key=str)
tag_values.append("PAD")  # extra class used for padded positions
tag2idx = {t: i for i, t in enumerate(tag_values)}


In [None]:
# Validation-side tag list, with the extra "PAD" class appended.
v_tag_values = list(set(val_df["tag"].values)) + ["PAD"]
# NOTE(review): the mapping below enumerates `tag_values`, not `v_tag_values`,
# so it holds the same pairs as the training mapping. Sharing one id space
# between train and validation looks intentional — confirm.
v_tag2idx = dict(zip(tag_values, range(len(tag_values))))

In [None]:
# Show the tag list; a tag's position in it is its class id.
tag_values

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig, AlbertTokenizer, AlbertConfig, RobertaConfig, RobertaTokenizer, AutoTokenizer

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Record the installed PyTorch version in the notebook output.
torch.__version__

In [None]:
# Length every token sequence is padded/truncated to.
MAX_LEN = 256 # per the "sikayet var" data (translated from the original note)
# Batch size used by the training and validation DataLoaders.
bs = 32


In [None]:
# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")

# Number of CUDA devices visible to this process.
n_gpu = torch.cuda.device_count()
n_gpu

In [None]:
# Name of the first CUDA device. Guarded so the cell still runs on a CPU-only host,
# where querying device 0 raises.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"

In [None]:
# Checkpoint identifier; the tokenizer, config and model are all loaded from it.
model_name = "savasy/bert-base-turkish-ner-cased"

In [None]:
# Tokenizer matching the checkpoint; lower-casing is disabled (the checkpoint name says "cased").
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split each word into sub-word tokens and repeat its label for every piece.

    Uses the module-level ``tokenizer``.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, aligned with ``sentence``.

    Returns:
        ``(tokens, labels)``: two lists of equal length, one entry per
        sub-word token.
    """
    pieces, piece_labels = [], []
    for word, label in zip(sentence, text_labels):
        sub_tokens = tokenizer.tokenize(word)
        pieces += sub_tokens
        # Every sub-word piece inherits the label of the word it came from.
        piece_labels += [label] * len(sub_tokens)
    return pieces, piece_labels

In [None]:
# Sub-word tokenise every sentence together with its labels; each result is a
# (tokens, labels) pair.
tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, sentences, labels)
)
v_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, v_sentences, v_labels)
)

In [None]:
# Separate the (tokens, labels) pairs into parallel lists.
# Note: `labels` / `v_labels` now hold sub-word-level labels and replace the
# word-level lists of the same name.
tokenized_texts = [toks for toks, _labs in tokenized_texts_and_labels]
labels = [labs for _toks, labs in tokenized_texts_and_labels]

v_tokenized_texts = [toks for toks, _labs in v_tokenized_texts_and_labels]
v_labels = [labs for _toks, labs in v_tokenized_texts_and_labels]


In [None]:
# Convert tokens to vocabulary ids, then pad with 0 / cut at the end so every
# sequence has exactly MAX_LEN entries.
id_pad_options = dict(maxlen=MAX_LEN, dtype="long", value=0.0,
                      truncating="post", padding="post")

input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    **id_pad_options
)

v_input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in v_tokenized_texts],
    **id_pad_options
)

In [None]:
# Number of training sentences (one label list per sentence).
len(labels)

In [None]:
# Map labels to class ids and pad/cut at the end to MAX_LEN, filling with the
# "PAD" class id. Both splits use the same `tag2idx` mapping.
tag_pad_options = dict(maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                       dtype="long", truncating="post")

tags = pad_sequences(
    [[tag2idx.get(l) for l in lab] for lab in labels],
    **tag_pad_options
)

v_tags = pad_sequences(
    [[tag2idx.get(l) for l in lab] for lab in v_labels],
    **tag_pad_options
)

In [None]:
# Build the attention masks: 1.0 where the token id is non-zero, 0.0 on padding.
attention_masks = (np.asarray(input_ids) != 0.0).astype(float).tolist()

v_attention_masks = (np.asarray(v_input_ids) != 0.0).astype(float).tolist()

In [None]:
# 90/10 split of the padded training sequences, their tags and their masks.
# Passing the three collections in one call applies the same seeded permutation
# to each of them.
# NOTE(review): all six names are reassigned in the next cell from the
# train/validation frames, so this split is not what training uses.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

In [None]:
# Use the frame-level split: arrays from train_df for training, arrays from
# val_df for validation.
tr_inputs, val_inputs = input_ids, v_input_ids
tr_tags, val_tags = tags, v_tags
tr_masks, val_masks = attention_masks, v_attention_masks

In [None]:
# Convert every array/list to a torch tensor, keeping the original names.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = map(
    torch.tensor,
    (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks),
)

In [None]:
def _build_loader(inputs, masks, tag_ids, sampler_cls):
    """Bundle the three tensors into a dataset and wrap it in a batched loader."""
    dataset = TensorDataset(inputs, masks, tag_ids)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=bs)
    return dataset, sampler, loader


# Training batches are drawn in random order (the data is shuffled) ...
train_data, train_sampler, train_dataloader = _build_loader(
    tr_inputs, tr_masks, tr_tags, RandomSampler)

# ... validation batches are read in their stored order.
valid_data, valid_sampler, valid_dataloader = _build_loader(
    val_inputs, val_masks, val_tags, SequentialSampler)


# Fine-tuning starts below.

In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW, AlbertForTokenClassification, RobertaForTokenClassification, AutoModelForTokenClassification, AutoConfig

# Record the installed transformers version in the notebook output.
transformers.__version__

In [None]:
# Start from the checkpoint's configuration and resize the classification head
# to this notebook's tag set (including the "PAD" class).
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(tag2idx)
# Store the tag names in the config so that the checkpoint written by
# save_pretrained() can be decoded without this notebook's `tag_values` list.
# Without this the config keeps either generic LABEL_i names or the names of the
# original checkpoint, neither of which match the classes trained here.
config.id2label = {idx: str(tag) for tag, idx in tag2idx.items()}
config.label2id = {str(tag): idx for tag, idx in tag2idx.items()}
config.output_attentions = False
config.output_hidden_states = False
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config,
    # The pretrained head may have a different number of classes; let it be
    # re-initialised instead of raising on the shape mismatch.
    ignore_mismatched_sizes=True
)

In [None]:
# Move the model to the device selected earlier (GPU if available, else CPU).
# Using `device` instead of a hard-coded .cuda() keeps the notebook runnable on
# CPU-only hosts and matches the `t.to(device)` calls in the training loop.
model.to(device);

In [None]:

# True: update every weight of the network. False: train only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters excluded from weight decay: biases and layer-norm weights.
    # 'gamma'/'beta' are kept for checkpoints using the old naming; current
    # Hugging Face BERT models name them 'LayerNorm.weight' / 'LayerNorm.bias'.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group option AdamW reads is 'weight_decay'. The previous key
    # 'weight_decay_rate' is not an option the optimizer knows, so the intended
    # 0.01 / 0.0 settings were silently ignored.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
# Training hyper-parameters (number of epochs, gradient-clipping threshold) and
# the learning-rate schedule.
from transformers import get_linear_schedule_with_warmup

epochs = 6
max_grad_norm = 1.0

# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels, ignore_index=None):
    """Token-level accuracy over all positions of a batch.

    Args:
        preds: array of scores; the predicted class is the argmax over axis 2.
        labels: array of integer class ids with the same leading shape as
            ``preds`` without its last axis.
        ignore_index: optional class id. Positions whose *label* equals it are
            left out of the score (e.g. the padding class). ``None`` (default)
            scores every position, which is the original behaviour.

    Returns:
        Fraction of scored positions where prediction and label agree;
        ``0.0`` when there is no position to score.
    """
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = np.asarray(labels).flatten()
    if ignore_index is not None:
        keep = labels_flat != ignore_index
        pred_flat, labels_flat = pred_flat[keep], labels_flat[keep]
    if labels_flat.size == 0:
        # Avoid a 0/0 division (NaN plus a runtime warning) on empty input.
        return 0.0
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:

# Per-epoch average losses, collected for the learning-curve plot.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):

    # ---- Training pass over all training batches ----
    model.train()

    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch is (input ids, attention mask, tag ids); move all to `device`.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Labels are passed in, so the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()


    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))


    loss_values.append(avg_train_loss) # keep the loss value for the plot


    model.eval() # evaluate on the validation set after each training epoch
    eval_loss, eval_accuracy = 0, 0
    # NOTE(review): nb_eval_examples is accumulated but never read in this cell.
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

        # outputs[0] is the loss, outputs[1] the per-token class scores.
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()


        eval_loss += outputs[0].mean().item()
        # NOTE(review): flat_accuracy scores every position, padded ones included,
        # so this figure is not comparable with the PAD-filtered F1 below.
        eval_accuracy += flat_accuracy(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # Turn class ids back into tag names, dropping every position whose true
    # label is "PAD"; both lists are flattened over the whole validation set.
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l
                                    if tag_values[l_i] != "PAD"]
    # The flattened lists are passed to seqeval as one single sequence.
    print("Validation F1-Score: {}".format(f1_score([pred_tags], [valid_tags])))
    print()

In [None]:
# Write the fine-tuned model and its tokenizer into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("ner_model")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Plot theme and default figure size.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Learning curve: average training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(loss_values, 'b-o', label="training loss")
ax.plot(validation_loss_values, 'r-o', label="validation loss")

ax.set_title("Learning curve")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()

plt.show()

In [None]:
# Real-world test inputs.
# Each assignment overwrites the previous one, so only the LAST sentence is used
# by the inference cells; move a line to the bottom to try a different input.
sentence = "Avea neden hep Turkcell in reklamlarına diss atıyo"
sentence = "Bir @YouTube oynatma listesine video ekledim: http://youtu.be/jJ1u4CDu3LU?a  Abazan Videolar #2 - Turkcell BiP Ön Bakış"
sentence = "Turkcell Paycell faturama 6 aydır sürekli Paycell ile ilgili ücret yansıtıyor. Paycell iptal ediyorum bir ay sonra bakıyorum yine ücret yansıtılmış. Bu zamana kadarki zararım 1 fatura kadar ücret oldu. Yemin ediyorum bizleri hiç takmıyorlar. Para için yapıyorlar. 10 TL Google Play harcaması için 30 TL hizmet bedeli alıyorlar. Turkcell fatura dönemim bitsin yemin ediyorum başka operatöre geçeceğim. Asla bizi dinlemiyorlar. Paycell iptal edilmiyor. Her ay kesilen paraları da iade etmiyorlar. Paycell iptal edilmiyor. Bu yüzden ve kesinlikle iptal edeceğim Turkcell"
sentence = "Turkcell internetimin hızını düşürünce hayat çok daha kötü bir hal alıyor bunu anlamış bulunuyorum."
sentence = "gnçtrkcll üyeleri için kampanya sürüyor. üyeyseniz bu fırsatı kaçırmayın.#sanakapakolsun gnctrkcll"
sentence = "Turkcell Paycell uygulamasına bugün üye oldum. Nakit avans kısmında 1000 lira banka hesabına aktarmak istedim ve böyle bir işlem artık yapılmıyormuş. Yine de benden 85 lira işlem ücreti kesildi. Müşteri hizmetlerini aradım ve işlemleri iptal ettirip bu uygulamadan sildirdim kendimi. Ama yine de 85 lira faturama yansıyacakmış. Müşteri temsilcisi böyle dedi."
sentence = "Türk Telekom tanıdığım en iyi operatörlerden bir tanesidir. Türkcell ise yeterli seviyede değil."
sentence = "daha yeni kontör yükledim bu turkcell vakumluyor mu napıyor kontörleri anlamadım"

In [None]:
# Encode the test sentence to token ids (the tokenizer adds the special tokens)
# and wrap it as a batch of one.
tokenized_sentence = tokenizer.encode(sentence)
# Place the batch on the device selected earlier rather than a hard-coded
# .cuda(), so inference also works on a CPU-only host.
# Note: this rebinds `input_ids`, which earlier held the padded training array.
input_ids = torch.tensor([tokenized_sentence]).to(device)

In [None]:
# Forward pass without gradient tracking; no labels are passed, so the first
# output holds the per-token class scores.
with torch.no_grad():
    output = model(input_ids)

# Highest-scoring class id for every token position.
sentence_scores = output[0].to('cpu').numpy()
label_indices = sentence_scores.argmax(axis=2)

In [None]:
# Glue "##" continuation pieces back onto the word they belong to. A rebuilt
# word keeps the label predicted for its first piece.
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for piece, piece_label in zip(tokens, label_indices[0]):
    if not piece.startswith("##"):
        # Start of a new word: record it together with its predicted tag.
        new_tokens.append(piece)
        new_labels.append(tag_values[piece_label])
        continue
    # Continuation piece: append its text (minus the "##" marker) to the last word.
    new_tokens[-1] += piece[2:]

In [None]:
# Arrange the predictions in the format defined by the main application.

import json

process_tag = ['OPERATOR', 'PRODUCT', 'HIZMET', 'APP', 'PACKAGE']
custom_tags = ["[CLS]", "[SEP]"]
outputs = []
entity_index = 0
for indx, (token, label) in enumerate(zip(new_tokens, new_labels)):
    # The special tokens are not part of the sentence.
    if token in custom_tags:
        continue

    if label in process_tag:
        # A new entity starts unless the preceding word also carried one of the
        # processed tags; in that case the running index is shared.
        if new_labels[indx - 1] not in process_tag:
            entity_index += 1
        word_entity_index = entity_index
    else:
        # Words outside the processed tags get no entity number.
        word_entity_index = -1

    outputs.append({
        "entitiy": "OTHER" if label in ["PAD", "O"] else label,
        "word": token,
        "entityindex": word_entity_index,
        # indx - 1: position with the leading special token not counted.
        "wordindex": indx - 1,
    })

print(f"output:\n{outputs}")
print(f"formatlı:\n{json.dumps(outputs, ensure_ascii= False)}")
