In [1]:
import pandas as pd
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import os
import torch

In [2]:
# Number of cross-validation folds used throughout the notebook.
kfold = 10

# The input file is pipe-delimited, UTF-8 encoded text.
train_folds_df = pd.read_csv(
    "reason_filtered_file.csv",
    encoding="utf-8",
    sep="|",
)

# Preview the first rows as the cell output.
train_folds_df.head()

Unnamed: 0,tweet,reason
0,ulan wifiye baglıyım ben baglıyken turkcell in...,fatura
1,dk gb internet mb kasar turkcell duser oc cocu...,network
2,turkcell superonline reklamı kötü bir reklam g...,reklam
3,turkcell pahalı,fatura
4,turkcell kasta internetin cekmiyor,network


In [3]:
# Assign every row to exactly one validation fold, stored in a 'kfold' column.
# Skipped entirely when the loaded file already carries a 'kfold' column.
if 'kfold' not in train_folds_df.columns:
    # Fixed random_state keeps the shuffled fold assignment repeatable.
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    train_folds_df['kfold'] = -1  # placeholder, overwritten for every row below
    kfold_col = train_folds_df.columns.get_loc('kfold')
    for fold, (train_index, val_index) in enumerate(kf.split(train_folds_df)):
        # KFold.split yields row *positions*, not index labels. Writing through
        # .iloc keeps the assignment correct even if the frame's index is not
        # the default 0..n-1 range (e.g. after filtering rows); label-based
        # .loc would only be right by coincidence in that case.
        train_folds_df.iloc[val_index, kfold_col] = fold

In [4]:
# Preview the frame again to check the fold assignment: a 'kfold' column should now be present.
train_folds_df.head()

Unnamed: 0,tweet,reason,kfold
0,ulan wifiye baglıyım ben baglıyken turkcell in...,fatura,1
1,dk gb internet mb kasar turkcell duser oc cocu...,network,8
2,turkcell superonline reklamı kötü bir reklam g...,reklam,5
3,turkcell pahalı,fatura,0
4,turkcell kasta internetin cekmiyor,network,9


In [5]:
# Print column dtypes and non-null counts as a quick check for missing values.
train_folds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10965 entries, 0 to 10964
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   10965 non-null  object
 1   reason  10965 non-null  object
 2   kfold   10965 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 257.1+ KB


In [6]:
# Fit the encoder on the 'reason' column to obtain one integer id per class.
le = LabelEncoder().fit(train_folds_df['reason'])

# Class name -> integer id, with ids converted to plain Python ints.
le_name_mapping = {
    class_name: int(class_id)
    for class_name, class_id in zip(le.classes_, le.transform(le.classes_))
}
print(le_name_mapping)

# Numeric target column derived from 'reason' via the mapping above.
train_folds_df['labels'] = train_folds_df['reason'].map(le_name_mapping)

{'bayi': 0, 'diğer': 1, 'fatura': 2, 'kampanya': 3, 'kurumsal': 4, 'kvkk': 5, 'mnp': 6, 'network': 7, 'reklam': 8, 'uygulama': 9, 'çağrı merkezi yetkinlik': 10, 'ürün': 11}


In [7]:
# Show the first ten rows to compare the new 'labels' column against 'reason'.
train_folds_df.head(10)

Unnamed: 0,tweet,reason,kfold,labels
0,ulan wifiye baglıyım ben baglıyken turkcell in...,fatura,1,2
1,dk gb internet mb kasar turkcell duser oc cocu...,network,8,7
2,turkcell superonline reklamı kötü bir reklam g...,reklam,5,8
3,turkcell pahalı,fatura,0,2
4,turkcell kasta internetin cekmiyor,network,9,7
5,turkcellin allah belası versin demek isterdim ...,network,9,7
6,turkcell yonetimini eline geciren akp hukumeti...,diğer,7,1
7,bayramdan vodafone geciyorum turkcell kazıkcı ...,mnp,7,6
8,of evin kosesinden turkcell geciyor kazıklamal...,network,1,7
9,hay turkcell cekim gucunu nalet olsun,network,9,7


In [8]:
# Build the train/validation split and a fresh classifier for ONE fold.
#
# Training and evaluation happen in the cells that follow, outside of any loop,
# so iterating over every fold here would only construct a model per fold and
# immediately discard it (loading the BERT checkpoint each time). Only the
# objects of the final fold would survive such a loop, so that fold is built
# directly: `fold`, `train_df`, `validation_df` and `model` end up with the
# same meaning as before, without the wasted model loads.
#
# NOTE(review): this is a single hold-out fold, not a full k-fold
# cross-validation. To cross-validate, model construction, training and
# evaluation all have to run once per fold inside a loop over `range(kfold)`.
fold = kfold - 1

# Rows assigned to `fold` are held out; every other row is used for training.
is_validation = train_folds_df['kfold'] == fold

# Prepare the data for simpletransformers: tweet text plus the integer label.
train_df = pd.DataFrame({
    'text': train_folds_df.loc[~is_validation, 'tweet'],
    'labels': train_folds_df.loc[~is_validation, 'labels'],
})
validation_df = pd.DataFrame({
    'text': train_folds_df.loc[is_validation, 'tweet'],
    'labels': train_folds_df.loc[is_validation, 'labels'],
})

# Initialize the ClassificationModel with one output per known class.
model = ClassificationModel(
    'bert',
    'dbmdz/bert-base-turkish-uncased',
    num_labels=len(le_name_mapping),
    # Use the GPU when one is available; otherwise run on CPU rather than
    # requesting CUDA on a machine that does not have it.
    use_cuda=torch.cuda.is_available(),
    args={
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 20,
        'train_batch_size': 64,
        'fp16': False,
        'save_model_every_epoch': True,
        'save_eval_checkpoints': False,
        # One output directory per fold so that runs for different folds do
        # not overwrite each other.
        'output_dir': f'bert_model_fold_merge_{fold}',
        'save_steps': 0,
    },
)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTrai

In [9]:
# Fine-tune `model` on the training split prepared in the previous cell.
# The call is the cell's last expression, so its return value is displayed.
# NOTE(review): the displayed tuple looks like (total optimisation steps,
# average training loss) — confirm against the simpletransformers docs.
model.train_model(train_df)

  0%|          | 0/9869 [00:00<?, ?it/s]

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 0 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 1 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 2 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 3 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 4 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 5 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 6 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 7 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 8 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 9 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 10 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 11 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 12 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 13 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 14 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 15 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 16 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 17 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 18 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

Running Epoch 19 of 20:   0%|          | 0/155 [00:00<?, ?it/s]

(3100, 0.19960110499285116)

In [10]:
# Evaluate the fine-tuned model on the held-out validation split.
# `model_outputs` is reduced with argmax(axis=1) in the next cell to obtain
# predicted label ids.
# NOTE(review): `result` is presumably a dict of metrics and `wrong_predictions`
# the misclassified examples — confirm against the simpletransformers docs.
result, model_outputs, wrong_predictions = model.eval_model(validation_df)

  0%|          | 0/1096 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/137 [00:00<?, ?it/s]

In [11]:
# Ground-truth label ids for the validation rows.
actuals = validation_df['labels'].to_numpy()

# Predicted label id per row = column index of the largest model output.
predictions = model_outputs.argmax(axis=1)

# Per-class precision / recall / F1, printed with three decimals.
report_text = metrics.classification_report(actuals, predictions, digits=3)
print(report_text)

              precision    recall  f1-score   support

           0      0.714     0.714     0.714         7
           1      0.920     0.903     0.911       165
           2      0.899     0.902     0.901       246
           3      0.414     0.545     0.471        22
           4      1.000     0.500     0.667         2
           6      0.625     0.429     0.508        35
           7      0.842     0.867     0.854       196
           8      0.823     0.879     0.850        58
           9      0.854     0.843     0.849       153
          10      0.791     0.791     0.791       129
          11      0.976     0.964     0.970        83

    accuracy                          0.854      1096
   macro avg      0.805     0.758     0.771      1096
weighted avg      0.855     0.854     0.854      1096



In [12]:
# Display the class-name -> integer-id mapping, needed to decode the numeric
# class rows of the classification report printed above.
le_name_mapping

{'bayi': 0,
 'diğer': 1,
 'fatura': 2,
 'kampanya': 3,
 'kurumsal': 4,
 'kvkk': 5,
 'mnp': 6,
 'network': 7,
 'reklam': 8,
 'uygulama': 9,
 'çağrı merkezi yetkinlik': 10,
 'ürün': 11}