Creating API key

In [1]:
!pip3 install -q kaggle

In [None]:
!pip3 install torch_geometric

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, TransformerConv, GATv2Conv, ChebConv, ResGatedGraphConv
import torch_geometric.nn as pyg_nn
from torch_geometric.loader import DataLoader

In [4]:
from google.colab import files
files.upload() #just load your kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"aaanikit","key":"20cd3080d6b8628fef8a299dd1101ec7"}'}

In [5]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Download data

In [None]:
!kaggle datasets download -d origamik/united-airlines-call-center-sentiment-dataset
!kaggle datasets download -d takuok/glove840b300dtxt
!unzip "united-airlines-call-center-sentiment-dataset.zip"
!unzip "glove840b300dtxt.zip"

Data processing

In [14]:
import pandas as pd

calls_df = pd.read_csv('calls.csv')
sentiment_df = pd.read_csv('sentiment_statistics.csv')

# union of 2 dataframes by call_id
merged_df = pd.merge(calls_df, sentiment_df, on="call_id", how="inner")

# drop useless columns
columns_to_drop = [
    "call_id",
    "customer_id",
    "agent_tone",
    "average_sentiment",
    "silence_percent_average",
    "agent_id_x",
    "call_start_datetime",
    "agent_assigned_datetime",
    "call_end_datetime",
    "agent_id_y"
]

final_df = merged_df.drop(columns=columns_to_drop, errors='ignore')
final_df.to_csv("merged_output.csv", index=False)

In [15]:
final_df.head()

Unnamed: 0,call_transcript,customer_tone
0,\n\nAgent: Thank you for calling United Airlin...,angry
1,\n\nAgent: Thank you for calling United Airlin...,neutral
2,\n\nAgent: Thank you for calling United Airlin...,polite
3,\n\nAgent: Thank you for calling United Airlin...,frustrated
4,\n\nAgent: Thank you for calling United Airlin...,polite


делаем энкодер лейблов

In [16]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(final_df.customer_tone)
final_df['customer_tone'] = le.transform(final_df.customer_tone)

обрезаем датасет, т.к. слишком большие размеры

In [26]:
final_df = final_df[:4000]

Определяем токенайзер

- Инициализация: Класс загружает эмбеддинги из файла GloVe, добавляет специальные токены (unk, pad), если они указаны.
- Токенизация и преобразование: При помощи метода doc_to_sequence можно преобразовать строку текста в числовые индексы, которые затем можно использовать.

In [18]:
import spacy
from spacy.tokens import Doc
from spacy import displacy
from scipy.sparse import coo_matrix


class Tokenizer(object):
    def __init__(self, word2idx=None, nlp_model="en_core_web_sm"):
        # https://spacy.io/docs/usage/processing-text
        self.nlp = spacy.load(nlp_model)
        if word2idx is None:
            self.word2idx = {}
            self.idx2word = {}
            self.idx = 0
            self.word2idx['<pad>'] = self.idx  # '<pad>': 0
            self.idx2word[self.idx] = '<pad>'
            self.idx += 1
            self.word2idx['<unk>'] = self.idx  # '<unk>': 1
            self.idx2word[self.idx] = '<unk>'
            self.idx += 1
        else:
            self.word2idx = word2idx
            self.idx2word = {v:k for k,v in word2idx.items()}

    def fit_on_doc(self, doc:spacy.tokens.doc.Doc):
        for word in doc:
            word = str(word).lower()
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def text_to_doc(self, text):
        return self.nlp(text)

    def doc_to_sequence(self, doc:spacy.tokens.doc.Doc):
        sequence = []
        for w in doc:
            w = str(w).lower()
            word_id = self.word2idx.get(w, -1)
            if word_id == -1:
                word_id = self.word2idx['<unk>']
            sequence.append(word_id)
        if len(sequence) == 0:
            sequence = [0]
        return np.array(sequence, dtype=np.int32)

    def doc_to_adj(self, doc:spacy.tokens.doc.Doc):
        matrix = np.zeros((len(doc), len(doc))).astype('int32')
        for token in doc:
            for child in token.children:
                matrix[token.i][child.i] = 1
                matrix[child.i][token.i] = 1
        return matrix

In [27]:
from tqdm import tqdm
import numpy as np

tokenizer = Tokenizer()

idx2graph = {}
for i in tqdm(range(final_df.shape[0])):
    text = final_df.call_transcript[i].lower().replace("\n", "").strip()
    doc = tokenizer.text_to_doc(text)
    tokenizer.fit_on_doc(doc)
    adj_matrix = tokenizer.doc_to_adj(doc)
    coo = coo_matrix(adj_matrix)
    idx2graph[i] = np.array([coo.row, coo.col], dtype=np.int32)



100%|██████████| 4000/4000 [10:34<00:00,  6.30it/s]


Example of tokenizer

In [28]:
final_df['call_transcript'][50]

"\n\nAgent: Thank you for calling United Airlines customer service, my name is Sarah how can I help you?\n\nCustomer: Hi, I'm calling to complain about my missing luggage from flight 123 yesterday. I landed and my bag was nowhere to be found. It's been over 24 hours now and I still don't have it!\n\nAgent: I'm sorry to hear that. Can you confirm your name and flight details for me please so I can take a look into this? \n\nCustomer: It's John Smith, flight number 123 from New York to San Francisco, I arrived yesterday at 5pm. \n\nAgent: Thanks, just let me pull up your record... *typing noises* okay I see the flight information here. It looks like your bag was delayed in transferring from the inbound flight. These things unfortunately happen sometimes with connecting flights. Let me check the status... it looks like your bag is still at JFK airport waiting to come out on the next flight today. I apologize for the inconvenience. \n\nCustomer: Are you kidding me? Waiting until today? I'v

In [None]:
doc = tokenizer.text_to_doc(final_df['call_transcript'][50])
tokenizer.doc_to_sequence(doc)

In [31]:
text = final_df.call_transcript[1].lower().replace("\n", "").strip()
doc = tokenizer.nlp(text)
print ("{:<15} | {:<8} | {:<15} | {:<20}".format('Token','Relation','Head', 'Children'))
print ("-" * 70)
for token in doc:
  print ("{:<15} | {:<8} | {:<15} | {:<20}"
         .format(str(token.text), str(token.dep_), str(token.head.text), str([child for child in token.children])))

Token           | Relation | Head            | Children            
----------------------------------------------------------------------
agent           | npadvmod | thank           | []                  
:               | punct    | thank           | []                  
thank           | ROOT     | thank           | [agent, :, you, for, ,, is]
you             | dobj     | thank           | []                  
for             | prep     | thank           | [calling]           
calling         | pcomp    | for             | [airlines]          
united          | compound | airlines        | []                  
airlines        | dobj     | calling         | [united]            
,               | punct    | thank           | []                  
my              | poss     | name            | []                  
name            | nsubj    | is              | [my]                
is              | conj     | thank           | [name, sam, help]   
sam             | attr     | is       

In [32]:
def load_word_vec(path, word2idx=None, embed_dim=300):
    fin = open(path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    word_vec = {}
    for line in fin:
        tokens = line.rstrip().split()
        word, vec = ' '.join(tokens[:-embed_dim]), tokens[-embed_dim:]
        if word in word2idx.keys():
            word_vec[word] = np.array(vec, dtype=np.float32)
    return word_vec


def build_embedding_matrix(word2idx, embed_dim=300):
    embedding_matrix = np.zeros((len(word2idx), embed_dim))
    embedding_matrix[1, :] = np.random.uniform(-1/np.sqrt(embed_dim), 1/np.sqrt(embed_dim), (1, embed_dim))

    fname = 'glove.840B.300d.txt'
    word_vec = load_word_vec(fname, word2idx=word2idx, embed_dim=embed_dim)

    for word, i in word2idx.items():
        vec = word_vec.get(word)
        if vec is not None:
            embedding_matrix[i] = vec

    return embedding_matrix

embedding_matrix = build_embedding_matrix(tokenizer.word2idx, 300)

In [33]:
embedding_matrix.shape

(12732, 300)

теперь готовы эмбеддинги для каждой записи

In [21]:
#!mkdir raw
!mv customers.csv /raw

mv: cannot stat 'customers.csv': No such file or directory


In [34]:
import os

class MyTrainDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.process()
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self):
        return "customers.csv"

    @property
    def processed_dir(self):
        return os.path.join(self.root, "train_processed")

    @property
    def raw_file_names(self):
        return ['train.csv']

    @property
    def processed_file_names(self):
        return ['train-data.pt']

    def download(self):
        pass

    def process(self):
        print("Вызов process()")
        data_list = self.read_data()


        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]


        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
        print("process() завершён, данные сохранены.")

    def read_data(self):
        print("Чтение данных...")
        df_train = final_df
        all_data = []

        for i in range(df_train.shape[0]):
            text = df_train.call_transcript.iloc[i].lower().replace("\n", "").strip()
            doc = tokenizer.text_to_doc(text)
            input_ids = tokenizer.doc_to_sequence(doc)
            label = df_train.customer_tone.iloc[i]

            x = torch.tensor(input_ids, dtype=torch.int32).unsqueeze(1)
            edge_index = torch.tensor(idx2graph[i], dtype=torch.long)
            y = torch.tensor([label], dtype=torch.float32)


            data = Data(x=x, edge_index=edge_index, y=y)
            all_data.append(data)

            if i < 5:
                print(f"Граф {i}: x={x.size()}, edge_index={edge_index.size()}, y={y}")

        print("Чтение данных завершено.")
        return all_data


In [35]:
dataset = MyTrainDataset(root='/content')


Processing...


Вызов process()
Чтение данных...
Граф 0: x=torch.Size([473, 1]), edge_index=torch.Size([2, 884]), y=tensor([0.])
Граф 1: x=torch.Size([565, 1]), edge_index=torch.Size([2, 1050]), y=tensor([3.])
Граф 2: x=torch.Size([671, 1]), edge_index=torch.Size([2, 1270]), y=tensor([4.])
Граф 3: x=torch.Size([695, 1]), edge_index=torch.Size([2, 1280]), y=tensor([2.])
Граф 4: x=torch.Size([610, 1]), edge_index=torch.Size([2, 1146]), y=tensor([4.])
Чтение данных завершено.
process() завершён, данные сохранены.
Вызов process()
Чтение данных...
Граф 0: x=torch.Size([473, 1]), edge_index=torch.Size([2, 884]), y=tensor([0.])


Done!


Граф 1: x=torch.Size([565, 1]), edge_index=torch.Size([2, 1050]), y=tensor([3.])
Граф 2: x=torch.Size([671, 1]), edge_index=torch.Size([2, 1270]), y=tensor([4.])
Граф 3: x=torch.Size([695, 1]), edge_index=torch.Size([2, 1280]), y=tensor([2.])
Граф 4: x=torch.Size([610, 1]), edge_index=torch.Size([2, 1146]), y=tensor([4.])
Чтение данных завершено.
process() завершён, данные сохранены.


  self.data, self.slices = torch.load(self.processed_paths[0])


In [36]:
dataset.data



Data(x=[2333235, 1], edge_index=[2, 4353420], y=[4000])

In [46]:
dataset.data.y[:100]



tensor([0., 3., 4., 2., 4., 1., 2., 0., 3., 0., 2., 3., 4., 0., 2., 0., 0., 1.,
        1., 3., 2., 2., 0., 4., 3., 4., 4., 3., 4., 3., 1., 0., 0., 0., 1., 0.,
        4., 2., 2., 1., 0., 3., 3., 1., 1., 1., 4., 0., 1., 1., 0., 0., 2., 4.,
        4., 4., 0., 2., 2., 2., 0., 3., 0., 2., 0., 0., 2., 1., 1., 2., 4., 1.,
        1., 2., 1., 0., 1., 0., 4., 2., 4., 1., 3., 3., 4., 2., 3., 2., 1., 2.,
        0., 4., 1., 0., 3., 4., 0., 3., 4., 2.])

In [37]:
len(dataset)

4000

In [38]:
dataset = dataset.shuffle()
train_len = int(0.8*len(dataset))
train_dataset = dataset[:train_len]
val_dataset = dataset[train_len:]
print(len(train_dataset), len(val_dataset))

3200 800


In [39]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

In [47]:
class FeedbackModel(nn.Module):
    def __init__(self, embedding_matrix):

        super(FeedbackModel, self).__init__()

        self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), freeze=False)
        # GCNConv SAGEConv ResGatedGraphConv GraphConv(300, 128)
        # TransformerConv GATv2Conv GATConv(300, 128, heads=4) ChebConv(300, 128, K=2)
        # GCNConv SAGEConv ResGatedGraphConv GraphConv(128, 64)
        # TransformerConv  GATv2Conv GATConv(4*128, 64) ChebConv(128, 64, K=2)
#         self.gru = nn.GRU(256, 256, num_layers=1,
#                           dropout=0, batch_first=True,
#                           bidirectional=False)          # RNN, GRU
        # output: (N, L, D∗Hout), D = 2 if bidirectional=True otherwise 1
        # h_n: (D∗num_layers, N, Hout)
        self.gc1   = GATv2Conv(300, 128)
        self.pool1 = pyg_nn.TopKPooling(128, ratio=0.8)
        self.gc2   = GCNConv(128, 128)
        self.pool2 = pyg_nn.TopKPooling(128, ratio=0.8)
        self.lin1  = nn.Linear(256, 64)
        self.lin2  = nn.Linear(64, 5)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = x.squeeze(1)
        x = self.embed(x)

        x = F.relu(self.gc1(x, edge_index))
        x, edge_index, edge_attr, batch, perm, score = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([pyg_nn.global_max_pool(x, batch), pyg_nn.global_mean_pool(x, batch)], dim=1)

        x = F.relu(self.gc2(x, edge_index))
        x, edge_index, edge_attr, batch, perm, score = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([pyg_nn.global_max_pool(x, batch), pyg_nn.global_mean_pool(x, batch)], dim=1)

        x = x1 + x2
        # x, hn = self.gru(x, None)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        output = F.relu(self.lin2(x))

        return output

In [48]:
model = FeedbackModel(embedding_matrix)
model

FeedbackModel(
  (embed): Embedding(12732, 300)
  (gc1): GATv2Conv(300, 128, heads=1)
  (pool1): TopKPooling(128, ratio=0.8, multiplier=1.0)
  (gc2): GCNConv(128, 128)
  (pool2): TopKPooling(128, ratio=0.8, multiplier=1.0)
  (lin1): Linear(in_features=256, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=5, bias=True)
)

In [54]:
epochs = 60

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=3e-3)
#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=6)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

CrossEntropyLoss()

In [55]:
total_loss = []
for epoch_num in range(epochs):
    model.train()
    total_loss_train = 0
    for i, sample_batched in enumerate(train_loader):
        sample_batched = sample_batched.to(device)
        optimizer.zero_grad()
        outputs = model(sample_batched)
        label = sample_batched.y.to(device)
        loss = criterion(outputs, label.to(torch.long))
        if i % 10 == 0:
            print(f"epoch: {epoch_num} it: {i} loss: {loss.item()}")
        loss.backward()
        total_loss_train += loss.item()
        optimizer.step()

    model.eval()
    total_loss_val = 0
    with torch.no_grad():
        for sample_batched in val_loader:
            sample_batched = sample_batched.to(device)
            outputs = model(sample_batched)
            label = sample_batched.y.to(device)
            loss = criterion(outputs, label.to(torch.long))
            total_loss_val += loss.item()

    #scheduler.step(total_loss_val / len(val_dataset))

    print(f'Epoch: %02.0f | Train Loss: {total_loss_train / len(train_dataset): .3f} | Val Loss: {total_loss_val / len(val_dataset): .3f}' % (epoch_num + 1))
    total_loss.append([total_loss_train / len(train_dataset), total_loss_val / len(val_dataset)])

epoch: 0 it: 0 loss: 1.6063629388809204
epoch: 0 it: 10 loss: 1.5397961139678955
epoch: 0 it: 20 loss: 1.321150779724121
Epoch: 01 | Train Loss:  0.011 | Val Loss:  0.011
epoch: 1 it: 0 loss: 1.4191863536834717
epoch: 1 it: 10 loss: 1.2867873907089233
epoch: 1 it: 20 loss: 1.1841208934783936
Epoch: 02 | Train Loss:  0.010 | Val Loss:  0.009
epoch: 2 it: 0 loss: 1.0218862295150757
epoch: 2 it: 10 loss: 0.9532959461212158
epoch: 2 it: 20 loss: 1.01244056224823
Epoch: 03 | Train Loss:  0.008 | Val Loss:  0.009
epoch: 3 it: 0 loss: 0.9183574318885803
epoch: 3 it: 10 loss: 0.7967538833618164
epoch: 3 it: 20 loss: 0.7647347450256348


KeyboardInterrupt: 