In [45]:
import pandas as pd
import nltk
import re
import unidecode
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [70]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [46]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
columns = ["sentiments","content"]
data = pd.read_csv("all-data.csv",names=columns, encoding='ISO-8859-1')
data = data.head(200)
data

Unnamed: 0,sentiments,content
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
195,positive,Finnish messaging solutions developer Tecnomen...
196,positive,Finnish metal industry solutions supplier Outo...
197,positive,Finnish metal products company Componenta Oyj ...
198,positive,Finnish office supplies and computer accessori...


In [48]:
classes = {
  class_name: idx for idx, class_name in enumerate(data['sentiments'].unique().tolist())
    }
data['sentiments'] = data['sentiments'].apply(lambda x: classes[x])

In [49]:
data.head()

Unnamed: 0,sentiments,content
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [50]:
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()
def text_normalize(text):
  text = text.lower()
  text = unidecode.unidecode(text)
  text = text.strip()
  text = re.sub(r'[^\w\s]', '', text)
  text = ' '.join([word for word in text.split(' ') if word not in
  english_stop_words])
  text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
  return text

In [51]:
text_normalize("I like running")

'like run'

In [52]:
data['content'] = data['content'].apply(lambda x: text_normalize(x))
data.head(30)

Unnamed: 0,sentiments,content
0,0,accord gran compani plan move product russia ...
1,0,technopoli plan develop stage area less 100000...
2,1,intern electron industri compani elcoteq laid ...
3,2,new product plant compani would increas capac ...
4,2,accord compani updat strategi year 20092012 b...
5,2,financ aspocomp growth aspocomp aggress pursu ...
6,2,last quarter 2010 componenta net sale doubl e...
7,2,third quarter 2010 net sale increas 52 eur 2...
8,2,oper profit rose eur 131 mn eur 87 mn correspo...
9,2,oper profit total eur 211 mn eur 186 mn 2007 ...


In [53]:
vocab = []
for sentence in data['content'].tolist():
  tokens = sentence.split()
  for token in tokens:
    if token not in vocab:
        vocab.append(token)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [54]:
vocab

['accord',
 'gran',
 'compani',
 'plan',
 'move',
 'product',
 'russia',
 'although',
 'grow',
 'technopoli',
 'develop',
 'stage',
 'area',
 'less',
 '100000',
 'squar',
 'meter',
 'order',
 'host',
 'work',
 'comput',
 'technolog',
 'telecommun',
 'statement',
 'said',
 'intern',
 'electron',
 'industri',
 'elcoteq',
 'laid',
 'ten',
 'employe',
 'tallinn',
 'facil',
 'contrari',
 'earlier',
 'layoff',
 'contract',
 'rank',
 'offic',
 'worker',
 'daili',
 'postime',
 'report',
 'new',
 'plant',
 'would',
 'increas',
 'capac',
 'meet',
 'expect',
 'demand',
 'improv',
 'use',
 'raw',
 'materi',
 'therefor',
 'profit',
 'updat',
 'strategi',
 'year',
 '20092012',
 'baswar',
 'target',
 'longterm',
 'net',
 'sale',
 'growth',
 'rang',
 '20',
 '40',
 'oper',
 'margin',
 '10',
 'financ',
 'aspocomp',
 'aggress',
 'pursu',
 'increasingli',
 'focus',
 'hdi',
 'print',
 'circuit',
 'board',
 'pcb',
 'last',
 'quarter',
 '2010',
 'componenta',
 'doubl',
 'eur131m',
 'eur76m',
 'period',
 'zer

In [55]:
print(vocab_size)
print(data['content'].count())
sentence_count = data['content'].count()

1125
200


In [56]:
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

In [57]:
word_to_idx

{'accord': 0,
 'gran': 1,
 'compani': 2,
 'plan': 3,
 'move': 4,
 'product': 5,
 'russia': 6,
 'although': 7,
 'grow': 8,
 'technopoli': 9,
 'develop': 10,
 'stage': 11,
 'area': 12,
 'less': 13,
 '100000': 14,
 'squar': 15,
 'meter': 16,
 'order': 17,
 'host': 18,
 'work': 19,
 'comput': 20,
 'technolog': 21,
 'telecommun': 22,
 'statement': 23,
 'said': 24,
 'intern': 25,
 'electron': 26,
 'industri': 27,
 'elcoteq': 28,
 'laid': 29,
 'ten': 30,
 'employe': 31,
 'tallinn': 32,
 'facil': 33,
 'contrari': 34,
 'earlier': 35,
 'layoff': 36,
 'contract': 37,
 'rank': 38,
 'offic': 39,
 'worker': 40,
 'daili': 41,
 'postime': 42,
 'report': 43,
 'new': 44,
 'plant': 45,
 'would': 46,
 'increas': 47,
 'capac': 48,
 'meet': 49,
 'expect': 50,
 'demand': 51,
 'improv': 52,
 'use': 53,
 'raw': 54,
 'materi': 55,
 'therefor': 56,
 'profit': 57,
 'updat': 58,
 'strategi': 59,
 'year': 60,
 '20092012': 61,
 'baswar': 62,
 'target': 63,
 'longterm': 64,
 'net': 65,
 'sale': 66,
 'growth': 67,
 'r

In [58]:
import torch
import torch.nn as nn
seed = 1
torch.manual_seed(seed)
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [59]:
test_size = 0.2
is_shuffle = True
texts = data['content'].to_list()
labels = data['sentiments'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
  texts, labels,
  test_size=test_size,
  random_state=seed,
  shuffle=is_shuffle
  )

In [60]:
train_size = len(X_train)
test_size = len(X_test)

In [61]:
EDGE = 2 # 0:d2w 1:d2w+w2w 2:d2w+w2w+d2d
NODE = 0 # 0:one-hot #1:BERT 
NUM_LAYERS = 2 
HIDDEN_DIM = 200
DROP_OUT = 0.5
LR = 0.02
WEIGHT_DECAY = 0
EARLY_STOPPING = 10
NUM_EPOCHS = 200

In [62]:
tokenize_sentences = []
doc_words = []
def transform(text, word_to_idx):
    for sentence in text:
        for w in sentence.split():
            if word_to_idx[w]:
                # w_ids = word_to_idx[w]
                doc_words.append(w)
            else:
                continue
        tokenize_sentences.append(doc_words)

transform(text = texts, word_to_idx=word_to_idx)
tokenize_sentences


[['gran',
  'compani',
  'plan',
  'move',
  'product',
  'russia',
  'although',
  'compani',
  'grow',
  'technopoli',
  'plan',
  'develop',
  'stage',
  'area',
  'less',
  '100000',
  'squar',
  'meter',
  'order',
  'host',
  'compani',
  'work',
  'comput',
  'technolog',
  'telecommun',
  'statement',
  'said',
  'intern',
  'electron',
  'industri',
  'compani',
  'elcoteq',
  'laid',
  'ten',
  'employe',
  'tallinn',
  'facil',
  'contrari',
  'earlier',
  'layoff',
  'compani',
  'contract',
  'rank',
  'offic',
  'worker',
  'daili',
  'postime',
  'report',
  'new',
  'product',
  'plant',
  'compani',
  'would',
  'increas',
  'capac',
  'meet',
  'expect',
  'increas',
  'demand',
  'would',
  'improv',
  'use',
  'raw',
  'materi',
  'therefor',
  'increas',
  'product',
  'profit',
  'compani',
  'updat',
  'strategi',
  'year',
  '20092012',
  'baswar',
  'target',
  'longterm',
  'net',
  'sale',
  'growth',
  'rang',
  '20',
  '40',
  'oper',
  'profit',
  'margin'

In [63]:
node_size = train_size + vocab_size + test_size

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from math import log
row = []
col = []
weight = []

In [65]:
if EDGE >= 1:
    window_size = 20
    total_W = 0
    word_occurrence = {}
    word_pair_occurrence = {}

    def ordered_word_pair(a, b):
        if a > b:
            return b, a
        else:
            return a, b

    def update_word_and_word_pair_occurrence(q):
        unique_q = list(set(q))
        for i in unique_q:
            try:
                word_occurrence[i] += 1
            except:
                word_occurrence[i] = 1
        for i in range(len(unique_q)):
            for j in range(i+1, len(unique_q)):
                word1 = unique_q[i]
                word2 = unique_q[j]
                word1, word2 = ordered_word_pair(word1, word2)
                try:
                    word_pair_occurrence[(word1, word2)] += 1
                except:
                    word_pair_occurrence[(word1, word2)] = 1


    for ind in tqdm(range(train_size+test_size)):
        words = tokenize_sentences[ind]
        q = []
        # push the first (window_size) words into a queue
        for i in range(min(window_size, len(words))):
            q += [word_to_idx[words[i]]]
        # update the total number of the sliding windows
        total_W += 1
        # update the number of sliding windows that contain each word and word pair
        update_word_and_word_pair_occurrence(q)

        now_next_word_index = window_size
        # pop the first word out and let the next word in, keep doing this until the end of the document
        while now_next_word_index<len(words):
            q.pop(0)
            q += [word_to_idx[words[now_next_word_index]]]
            now_next_word_index += 1
            # update the total number of the sliding windows
            total_W += 1
            # update the number of sliding windows that contain each word and word pair
            update_word_and_word_pair_occurrence(q)

    for word_pair in word_pair_occurrence:
        i = word_pair[0]
        j = word_pair[1]
        count = word_pair_occurrence[word_pair]
        word_freq_i = word_occurrence[i]
        word_freq_j = word_occurrence[j]
        pmi = log((count * total_W) / (word_freq_i * word_freq_j))
        if pmi <=0:
            continue
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)
        row.append(train_size + j)
        col.append(train_size + i)
        weight.append(pmi)

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [00:37<00:00,  5.39it/s]


In [74]:
#get each word appears in which document
word_doc_list = {}
for word in vocab:
    word_doc_list[word]=[]

for i in range(len(tokenize_sentences)):
    doc_words = tokenize_sentences[i]
    unique_words = set(doc_words)
    for word in unique_words:
        exsit_list = word_doc_list[word]
        exsit_list.append(i)
        word_doc_list[word] = exsit_list

#document frequency
word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

# term frequency
doc_word_freq = {}

for doc_id in range(len(tokenize_sentences)):
    words = tokenize_sentences[doc_id]
    for word in words:
        word_id = word_to_idx[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

In [76]:
for i in range(len(tokenize_sentences)):
    words = tokenize_sentences[i]
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_to_idx[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(tokenize_sentences) / word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

In [78]:
import nltk

if EDGE>=2:
    tokenize_sentences_set = [set(s) for s in tokenize_sentences]
    jaccard_threshold = 0.2
    for i in tqdm(range(len(tokenize_sentences))):
        for j in range(i+1, len(tokenize_sentences)):
            jaccard_w = 1 - nltk.jaccard_distance(tokenize_sentences_set[i], tokenize_sentences_set[j])
            if jaccard_w > jaccard_threshold:
                if i < train_size:
                    row.append(i)
                else:
                    row.append(i + vocab_size)
                if j < train_size:
                    col.append(j)
                else:
                    col.append(vocab_size + j)
                weight.append(jaccard_w)
                if j < train_size:
                    row.append(j)
                else:
                    row.append(j + vocab_size)
                if i < train_size:
                    col.append(i)
                else:
                    col.append(vocab_size + i)
                weight.append(jaccard_w)

In [79]:
import scipy.sparse as sp
adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

# build symmetric adjacency matrix
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [80]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo(), d_inv_sqrt
    
adj, norm_item = normalize_adj(adj + sp.eye(adj.shape[0]))


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape).to(device)

adj = sparse_mx_to_torch_sparse_tensor(adj)

In [81]:
import numpy as np
if NODE == 0:
    features = np.arange(node_size)
    features = torch.FloatTensor(features).to(device)
else:
    from flair.embeddings import TransformerDocumentEmbeddings, TransformerWordEmbeddings
    from flair.data import Sentence
    doc_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
    word_embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1',subtoken_pooling="mean")

    sent_embs = []
    word_embs = {}

    for ind in tqdm(range(train_size+test_size)):
        sent = tokenize_sentences[ind]
        sentence = Sentence(" ".join(sent[:512]),use_tokenizer=False)
        doc_embedding.embed(sentence)
        sent_embs.append(sentence.get_embedding().tolist())
        words = Sentence(" ".join(sent[:512]),use_tokenizer=False)
        word_embedding.embed(words)
        for token in words:
            word = token.text
            embedding = token.embedding.tolist()
            if word not in word_embs:
                word_embs[word] = embedding
            else:
                word_embs[word] = np.minimum(word_embs[word], embedding)

    word_embs_list = []
    for word in vocab:
        word_embs_list.append(word_embs[word])

    features = sent_embs[:train_size] + word_embs_list + sent_embs[train_size:]

    import scipy.sparse as sp
    def preprocess_features(features):
        """Row-normalize feature matrix and convert to tuple representation"""
        rowsum = np.array(features.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        features = r_mat_inv.dot(features)
        return features

    features = preprocess_features(sp.csr_matrix(features)).todense()
    features = torch.FloatTensor(features).to(device)

In [114]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module

class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """
    def __init__(self, in_features, out_features,drop_out=0,activation=None,bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation = activation 
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters_uniform()

    def reset_parameters_uniform(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        input = self.dropout(input)
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None and self.activation is not None:
            output = self.activation(output + self.bias)
            return output
        elif self.bias is not None and self.activation is not None:
            output = self.activation(output)
            return output
        elif self.bias is None and self.activation is not None:
            output = self.activation(output)
            return output
        else:
            return output
    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'


In [115]:
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, n_layers = 2):
        super(GCN, self).__init__()
        self.n_layers = n_layers
        self.gc_list = []
        if n_layers >= 2:
            self.gc1 = GraphConvolution(nfeat, nhid,activation = nn.ReLU(),drop_out=dropout)
            self.gc_list = nn.ModuleList([GraphConvolution(nhid, nhid, activation = nn.ReLU()) for _ in range(self.n_layers-2)])
            self.gcf = GraphConvolution(nhid, nclass,dropout)
        else:
            self.gc1 = GraphConvolution(nfeat, nclass, dropout)

    def forward(self, x, adj):
        if self.n_layers>=2:
            x = self.gc1(x, adj)
            for i in range(self.n_layers-2):
                x = self.gc_list[i](x,adj)
            x = self.gcf(x,adj)
        else:
            x = self.gc1(x, adj)
        return x

In [116]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
model = GCN(nfeat=node_size, nhid=HIDDEN_DIM, nclass=3, dropout=DROP_OUT,n_layers=NUM_LAYERS).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

In [98]:
def cal_accuracy(predictions,labels):
    pred = torch.argmax(predictions,-1).cpu().tolist()
    lab = labels.cpu().tolist()
    cor = 0
    for i in range(len(pred)):
        if pred[i] == lab[i]:
            cor += 1
    return cor/len(pred)

In [118]:
def generate_train_val(train_pro=0.9):
    real_train_size = int(train_pro*train_size)
    val_size = train_size-real_train_size
    idx_train = np.random.choice(train_size, real_train_size,replace=False)
    idx_train.sort()
    idx_val = []
    pointer = 0
    for v in range(train_size):
        if pointer<len(idx_train) and idx_train[pointer] == v:
            pointer +=1
        else:
            idx_val.append(v)
    idx_test = range(train_size+vocab_size, node_size)
    return idx_train, idx_val, idx_test

idx_train, idx_val, idx_test = generate_train_val()

In [121]:
idx_train = torch.tensor(idx_train, dtype=torch.long)
labels = torch.tensor(labels)  # Chuyển labels thành tensor

In [123]:
features = features.to(device)
adj = adj.to(device)
labels = labels.to(device)
idx_train = torch.tensor(idx_train).to(device)
idx_val = torch.tensor(idx_val).to(device)
idx_test = torch.tensor(idx_test).to(device)

  idx_train = torch.tensor(idx_train).to(device)


In [124]:
import time

def train_model(show_result = True):
    val_loss = []
    for epoch in range(NUM_EPOCHS):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        output= model(features, adj)
        loss_train = criterion(output[idx_train], labels[idx_train])
        acc_train = cal_accuracy(output[idx_train], labels[idx_train])
        loss_train.backward()
        optimizer.step()
        model.eval()
        output = model(features, adj)
        loss_val = criterion(output[idx_val], labels[idx_val])
        val_loss.append(loss_val.item())
        acc_val = cal_accuracy(output[idx_val], labels[idx_val])
        if show_result:
            print(  'Epoch: {:04d}'.format(epoch+1),
                    'loss_train: {:.4f}'.format(loss_train.item()),
                    'acc_train: {:.4f}'.format(acc_train),
                    'loss_val: {:.4f}'.format(loss_val.item()),
                    'acc_val: {:.4f}'.format(acc_val),
                    'time: {:.4f}s'.format(time.time() - t))
        
        if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
            if show_result:
                print("Early Stopping...")
            break

train_model()

Epoch: 0001 loss_train: 21.8206 acc_train: 0.2569 loss_val: 10.0518 acc_val: 0.1875 time: 0.5083s
Epoch: 0002 loss_train: 6.7571 acc_train: 0.7361 loss_val: 4.5132 acc_val: 0.6250 time: 0.0584s
Epoch: 0003 loss_train: 2.7782 acc_train: 0.7847 loss_val: 3.9218 acc_val: 0.6875 time: 0.0518s
Epoch: 0004 loss_train: 1.6826 acc_train: 0.8611 loss_val: 5.0532 acc_val: 0.8125 time: 0.0526s
Epoch: 0005 loss_train: 1.2085 acc_train: 0.8819 loss_val: 6.2136 acc_val: 0.8125 time: 0.0491s
Epoch: 0006 loss_train: 0.4823 acc_train: 0.8958 loss_val: 7.1488 acc_val: 0.8125 time: 0.0429s
Epoch: 0007 loss_train: 0.4348 acc_train: 0.9028 loss_val: 7.9223 acc_val: 0.8125 time: 0.0321s
Epoch: 0008 loss_train: 0.3473 acc_train: 0.9167 loss_val: 8.5890 acc_val: 0.8125 time: 0.0321s
Epoch: 0009 loss_train: 0.3330 acc_train: 0.9167 loss_val: 9.1677 acc_val: 0.8125 time: 0.0323s
Epoch: 0010 loss_train: 0.3326 acc_train: 0.9097 loss_val: 9.6714 acc_val: 0.8125 time: 0.0315s
Epoch: 0011 loss_train: 0.3855 acc_tra

In [107]:
features = torch.diag_embed(features.squeeze())

In [125]:
from sklearn.metrics import f1_score, accuracy_score
def test():
    model.eval()
    output = model(features, adj)
    predictions = torch.argmax(output[idx_test],-1).cpu().tolist()
    acc = accuracy_score(y_test,predictions)
    f11 = f1_score(y_test,predictions, average='macro')
    f12 = f1_score(y_test,predictions, average = 'weighted')
    return acc, f11, f12
print(test())

(0.85, 0.45945945945945943, 0.7810810810810811)


In [126]:
model

GCN(
  (gc1): GraphConvolution (1325 -> 200)
  (gc_list): ModuleList()
  (gcf): GraphConvolution (200 -> 3)
)