In [133]:
import pandas as pd
import nltk
import re
import unidecode
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [134]:
import torch
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [135]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [136]:
columns = ["sentiments","content"]
data = pd.read_csv("all-data.csv",names=columns, encoding='ISO-8859-1')
data = data.head(200)
data

Unnamed: 0,sentiments,content
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
195,positive,Finnish messaging solutions developer Tecnomen...
196,positive,Finnish metal industry solutions supplier Outo...
197,positive,Finnish metal products company Componenta Oyj ...
198,positive,Finnish office supplies and computer accessori...


In [137]:
classes = {
  class_name: idx for idx, class_name in enumerate(data['sentiments'].unique().tolist())
    }
data['sentiments'] = data['sentiments'].apply(lambda x: classes[x])

In [138]:
data.head()

Unnamed: 0,sentiments,content
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [139]:
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()
def text_normalize(text):
  text = text.lower()
  text = unidecode.unidecode(text)
  text = text.strip()
  text = re.sub(r'[^\w\s]', '', text)
  text = ' '.join([word for word in text.split(' ') if word not in
  english_stop_words])
  text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
  return text

In [140]:
text_normalize("I like running")

'like run'

In [141]:
data['content'] = data['content'].apply(lambda x: text_normalize(x))
data.head(30)

Unnamed: 0,sentiments,content
0,0,accord gran compani plan move product russia ...
1,0,technopoli plan develop stage area less 100000...
2,1,intern electron industri compani elcoteq laid ...
3,2,new product plant compani would increas capac ...
4,2,accord compani updat strategi year 20092012 b...
5,2,financ aspocomp growth aspocomp aggress pursu ...
6,2,last quarter 2010 componenta net sale doubl e...
7,2,third quarter 2010 net sale increas 52 eur 2...
8,2,oper profit rose eur 131 mn eur 87 mn correspo...
9,2,oper profit total eur 211 mn eur 186 mn 2007 ...


In [142]:
vocab = []
for sentence in data['content'].tolist():
  tokens = sentence.split()
  for token in tokens:
    if token not in vocab:
      vocab.append(token)
 
vocab.append('UNK')
vocab.append('PAD')
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [143]:
vocab

['accord',
 'gran',
 'compani',
 'plan',
 'move',
 'product',
 'russia',
 'although',
 'grow',
 'technopoli',
 'develop',
 'stage',
 'area',
 'less',
 '100000',
 'squar',
 'meter',
 'order',
 'host',
 'work',
 'comput',
 'technolog',
 'telecommun',
 'statement',
 'said',
 'intern',
 'electron',
 'industri',
 'elcoteq',
 'laid',
 'ten',
 'employe',
 'tallinn',
 'facil',
 'contrari',
 'earlier',
 'layoff',
 'contract',
 'rank',
 'offic',
 'worker',
 'daili',
 'postime',
 'report',
 'new',
 'plant',
 'would',
 'increas',
 'capac',
 'meet',
 'expect',
 'demand',
 'improv',
 'use',
 'raw',
 'materi',
 'therefor',
 'profit',
 'updat',
 'strategi',
 'year',
 '20092012',
 'baswar',
 'target',
 'longterm',
 'net',
 'sale',
 'growth',
 'rang',
 '20',
 '40',
 'oper',
 'margin',
 '10',
 'financ',
 'aspocomp',
 'aggress',
 'pursu',
 'increasingli',
 'focus',
 'hdi',
 'print',
 'circuit',
 'board',
 'pcb',
 'last',
 'quarter',
 '2010',
 'componenta',
 'doubl',
 'eur131m',
 'eur76m',
 'period',
 'zer

In [144]:
print(vocab_size)
print(data['content'].count())
sentence_count = data['content'].count()

1127
200


In [145]:
def transform(text, word_to_idx, max_seq_len):
  tokens = []
  for w in text.split():
    try:
        w_ids = word_to_idx[w]
    except:
        w_ids = word_to_idx['UNK']
        tokens.append(w_ids)
 
  if len(tokens) < max_seq_len:
    tokens += [word_to_idx['PAD']] * (max_seq_len-len(tokens))
  elif len(tokens) > max_seq_len:
    tokens = tokens[:max_seq_len]
 
  return tokens

In [146]:
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

In [147]:
word_to_idx

{'accord': 0,
 'gran': 1,
 'compani': 2,
 'plan': 3,
 'move': 4,
 'product': 5,
 'russia': 6,
 'although': 7,
 'grow': 8,
 'technopoli': 9,
 'develop': 10,
 'stage': 11,
 'area': 12,
 'less': 13,
 '100000': 14,
 'squar': 15,
 'meter': 16,
 'order': 17,
 'host': 18,
 'work': 19,
 'comput': 20,
 'technolog': 21,
 'telecommun': 22,
 'statement': 23,
 'said': 24,
 'intern': 25,
 'electron': 26,
 'industri': 27,
 'elcoteq': 28,
 'laid': 29,
 'ten': 30,
 'employe': 31,
 'tallinn': 32,
 'facil': 33,
 'contrari': 34,
 'earlier': 35,
 'layoff': 36,
 'contract': 37,
 'rank': 38,
 'offic': 39,
 'worker': 40,
 'daili': 41,
 'postime': 42,
 'report': 43,
 'new': 44,
 'plant': 45,
 'would': 46,
 'increas': 47,
 'capac': 48,
 'meet': 49,
 'expect': 50,
 'demand': 51,
 'improv': 52,
 'use': 53,
 'raw': 54,
 'materi': 55,
 'therefor': 56,
 'profit': 57,
 'updat': 58,
 'strategi': 59,
 'year': 60,
 '20092012': 61,
 'baswar': 62,
 'target': 63,
 'longterm': 64,
 'net': 65,
 'sale': 66,
 'growth': 67,
 'r

In [148]:
import torch
import torch.nn as nn
seed = 1
torch.manual_seed(seed)
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [149]:
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = data['content'].tolist()
labels = data['sentiments'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
texts, labels,
test_size=val_size,
random_state=seed,
shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
X_train, y_train,
test_size=val_size,
random_state=seed,
shuffle=is_shuffle
)

In [150]:
class FinancialNews(Dataset):
  def __init__(
  self,
  X, y,
  word_to_idx,
  max_seq_len,
  transform=None
  ):
    self.texts = X
    self.labels = y
    self.word_to_idx = word_to_idx
    

    self.max_seq_len = max_seq_len
    self.transform = transform
 
  def __len__(self):
    return len(self.texts)
 
  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]
 
    if self.transform:
        text = self.transform(
        text,
        self.word_to_idx,
        self.max_seq_len
        )
    text = torch.tensor(text)
 
    return text, label

In [151]:
max_seq_len = 32
 
train_dataset = FinancialNews(
  X_train, y_train,
  word_to_idx=word_to_idx,
  max_seq_len=max_seq_len,
  transform=transform
  )
val_dataset = FinancialNews(
  X_val, y_val,
  word_to_idx=word_to_idx,
  max_seq_len=max_seq_len,
  transform=transform
  )
test_dataset = FinancialNews(
  X_test, y_test,
  word_to_idx=word_to_idx,
  max_seq_len=max_seq_len,
  transform=transform
  )
 
train_batch_size = 128
test_batch_size = 8
 
train_loader = DataLoader(
  train_dataset,
  batch_size=train_batch_size,
  shuffle=True
  )
val_loader = DataLoader(
  val_dataset,
  batch_size=test_batch_size,
  shuffle=False
  )
test_loader = DataLoader(
  test_dataset,
  batch_size=test_batch_size,
  shuffle=False
  )

In [None]:
import torch.nn as nn

class SentimentClassifier(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim,
        hidden_size, n_layers, n_classes,
        dropout_prob
    ):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, n_classes)

    def forward(self, x):
        x = self.embedding(x)              # Embedding layer
        x, hn = self.rnn(x)                # RNN layer
        x = x[:, -1, :]                    # Lấy output của bước thời gian cuối
        x = self.norm(x)                   # Normalization
        x = self.dropout(x)                # Dropout
        x = self.fc1(x)                    # Fully connected layer 1
        x = self.relu(x)                   # Activation
        x = self.fc2(x)                    # Fully connected layer 2 (output)
        return x


In [153]:
n_classes = len(list(classes.keys()))
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
model = SentimentClassifier(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  hidden_size=hidden_size,
  n_layers=n_layers,
  n_classes=n_classes,
  dropout_prob=dropout_prob
  ).to(device)

In [154]:
lr = 1e-4
epochs = 50
 
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
model.parameters(),
lr=lr
 )

In [155]:
def fit(
  model,
  train_loader,
  val_loader,
  criterion,
  optimizer,
 device,
  epochs
  ):
    train_losses = []
    val_losses = []
 
    for epoch in range(epochs):
        batch_train_losses = []
        model.train()
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
        
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
 
            batch_train_losses.append(loss.item())
 
    train_loss = sum(batch_train_losses) / len(batch_train_losses)
    train_losses.append(train_loss)
 
    val_loss, val_acc = evaluate(
        model, val_loader,
        criterion, device
    )
    val_losses.append(val_loss)
 
    print(f"EPOCH {epoch + 1}:\tTrain loss: {train_loss:.4f}\tVal loss: {val_loss:.4f}")

 
    return train_losses, val_losses
def evaluate(model, dataloader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    losses = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            losses.append(loss.item())
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
    loss = sum(losses) / len(losses)
    acc = correct / total
    
    return loss, acc

In [156]:
train_losses, val_losses = fit(
  model,
  train_loader,
  val_loader,
  criterion,
  optimizer,
  device,
  epochs
) 

EPOCH 50:	Train loss: 0.6282	Val loss: 0.6304


In [158]:
val_loss, val_acc = evaluate(
    model,
    val_loader,
    criterion,
    device
 )
test_loss, test_acc = evaluate(
    model,
    test_loader,
    criterion,
    device)
print('Evaluation on val/test dataset')
print('Val accuracy: ', val_acc)
print('Test accuracy: ', test_acc)

Evaluation on val/test dataset
Val accuracy:  0.85
Test accuracy:  0.90625
