In [25]:
%pwd
# %cd ..

'/home/yukikongju/Projects/tidytuesday/financials_news_sentimentanalysis'

In [230]:
import pandas as pd
import torch
import nltk

from torch import optim, nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

### Get the data

In [27]:
df = pd.read_csv('data/data.csv')
df = df[:500]

In [28]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


### Build corpus

In [29]:
corpus = list(set([word.lower() for sentence in df['Sentence'].tolist() for word in sentence.split(' ')]))


In [65]:
len(corpus)

3110

### Build Word Vectorizer pipeline

In [228]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(Normalizer, self).__init__()
#         self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences):
        return [self.normalize(sentence) for sentence in sentences]
    
    def normalize(self, sentence):
        pass
    
class LemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(LemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            words.append(self.lemmatizer.lemmatize(word))
        return ' '.join(words)

class OneHotVectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(OneHotVectorizer, self).__init__()
        self.vectorizer = CountVectorizer(binary=True)
    
    def fit(self, sentences, labels = None):
        return self
    
    def transform(self, sentences):
        freqs = self.vectorizer.fit_transform(sentences)
        return freqs.toarray()
    

In [231]:
pipeline = Pipeline([
    ('normalizer', LemmerNormalizer()),
    ('vectorizer', OneHotVectorizer())
])


### Build Training and Testing Set

In [232]:
sentiment_dict = {'positive': 1, 'negative': 0, 'neutral': 2}
y = df['Sentiment'].apply(lambda x: sentiment_dict.get(str(x))).tolist()


X = torch.tensor(pipeline.fit_transform(df['Sentence'].tolist()), dtype=torch.long)

In [233]:


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [235]:
x_train.shape
print(X.shape[1])
print(X.shape)

2658
torch.Size([500, 2658])


### Build Model

In [211]:
class SimpleNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, num_classes):
        super(SimpleNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.num_classes = num_classes

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.fct1 = nn.Linear(self.embedding_dim, 64)
        self.fct2 = nn.Linear(64, self.num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.fct1(x)
        x = torch.tanh(self.fct2(x))
        return x

### Train Model

In [212]:
def training(n_epochs, model, loss_fn, optim, x_train, y_train):
    losses = []
    for epoch in range(n_epochs):
        for word_vec, label in zip(x_train, y_train):
            out = model(word_vec)
            loss = loss_fn(out, get_label_tensor(label))
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        losses.append([epoch, loss])
        
        print(f"Epoch {epoch}, Loss {loss}")

def get_label_tensor(label):
    tensor = torch.zeros(3, dtype=torch.long)
    tensor[sentiment_dict.get(label)] = 1.0
    return tensor

print(get_label_tensor('positive'))

tensor([0, 1, 0])


In [213]:
# vocab_size, embedding_dim, context_dim = len(corpus), 50, 2
vocab_size, embedding_dim, context_dim = X.shape[1], 50, 2

num_classes = len(df['Sentiment'].unique())
model = SimpleNN(vocab_size, embedding_dim, context_dim, num_classes)
optimizer = optim.SGD(model.parameters(), lr = 1e-4)
loss_fn = nn.NLLLoss()

In [214]:
training(100, model, loss_fn, optim, x_train, y_train)

ValueError: Expected input batch_size (2794) to match target batch_size (3).

In [223]:
# print(x_train[0])
embed = nn.Embedding(vocab_size, 50)
linear = nn.Linear(50, 3)
out1 = embed(x_train[0])
out2 = torch.sigmoid(linear(out1))
out2[:1].shape

torch.Size([1, 3])

In [134]:
layer = nn.Linear(X.shape[1], 3)
layer(x_train[0].unsqueeze(0))

tensor([[-0.0161, -0.0315, -0.0420]], grad_fn=<AddmmBackward0>)

In [218]:
train_data = TensorDataset(torch.FloatTensor(x_train), torch.FloatTensor(y_train))

TypeError: expected TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)) (got TensorOptions(dtype=long int, device=cpu, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)))