In [1]:
import re
import multiprocessing
from tqdm import tqdm

import pandas as pd
from wordcloud import WordCloud, STOPWORDS
from scipy.stats import norm
from gensim.models import word2vec
# from kaggle.competitions import twosigmanews

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import numpy as np

%matplotlib inline

In [2]:
cpu_count = 2*multiprocessing.cpu_count()-1
print('Number of CPUs: {}'.format(cpu_count))

Number of CPUs: 55


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb

assert torch.cuda.is_available and torch.has_cudnn

In [4]:
torch.cuda.get_device_name(0)

'Tesla P40'

In [5]:
label_map = {
    '0':0, 
    '1':1, 
    '-1':2
}
label_map_reverse = ['Neutral', 'Positive', 'Negative']

In [6]:
TEXT = torchtext.data.ReversibleField(sequential=True,
                                      lower=True,
                                      include_lengths=True,
                                      batch_first=True)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, preprocessing=lambda x: label_map[x])

In [7]:
train_data = torchtext.data.TabularDataset('news_val_torch.csv',format='csv', 
                                                    skip_header = True,
                                                    fields=[('headline',TEXT),('sentimentClass',LABEL)])
val_data = torchtext.data.TabularDataset('news_test_torch.csv',format='csv', 
                                                  skip_header = True,
                                                  fields=[('headline',TEXT),('sentimentClass',LABEL)])

In [8]:
# train_data = torchtext.data.TabularDataset("news_torch_experiment.csv",format='csv', 
#                                                     skip_header = True,
#                                                     fields=[('headline',TEXT),('sentimentClass',LABEL)])
# val_data = torchtext.data.TabularDataset("news_torch_experiment.csv",format='csv', 
#                                                   skip_header = True,
#                                                   fields=[('headline',TEXT),('sentimentClass',LABEL)])

In [8]:
TEXT.build_vocab(train_data, val_data, 
                 vectors=torchtext.vocab.Vectors(name='wiki-news-300d-1M.vec'))

In [15]:
train_iter = torchtext.data.BucketIterator(train_data,
                                            batch_size=32,
                                            shuffle=True,
                                            sort_key=lambda x: data.interleave_keys(len(x.headline)),
                                            device=torch.device('cuda'))
test_iter = torchtext.data.BucketIterator(val_data,
                                            batch_size=32,
                                            shuffle=True,
                                            sort_key=lambda x: data.interleave_keys(len(x.headline)),
                                            device=torch.device('cuda'))

In [10]:
for batch in train_iter:
    print(batch.sentimentClass)
    break

tensor([1, 2, 2, 1, 0, 2, 1, 1, 2, 2, 1, 0, 2, 2, 2, 2, 1, 1, 0, 0, 1, 1, 2, 2,
        2, 0, 2, 2, 0, 2, 1, 1], device='cuda:0')


In [11]:
def test_model(test_iter, model):
    correct = 0
    total = 0
    model.eval()
    test_loss = 0.0
    i = 0
    for batch in test_iter:
        pred = model(batch.headline)
        outputs = F.softmax(pred, dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        loss = F.cross_entropy(pred, batch.sentimentClass)
        test_loss += loss.item()

        total += batch.sentimentClass.size(0)
        correct += predicted.eq(batch.sentimentClass.view_as(predicted)).sum().item()
        i+=1
        if i > 100:
            break
#     return (100 * correct / total), test_loss/len(test_iter)
    return (100 * correct / total), test_loss/i

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self, in_size, hidden_size, num_classes, vocab, kernel_size=3, dropout=0):
        super(NN, self).__init__()
        
        self.in_size, self.hidden_size = in_size, hidden_size
        self.num_classes = num_classes
        self.vocab = vocab
        
        self.embedding = nn.Embedding(self.vocab.vectors.shape[0],vocab.vectors.shape[1])
        self.embedding.weight = torch.nn.Parameter(self.vocab.vectors, requires_grad=False)
        
        self.conv1 = nn.Conv1d(in_size, hidden_size, kernel_size=kernel_size, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=1)
        
        self.linear = torch.nn.Linear(int(hidden_size), self.num_classes)
        
#         self.dropout = nn.Dropout(dropout)
    
    def forward(self, data):
        data = self.embedding(data[0])
        hidden = self.conv1(data.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(hidden.shape[0], hidden.shape[1], hidden.size(-1))
#         hidden = nn.dropout(hidden)
        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(hidden.shape[0], hidden.shape[1], hidden.size(-1))
        
        hidden,_ = torch.max(hidden, dim=1)
        pred = self.linear(hidden)
        pred = torch.tanh(pred)
        return pred

In [None]:
model = NN(in_size=300, hidden_size=100, num_classes=3, vocab=TEXT.vocab, kernel_size=1)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (batch) in enumerate(train_iter):
        model.train()
        optimizer.zero_grad()
        pred = model(batch.headline)
        loss = F.cross_entropy(pred,batch.sentimentClass)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        val_period = 1000
        if i > 0 and i % val_period == 0:
            # validate
            model.eval()
#             trn_acc, trn_loss = test_model(snli_train_iter_for_val, model)
#             trn_acc = None
            val_acc, val_loss = test_model(test_iter, model)
    
            print('Epoch: [{}/{}], Step: [{}/{}], Trn Acc: {:.2f}, Val Acc: {:.2f}, Trn Loss: {:.2f}, Val Loss: {:.2f}'
                  .format(epoch+1, num_epochs, i+1, len(train_iter), 0, val_acc,
                          running_loss/val_period, val_loss))
            running_loss = 0.0

Epoch: [1/20], Step: [1001/45466], Trn Acc: 0.00, Val Acc: 66.00, Trn Loss: 0.89, Val Loss: 0.79
Epoch: [1/20], Step: [2001/45466], Trn Acc: 0.00, Val Acc: 67.26, Trn Loss: 0.82, Val Loss: 0.77
Epoch: [1/20], Step: [3001/45466], Trn Acc: 0.00, Val Acc: 67.57, Trn Loss: 0.81, Val Loss: 0.75
Epoch: [1/20], Step: [4001/45466], Trn Acc: 0.00, Val Acc: 68.19, Trn Loss: 0.80, Val Loss: 0.74
Epoch: [1/20], Step: [5001/45466], Trn Acc: 0.00, Val Acc: 67.98, Trn Loss: 0.79, Val Loss: 0.74
Epoch: [1/20], Step: [6001/45466], Trn Acc: 0.00, Val Acc: 70.02, Trn Loss: 0.79, Val Loss: 0.72
Epoch: [1/20], Step: [7001/45466], Trn Acc: 0.00, Val Acc: 69.00, Trn Loss: 0.78, Val Loss: 0.73
Epoch: [1/20], Step: [8001/45466], Trn Acc: 0.00, Val Acc: 68.56, Trn Loss: 0.78, Val Loss: 0.72
Epoch: [1/20], Step: [9001/45466], Trn Acc: 0.00, Val Acc: 69.59, Trn Loss: 0.76, Val Loss: 0.72
Epoch: [1/20], Step: [10001/45466], Trn Acc: 0.00, Val Acc: 68.87, Trn Loss: 0.77, Val Loss: 0.72
Epoch: [1/20], Step: [11001/4

In [16]:
len(test_iter)

64195

# -------------------------------------

In [7]:
news_df = pd.read_csv("news_val_torch.csv")

In [26]:
from sklearn.model_selection import train_test_split
news_train_df, news_val_df = train_test_split(news_df, test_size=0.01)

In [32]:
news_val_df.to_csv("news_torch_experiment.csv", index=False)

In [31]:
news_val_df.head()

Unnamed: 0,headline,sentimentClass
1281950,CORRECTED-UPDATE 1-Snowden's father criticizes...,-1
917675,"Quarterly Results and Earnings Call Schedules,...",1
829658,"TEXT-S&P release on Fannie Mae, Freddie Mac",0
1051799,"COMSTOCK-ON FEB 29, CLOSED ON SALE OF SOME OF ...",0
1233417,"RPT-MCDONALD'S <MCD.N> CFO SAYS TO RE-IMAGE 1,...",0


# -------------------------------------

In [20]:
news_df = pd.read_csv("news_train_df.csv")

In [21]:
news_df = news_df[['headline',
                   'sentimentClass',
                   'time']]

In [22]:
news_test_df = news_df[ (news_df['time']>='2015-01-01')]
news_train_df = news_df[ (news_df['time']<'2015-01-01')]

In [23]:
news_test_df = news_test_df[['headline',
                             'sentimentClass']]
news_train_df = news_train_df[['headline',
                               'sentimentClass']]

In [24]:
from sklearn.model_selection import train_test_split
news_train_df, news_val_df = train_test_split(news_train_df, test_size=0.2)

In [25]:
news_train_df.to_csv('news_train_torch.csv', index=False)
news_val_df.to_csv('news_val_torch.csv', index=False)
news_test_df.to_csv('news_test_torch.csv', index=False)

# --------------------------------------------------------

In [3]:
news_df = pd.read_csv("news_train_df.csv")

In [6]:
news_df = news_df[['assetCodes', 
                    'assetName', 
                    'sentimentClass', 
                    'headline',
                    'time',
                    'sentimentNegative',
                    'sentimentNeutral',
                    'sentimentPositive']]

In [16]:
news_df.head()

Unnamed: 0,headline,sentimentClass
0,China's Daqing pumps 43.41 mln tonnes of oil i...,-1
1,"FEATURE-In kidnapping, finesse works best",-1
2,PRESS DIGEST - Wall Street Journal - Jan 1,-1
3,PRESS DIGEST - New York Times - Jan 1,-1
4,PRESS DIGEST - New York Times - Jan 1,-1


In [8]:
news_test_df = news_df[ (news_df['time']>='2015-01-01')]
news_train_df = news_df[ (news_df['time']<'2015-01-01')]

In [34]:
# def genData(news_df):
#     data = []
#     target = []
#     for index, row in news_df.iterrows():
#         data.append(row['headline'])
#         target.append([row['sentimentNegative'], row['sentimentNeutral'], row['sentimentPositive']])
#     return data, target

In [38]:
test_data, test_targets = news_test_df['headline'].values.tolist(), \
                            news_test_df[['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']].values.tolist()


In [39]:
from sklearn.model_selection import train_test_split
news_train_df_head = news_test_df.head(20000)
# train_data, train_target = genData(news_train_df_head)
train_data, train_targets = news_train_df_head['headline'].values.tolist(), \
                            news_train_df_head[['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']].values.tolist()

train_data, val_data, train_targets, val_targets = train_test_split(
    train_data, train_targets, test_size=0.2, random_state=42)

In [43]:
torchtext.data.Example.fromlist(val_data)

TypeError: zip argument #1 must support iteration