In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F

# Tensor type alias; nothing in the visible cells references it.
dtype = torch.FloatTensor
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

import time

cpu


In [2]:
 # TextCNN Parameter
# Module-level defaults. make_data() reassigns every one of these once a
# dataset has been loaded; the values below are only placeholders.
num_classes = 3   # number of target classes
batch_size = 64   # mini-batch size used when building DataLoaders
word_list = []    # every token of the corpus, in order of appearance
vocab = []        # unique tokens
word2idx = {}     # token -> integer id; a mapping, so it starts as a dict
vocab_size = 0    # len(vocab)

In [3]:
def make_data(sentences, labels):
    """Build the vocabulary from ``sentences`` and encode them as token ids.

    Side effects: reassigns the module-level globals ``num_classes``,
    ``batch_size``, ``word_list``, ``vocab``, ``word2idx`` and ``vocab_size``
    so that later code (e.g. the model constructor) can read them.

    Args:
        sentences: iterable of whitespace-tokenisable strings.
        labels: iterable of class labels, one per sentence.

    Returns:
        (inputs, targets): ``inputs`` is a list with one list of token ids
        per sentence; ``targets`` is a list copy of ``labels``.
    """
    global num_classes, batch_size, word_list, vocab, word2idx, vocab_size

    num_classes = len(set(labels))
    batch_size = 64
    word_list = " ".join(sentences).split()
    # Sort the unique tokens: iteration order of a set of strings changes
    # between interpreter runs (hash randomisation), which would make the
    # token ids -- and therefore training results -- non-reproducible.
    vocab = sorted(set(word_list))
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    vocab_size = len(vocab)

    inputs = [[word2idx[token] for token in sen.split()] for sen in sentences]
    targets = list(labels)  # class indices, as expected by CrossEntropyLoss
    return inputs, targets

In [4]:
## The CNN model structure is adapted from https://github.com/graykode/nlp-tutorial and https://wmathor.com/index.php/archives/1445/.
## The function encapsulation, cross-validation, model training, and application to our datasets are my own code.
class TextCNN(nn.Module):
    """Convolutional text classifier over token-id sequences.

    The constructor takes no arguments; it reads the module-level globals
    ``num_classes``, ``vocab_size`` and ``word2idx`` (populated by
    ``make_data``), so the data must be prepared before a model is built.
    """

    def __init__(self):
        super(TextCNN, self).__init__()
        self.filter_sizes = (2, 3, 4)   # heights (in tokens) of the conv kernels
        self.embed = 300                # embedding dimension
        self.num_filters = 256          # output channels per kernel size
        self.dropout_rate = 0.5
        self.num_classes = num_classes
        self.n_vocab = vocab_size
        # The '<PAD>' token's embedding is pinned via padding_idx. When no
        # sentence needed padding the token is absent from the vocabulary;
        # fall back to None (no padding row) instead of raising KeyError.
        pad_idx = word2idx['<PAD>'] if '<PAD>' in word2idx else None
        self.embedding = nn.Embedding(self.n_vocab, self.embed, padding_idx=pad_idx)
        # One Conv2d per kernel height; each kernel spans the full embedding width.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, self.num_filters, (k, self.embed)) for k in self.filter_sizes])

        self.dropout = nn.Dropout(self.dropout_rate)
        self.fc = nn.Linear(self.num_filters * len(self.filter_sizes), self.num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU, then max-pool over the whole sequence axis.

        ``x`` is the 4-D tensor produced in ``forward`` (channel axis added
        by ``unsqueeze(1)``). The result has one value per filter for each
        batch element.
        """
        x = F.relu(conv(x)).squeeze(3)              # drop the width-1 last axis
        x = F.max_pool1d(x, x.size(2)).squeeze(2)   # global max over positions
        return x

    def forward(self, x):
        """Map a batch of token-id sequences to unnormalised class scores.

        The sequence length must be at least the largest kernel height
        (4), otherwise the corresponding convolution cannot be applied.
        """
        out = self.embedding(x)
        out = out.unsqueeze(1)  # add the single input channel expected by Conv2d
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

In [5]:

def cnn(train_dataset, test_dataset, epochs=1):
    """Train a fresh TextCNN on ``train_dataset`` and evaluate on ``test_dataset``.

    Reads the module-level ``batch_size`` and ``device``. Prints the test
    accuracy in the same format as before and additionally returns it so
    callers can aggregate results across folds.

    Args:
        train_dataset: dataset yielding ``(token_ids, label)`` pairs for training.
        test_dataset: dataset yielding ``(token_ids, label)`` pairs for evaluation.
        epochs: number of passes over the training data (default 1, the
            previously hard-coded value).

    Returns:
        Test accuracy as a percentage (0.0 when the test set is empty).
    """
    train_loader = Data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,              # load batches in worker processes
    )
    # Accuracy does not depend on evaluation order, so no shuffling here.
    test_loader = Data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )
    model = TextCNN().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Training
    model.train()
    for epoch in range(epochs):
        last_loss = None
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()
            last_loss = loss.item()

        # Report the last batch loss once every 10 epochs.
        if last_loss is not None and (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(last_loss))

    # Evaluation
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            pred = output.max(1, keepdim=True)[1]  # index of the highest score
            correct += pred.eq(target.view_as(pred)).sum().item()

    total = len(test_loader.dataset)
    accuracy = 100. * correct / total if total else 0.0
    print('Accuracy: {}/{} ({:.4f}%)\n'.format(correct, total, accuracy))
    return accuracy

In [6]:
def _remap_sentiment_labels(labels):
    """Map labels containing -1 onto non-negative class indices.

    Two distinct labels: -1 -> 0, 1 -> 1. Three distinct labels:
    -1 -> 2, 0 -> 0, 1 -> 1. Any other label set raises ``ValueError``
    rather than silently dropping rows and misaligning labels with sentences.
    """
    distinct = set(labels)
    if len(distinct) == 2:
        mapping = {-1: 0, 1: 1}
    elif len(distinct) == 3:
        mapping = {-1: 2, 0: 0, 1: 1}
    else:
        raise ValueError(
            "expected 2 or 3 distinct sentiment labels, got {}".format(len(distinct)))
    unexpected = [value for value in distinct if value not in mapping]
    if unexpected:
        raise ValueError("unexpected sentiment label(s): {!r}".format(unexpected))
    return [mapping[value] for value in labels]


def _pad_or_truncate(sentence, pad_size, pad_token='<PAD>'):
    """Return ``sentence`` as exactly ``pad_size`` space-separated tokens."""
    tokens = str(sentence).split()
    tokens = tokens + [pad_token] * (pad_size - len(tokens))  # no-op when long enough
    return " ".join(tokens[:pad_size])


def cnn_cv(df_name):
    """Load a preprocessed CSV, encode it and run cross-validation on it.

    The CSV must provide a ``sw_exclude`` text column and a ``sentiment``
    label column. Labels containing -1 are remapped to non-negative class
    indices; every sentence is padded with ``<PAD>`` or truncated to 64
    tokens. The encoded arrays are then handed to ``cv``.

    Args:
        df_name: path of the CSV file to read.

    Raises:
        ValueError: if the labels contain -1 but are not one of the
            supported label sets ({-1, 1} or {-1, 0, 1}).
    """
    df = pd.read_csv(df_name, header=0)
    # NOTE(review): this keeps rows 1..999, i.e. it skips the first data row
    # (the header is already consumed by header=0). Possibly an off-by-one
    # for "first 1000 rows" -- left unchanged; confirm intent.
    df = df[1:1000]
    sentences = list(df['sw_exclude'])
    labels = list(df['sentiment'])
    if -1 in labels:
        labels = _remap_sentiment_labels(labels)
        print('success')

    pad_size = 64  # common sentence length, in tokens
    sentences = [_pad_or_truncate(sentence, pad_size) for sentence in sentences]

    input_batch, target_batch = make_data(sentences, labels)
    input_x = np.array(input_batch)
    target = np.array(target_batch)

    cv(input_x, target)
    
    
        
    



In [7]:
def cv(input_x, target, n_splits=5, shuffle=False, random_state=None):
    """Run K-fold cross-validation, training and evaluating via ``cnn`` per fold.

    Args:
        input_x: array of encoded sentences, indexable by an index array.
        target: array of class labels aligned with ``input_x``.
        n_splits: number of folds (default 5, the previously hard-coded value).
        shuffle: whether KFold shuffles before splitting. The default keeps
            the original contiguous, unshuffled folds.
        random_state: seed forwarded to KFold; only meaningful with
            ``shuffle=True``.

    Returns:
        A list with whatever ``cnn`` returned for each fold, in fold order.
    """
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    fold_results = []
    for train_index, test_index in kf.split(input_x):
        x_train, y_train = input_x[train_index], target[train_index]
        x_test, y_test = input_x[test_index], target[test_index]
        # The timer covers tensor conversion, training and evaluation of the fold.
        time_start = time.perf_counter()
        train_dataset = Data.TensorDataset(torch.LongTensor(x_train), torch.LongTensor(y_train))
        test_dataset = Data.TensorDataset(torch.LongTensor(x_test), torch.LongTensor(y_test))

        fold_results.append(cnn(train_dataset, test_dataset))
        time_end = time.perf_counter()
        time_sum = time_end - time_start
        print("Time: ", time_sum)
    return fold_results
    

In [8]:
# Cross-validate the TextCNN on email_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("email_preprocess.csv")

success
Accuracy: 14/21 (66.6667%)

Time:  2.06857
Accuracy: 6/21 (28.5714%)

Time:  1.9667982999999998
Accuracy: 13/21 (61.9048%)

Time:  1.9344591000000007
Accuracy: 13/21 (61.9048%)

Time:  2.028944599999999
Accuracy: 14/20 (70.0000%)

Time:  1.9626888999999998


In [9]:
# Cross-validate the TextCNN on covid_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("covid_preprocess.csv")

success
Accuracy: 150/200 (75.0000%)

Time:  3.8709316000000005
Accuracy: 132/200 (66.0000%)

Time:  3.8303532999999987
Accuracy: 141/200 (70.5000%)

Time:  3.840728200000001
Accuracy: 129/200 (64.5000%)

Time:  3.8561229000000026
Accuracy: 124/199 (62.3116%)

Time:  3.8349684999999987


In [10]:
# Cross-validate the TextCNN on news_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("news_preprocess.csv")

success
Accuracy: 175/200 (87.5000%)

Time:  3.799990400000002
Accuracy: 184/200 (92.0000%)

Time:  3.8106683000000032
Accuracy: 93/200 (46.5000%)

Time:  3.869940100000001
Accuracy: 140/200 (70.0000%)

Time:  3.8367413
Accuracy: 187/199 (93.9698%)

Time:  3.7875736000000018


In [11]:
# Cross-validate the TextCNN on twitter_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("twitter_preprocess.csv")

Accuracy: 121/200 (60.5000%)

Time:  3.7566611999999964
Accuracy: 119/200 (59.5000%)

Time:  3.734092199999999
Accuracy: 119/200 (59.5000%)

Time:  3.7519259999999974
Accuracy: 104/200 (52.0000%)

Time:  3.7678377999999952
Accuracy: 118/199 (59.2965%)

Time:  3.7759908000000024


In [12]:
# Cross-validate the TextCNN on review_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("review_preprocess.csv")

success
Accuracy: 151/200 (75.5000%)

Time:  3.783072699999991
Accuracy: 160/200 (80.0000%)

Time:  3.792300499999996
Accuracy: 148/200 (74.0000%)

Time:  3.761511799999994
Accuracy: 158/200 (79.0000%)

Time:  3.778623499999995
Accuracy: 160/199 (80.4020%)

Time:  3.7596862999999985


In [13]:
# Cross-validate the TextCNN on imdb_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("imdb_preprocess.csv")

success
Accuracy: 138/200 (69.0000%)

Time:  3.9992792999999978
Accuracy: 137/200 (68.5000%)

Time:  3.9541651000000115
Accuracy: 123/200 (61.5000%)

Time:  3.9591570999999988
Accuracy: 138/200 (69.0000%)

Time:  3.9260771999999946
Accuracy: 129/199 (64.8241%)

Time:  3.943766999999994


In [14]:
# Cross-validate the TextCNN on reddit_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("reddit_preprocess.csv")

success
Accuracy: 80/200 (40.0000%)

Time:  3.819770500000004
Accuracy: 99/200 (49.5000%)

Time:  3.8299894999999964
Accuracy: 96/200 (48.0000%)

Time:  3.8114817000000016
Accuracy: 115/200 (57.5000%)

Time:  3.8146428999999955
Accuracy: 93/199 (46.7337%)

Time:  3.8340946000000002


In [15]:
# Cross-validate the TextCNN on paper_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("paper_preprocess.csv")

success
Accuracy: 3/4 (75.0000%)

Time:  1.7214724000000103
Accuracy: 0/3 (0.0000%)

Time:  1.6971166000000153
Accuracy: 2/3 (66.6667%)

Time:  1.7417875000000151
Accuracy: 2/3 (66.6667%)

Time:  1.711812000000009
Accuracy: 2/3 (66.6667%)

Time:  1.739483799999988


In [16]:
# Cross-validate the TextCNN on finance_preprocess.csv; accuracy and time are printed per fold.
cnn_cv("finance_preprocess.csv")

success
Accuracy: 113/200 (56.5000%)

Time:  3.842991600000005
Accuracy: 119/200 (59.5000%)

Time:  3.889595799999995
Accuracy: 100/200 (50.0000%)

Time:  3.964668500000016
Accuracy: 104/200 (52.0000%)

Time:  3.9238109000000065
Accuracy: 112/199 (56.2814%)

Time:  3.9006114000000025
