In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F

# Tensor type alias kept at notebook level for cells that want an explicit float type.
dtype = torch.FloatTensor

# Prefer the GPU when CUDA is usable, otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [2]:
 # TextCNN Parameter
# Notebook-wide defaults. make_data() rebinds every one of these for the
# dataset currently being processed.
num_classes = 3   # number of target classes
batch_size = 64   # mini-batch size handed to the DataLoaders
word_list = []    # all tokens of the corpus
vocab = []        # unique tokens
word2idx = {}     # token -> integer id; a mapping, so it starts as an empty dict
vocab_size = 0    # len(vocab)

In [3]:
def make_data(sentences, labels):
    """Build the vocabulary from ``sentences`` and encode each one as token ids.

    Side effects: rebinds the notebook-level globals ``num_classes``,
    ``batch_size``, ``word_list``, ``vocab``, ``word2idx`` and ``vocab_size``
    so that later cells see the statistics of this dataset.

    Args:
        sentences: sequence of strings; tokens are separated by whitespace.
        labels: sequence of hashable class labels, one per sentence.

    Returns:
        tuple: ``(inputs, targets)`` where ``inputs`` is a list of token-id
        lists (one per sentence) and ``targets`` is a list copy of ``labels``.
    """
    global num_classes, batch_size, word_list, vocab, word2idx, vocab_size

    num_classes = len(set(labels))
    batch_size = 64
    word_list = " ".join(sentences).split()
    # Sort the unique tokens: plain set iteration order for strings can change
    # between interpreter sessions, which would make token ids (and therefore
    # every run of the notebook) irreproducible.
    vocab = sorted(set(word_list))
    word2idx = {w: i for i, w in enumerate(vocab)}
    vocab_size = len(vocab)

    inputs = [[word2idx[tok] for tok in sen.split()] for sen in sentences]
    targets = list(labels)
    return inputs, targets

In [4]:
class TextCNN(nn.Module):
    """Convolutional sentence classifier.

    Token ids are embedded, passed through one 2-D convolution per filter
    size (each spanning the full embedding width), max-pooled over the
    sequence axis, concatenated, regularised with dropout and projected to
    class logits.

    Args:
        n_vocab: number of rows of the embedding table. Defaults to the
            notebook-level ``vocab_size``.
        n_classes: number of output logits. Defaults to the notebook-level
            ``num_classes``.
        pad_idx: index of the padding token. When omitted it is looked up as
            ``'<PAD>'`` in the notebook-level ``word2idx``; if that token is
            absent (no sentence needed padding) the embedding is built
            without a padding row instead of raising ``KeyError``.
    """

    def __init__(self, n_vocab=None, n_classes=None, pad_idx=None):
        super(TextCNN, self).__init__()
        self.filter_sizes = (2, 3, 4)   # heights (in tokens) of the conv kernels
        self.embed = 300                # embedding dimension
        self.num_filters = 256          # output channels per filter size
        self.num_classes = num_classes if n_classes is None else n_classes
        self.n_vocab = vocab_size if n_vocab is None else n_vocab

        if pad_idx is None and isinstance(word2idx, dict):
            # .get() yields None when the corpus contains no '<PAD>' token,
            # which nn.Embedding treats as "no padding index".
            pad_idx = word2idx.get('<PAD>')

        # The embedding row of the <PAD> token is fixed at zero via padding_idx.
        self.embedding = nn.Embedding(self.n_vocab, self.embed, padding_idx=pad_idx)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, self.num_filters, (k, self.embed)) for k in self.filter_sizes])

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(self.num_filters * len(self.filter_sizes), self.num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU, then max-pool over the whole sequence axis.

        ``x`` is the 4-D tensor produced in ``forward``; the kernel covers the
        full embedding width, so the last axis collapses to size 1 and is
        squeezed away before pooling.
        """
        x = F.relu(conv(x)).squeeze(3)
        # Kernel size == remaining length, i.e. one maximum per filter.
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        ``x`` is expected to be an integer tensor of shape (batch, seq_len),
        with seq_len at least as large as the biggest filter size.
        """
        out = self.embedding(x)
        # Add the single input channel that Conv2d expects.
        out = out.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

In [5]:
def cnn(train_dataset, test_dataset, num_epochs=1, log_every=10):
    """Train a fresh TextCNN on ``train_dataset`` and score it on ``test_dataset``.

    Uses the notebook-level ``batch_size`` and ``device``. The accuracy line
    is printed in the same format as before and is now also returned.

    Args:
        train_dataset: dataset yielding ``(token_ids, label)`` pairs for training.
        test_dataset: dataset yielding ``(token_ids, label)`` pairs for evaluation.
        num_epochs: number of passes over the training data (default 1).
        log_every: print the batch losses during every ``log_every``-th epoch;
            a falsy value disables loss logging.

    Returns:
        float: test accuracy in percent (0.0 when the test set is empty).
    """
    train_loader = Data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
    )
    # Accuracy does not depend on sample order, so evaluation is not shuffled.
    test_loader = Data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )
    model = TextCNN().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Training
    model.train()
    for epoch in range(num_epochs):
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            if log_every and (epoch + 1) % log_every == 0:
                print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss.item()))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            # Index of the largest logit per sample, kept as a column vector.
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    total = len(test_loader.dataset)
    # Guard against an empty test split instead of dividing by zero.
    accuracy = 100. * correct / total if total else 0.0
    print('Accuracy: {}/{} ({:.0f}%)\n'.format(correct, total, accuracy))
    return accuracy

In [6]:
def cnn_cv(df_name):
    """Load a preprocessed sentiment CSV and run cross-validated TextCNN training.

    Reads the ``sw_exclude`` column as text and the ``sentiment`` column as
    label, remaps labels that use -1 to non-negative class ids, pads or
    truncates every sentence to 64 tokens, encodes the corpus with
    ``make_data`` and hands the resulting arrays to ``cv``.

    Args:
        df_name: path of the CSV file to read.

    Raises:
        ValueError: if the labels contain -1 but the label set is neither
            {-1, 1} nor {-1, 0, 1}. Such labels used to be dropped silently,
            leaving labels and sentences misaligned.
    """
    df = pd.read_csv(df_name, header=0)
    # NOTE(review): header=0 already consumes the header line, so this slice
    # skips the first data row and keeps at most 999 rows. Kept as is to stay
    # comparable with earlier runs; confirm whether df[:1000] was intended.
    df = df[1:1000]
    sentences = list(df['sw_exclude'])
    labels = list(df['sentiment'])

    if -1 in labels:
        # CrossEntropyLoss needs class ids in [0, num_classes), so -1 is remapped.
        label_set = set(labels)
        if label_set == {-1, 1}:
            remap = {-1: 0, 1: 1}
        elif label_set == {-1, 0, 1}:
            remap = {-1: 2, 0: 0, 1: 1}
        else:
            raise ValueError(
                "unsupported label set {!r}; expected {{-1, 1}} or {{-1, 0, 1}}".format(label_set))
        labels = [remap[value] for value in labels]
        print('success')

    pad_token = '<PAD>'  # Fill in sentences of different lengths
    pad_size = 64        # Fill as the same length

    padded_sentences = []
    for raw in sentences:
        # Missing cells (NaN/None) become an all-padding sentence; any other
        # non-string cell is tokenised through its string form.
        tokens = [] if pd.isna(raw) else str(raw).split()
        tokens = tokens[:pad_size]
        tokens.extend([pad_token] * (pad_size - len(tokens)))
        padded_sentences.append(" ".join(tokens))

    input_batch, target_batch = make_data(padded_sentences, labels)
    input_x = np.array(input_batch)
    target = np.array(target_batch)

    cv(input_x, target)
    
    
        
    



In [7]:
def cv(input_x, target, n_splits=5, shuffle=False, random_state=None):
    """Run K-fold cross-validation, training and scoring one model per fold.

    Each fold's train/test rows are wrapped in ``TensorDataset`` objects and
    passed to ``cnn``, which prints that fold's accuracy.

    Args:
        input_x: 2-D integer array of encoded sentences, one row per sample.
        target: 1-D integer array of class ids aligned with ``input_x``.
        n_splits: number of folds (default 5).
        shuffle: shuffle the samples before splitting. The default keeps the
            original contiguous, in-order folds.
        random_state: seed for the shuffling; ignored unless ``shuffle`` is true.
    """
    from sklearn.model_selection import KFold

    # KFold rejects a random_state when shuffle is off, so only forward it
    # when it actually has an effect.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    for train_index, test_index in kf.split(input_x):
        train_dataset = Data.TensorDataset(
            torch.LongTensor(input_x[train_index]),
            torch.LongTensor(target[train_index]),
        )
        test_dataset = Data.TensorDataset(
            torch.LongTensor(input_x[test_index]),
            torch.LongTensor(target[test_index]),
        )
        cnn(train_dataset, test_dataset)
    

In [8]:
cnn_cv("email_preprocess.csv")

success
Accuracy: 14/21 (67%)

Accuracy: 6/21 (29%)

Accuracy: 13/21 (62%)

Accuracy: 13/21 (62%)

Accuracy: 14/20 (70%)



In [9]:
cnn_cv("covid_preprocess.csv")

success
Accuracy: 125/200 (62%)

Accuracy: 139/200 (70%)

Accuracy: 118/200 (59%)

Accuracy: 131/200 (66%)

Accuracy: 133/199 (67%)



In [10]:
cnn_cv("news_preprocess.csv")

success
Accuracy: 175/200 (88%)

Accuracy: 183/200 (92%)

Accuracy: 97/200 (48%)

Accuracy: 143/200 (72%)

Accuracy: 121/199 (61%)



In [11]:
cnn_cv("twitter_preprocess.csv")

Accuracy: 117/200 (58%)

Accuracy: 133/200 (66%)

Accuracy: 128/200 (64%)

Accuracy: 124/200 (62%)

Accuracy: 115/199 (58%)



In [12]:
cnn_cv("review_preprocess.csv")

success
Accuracy: 151/200 (76%)

Accuracy: 161/200 (80%)

Accuracy: 149/200 (74%)

Accuracy: 158/200 (79%)

Accuracy: 160/199 (80%)



In [13]:
cnn_cv("imdb_preprocess.csv")

success
Accuracy: 96/200 (48%)

Accuracy: 128/200 (64%)

Accuracy: 115/200 (58%)

Accuracy: 122/200 (61%)

Accuracy: 126/199 (63%)

