In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F

dtype = torch.FloatTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# 1.Import

In [2]:
df = pd.read_csv('imdb_preprocess.csv',header=0)
print('All:{}data'.format(len(df)))

All:10000data


In [3]:
df.head()

Unnamed: 0,ori_text,sw_include,sw_exclude,sentiment
0,I really liked this Summerslam due to the look...,i really like this summerslam due to the look ...,really like summerslam due look arena curtain ...,1
1,Not many television shows appeal to quite as m...,not many television show appeal to quite a man...,many television show appeal quite many differe...,1
2,The film quickly gets to a major chase scene w...,the film quickly get to a major chase scene wi...,film quickly get major chase scene ever increa...,-1
3,Jane Austen would definitely approve of this o...,jane austen would definitely approve of this o...,jane austen would definitely approve one gwyne...,1
4,Expectations were somewhat high for me when I ...,expectation be somewhat high for me when i go ...,expectation somewhat high go see movie think s...,-1


In [4]:
columns = df.columns
columns

Index(['ori_text', 'sw_include', 'sw_exclude', 'sentiment'], dtype='object')

# 2.Clean the data

In [5]:
sentences = list(df['ori_text'])
labels = list(df['sentiment'])
type(labels)
l = []
num_classes = len(set(labels))
if num_classes == 2: #II:+1\-1
    for i in labels:
        if i == -1:
            l.append(0)
        if i  == 1:
            l.append(1)
    
if num_classes == 3:
    for i in labels:
        if i == -1:
            l.append(2)
        if i == 0:
            l.append(0)
        if i == 1:
            l.append(1)
       
labels = l

In [6]:
PAD = ' <PAD>'  # Fill in sentences of different lengths
pad_size =  64     # Fill as the same length

for i in range(len(sentences)):
    sen2list = str(sentences[i]).split()
    sentence_len = len(sen2list)
    if sentence_len<pad_size:
        sentences[i] += PAD*(pad_size-sentence_len)
    else:
        sentences[i] = " ".join(sen2list[:pad_size])


# 3.TextCnn

In [7]:
# TextCNN Parameter
num_classes = len(set(labels))  # num_classes=2
batch_size = 64
word_list = " ".join(sentences).split()
vocab = list(set(word_list))
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_size = len(vocab)

In [8]:
def make_data(sentences, labels):
    inputs = []
    for sen in sentences:
        inputs.append([word2idx[n] for n in sen.split()])

    targets = []
    for out in labels:
        targets.append(out) # To using Torch Softmax Loss function
    return inputs, targets
input_batch, target_batch = make_data(sentences, labels)
input_x = np.array(input_batch)
target =  np.array(target_batch)
input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)


In [9]:
from sklearn.model_selection import train_test_split
#Split the train and test sets
x_train,x_test,y_train,y_test = train_test_split(input_x,target,test_size=0.2,random_state = 0)

train_dataset = Data.TensorDataset(torch.LongTensor(x_train), torch.LongTensor(y_train))
test_dataset = Data.TensorDataset(torch.LongTensor(x_test), torch.LongTensor(y_test))
dataset = Data.TensorDataset(input_batch, target_batch)

In [10]:
train_loader = Data.DataLoader(
    dataset=train_dataset,      # Data, encapsulated in the data.tensorDataset()
    batch_size=batch_size,      # size
    shuffle=True,               
    num_workers=2,              # multiprocess
)
test_loader = Data.DataLoader(
    dataset=test_dataset,      
    batch_size=batch_size,      
    shuffle=True,               
    num_workers=2, 
)


In [11]:
class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.filter_sizes = (2, 3, 4)
        self.embed = 300
        self.num_filters = 256
        self.dropout = 0.5
        self.num_classes = num_classes
        self.n_vocab = vocab_size
        #The  character is padded to 0 by padding_idx</pad>
        self.embedding = nn.Embedding(self.n_vocab, self.embed, padding_idx=word2idx['<PAD>'])
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, self.num_filters, (k, self.embed)) for k in self.filter_sizes])
        
        self.dropout = nn.Dropout(self.dropout)
        self.fc = nn.Linear(self.num_filters * len(self.filter_sizes), self.num_classes)
        
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x
        
    def forward(self, x):
        out = self.embedding(x)
        out = out.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out


# 4. build the Model

In [17]:
model = TextCNN().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training
for epoch in range(1):
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        if (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [18]:
test_acc_list = []
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)

        pred = output.max(1, keepdim=True)[1]                           
        correct += pred.eq(target.view_as(pred)).sum().item()

# test_loss /= len(test_loader.dataset)
# test_loss_list.append(test_loss)
test_acc_list.append(100. * correct / len(test_loader.dataset))
print('Accuracy: {}/{} ({:.0f}%)\n'.format(correct, len(test_loader.dataset),100. * correct / len(test_loader.dataset)))


Accuracy: 1362/2000 (68%)



In [19]:
torch.save(model,'Macintosh HD/users/Dell6/Desktop/1/model_1.pt')

FileNotFoundError: [Errno 2] No such file or directory: 'Macintosh HD/users/Dell6/Desktop/1/model_1.pt'

# 5. Save the model

In [14]:
#torch.save(model, PATH)

# 6.Load the model

In [15]:
#model = torch.load(PATH)
#model.eval()