<a href="https://colab.research.google.com/github/ysj9909/NLP_practice/blob/main/TextCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Convolutional Neural Networks for Sentence Classification 코드 구현 연습**

 * paper link : https://aclanthology.org/D14-1181.pdf
 * code reference : https://github.com/graykode/nlp-tutorial/blob/master/2-1.TextCNN/TextCNN.ipynb


In [None]:
# Mount Google Drive into the Colab runtime's file system.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import torch
import torch.nn as nn

* dataset : MR
  * Movie reviews with one sentence per review. Classification involves detecting positive/negative reviews (Pang and Lee, 2005).

In [None]:
import os

# Use an absolute path (Drive is mounted at /content/drive) so this cell can be
# re-run safely: a relative chdir only works from the initial working directory
# and fails on the second execution.
DATA_DIR = "/content/drive/My Drive/pytorch-tutorial/nlp_practice/txt_sentoken"
os.chdir(DATA_DIR)

# os.listdir returns entries in arbitrary order; sort so every run processes
# the files (and therefore the reviews) in the same order.
neg_txt_files = sorted(os.listdir("./neg"))
print(neg_txt_files)


pos_txt_files = sorted(os.listdir("./pos"))
print(pos_txt_files)

['cv000_29416.txt', 'cv001_19502.txt', 'cv002_17424.txt', 'cv003_12683.txt', 'cv004_12641.txt', 'cv005_29357.txt', 'cv006_17022.txt', 'cv007_4992.txt', 'cv008_29326.txt', 'cv009_29417.txt', 'cv010_29063.txt', 'cv011_13044.txt', 'cv012_29411.txt', 'cv013_10494.txt', 'cv014_15600.txt', 'cv015_29356.txt', 'cv016_4348.txt', 'cv017_23487.txt', 'cv018_21672.txt', 'cv019_16117.txt', 'cv020_9234.txt', 'cv021_17313.txt', 'cv022_14227.txt', 'cv023_13847.txt', 'cv024_7033.txt', 'cv025_29825.txt', 'cv026_29229.txt', 'cv027_26270.txt', 'cv028_26964.txt', 'cv029_19943.txt', 'cv030_22893.txt', 'cv031_19540.txt', 'cv032_23718.txt', 'cv033_25680.txt', 'cv034_29446.txt', 'cv035_3343.txt', 'cv036_18385.txt', 'cv037_19798.txt', 'cv038_9781.txt', 'cv039_5963.txt', 'cv040_8829.txt', 'cv041_22364.txt', 'cv042_11927.txt', 'cv043_16808.txt', 'cv044_18429.txt', 'cv045_25077.txt', 'cv046_10613.txt', 'cv047_18725.txt', 'cv048_18380.txt', 'cv049_21917.txt']
['cv000_29590.txt', 'cv001_18431.txt', 'cv002_15918.txt',

In [None]:
def _read_review_lines(directory, file_names):
  """Read every file in `file_names` under `directory` and return all lines.

  Args:
    directory: Directory prefix ending with a path separator (e.g. "./pos/").
    file_names: Iterable of file names located inside `directory`.

  Returns:
    A flat list with one entry per line across all files, in file order.
  """
  collected = []
  for file_name in file_names:
    # The `with` block closes the file; no explicit close() is needed.
    with open(directory + file_name, "r") as f:
      lines = f.read().split("\n")
    # Drop only the empty string produced by a trailing newline. Slicing off
    # the last element unconditionally would discard a real review whenever
    # a file does not end with "\n".
    if lines and lines[-1] == "":
      lines.pop()
    collected.extend(lines)
  return collected


pos_reviews = _read_review_lines("./pos/", pos_txt_files)
neg_reviews = _read_review_lines("./neg/", neg_txt_files)


In [None]:
# Report how many review lines were collected for each class.
for review_list in (pos_reviews, neg_reviews):
  print(len(review_list))

# Balance the classes by keeping the first 1500 reviews of each polarity.
pos_reviews, neg_reviews = pos_reviews[:1500], neg_reviews[:1500]

1607
1541


In [None]:
# Load the pre-trained GoogleNews word2vec model.
import os
import gensim
from urllib.request import urlretrieve, urlopen

WORD2VEC_URL = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_PATH = "GoogleNews-vectors-negative300.bin.gz"

# Download only when no local copy exists, so re-running the notebook does not
# fetch the archive again. The file is first written under a temporary name and
# renamed afterwards, so an interrupted download is never mistaken for a
# complete one.
if not os.path.exists(WORD2VEC_PATH):
  partial_path = WORD2VEC_PATH + ".part"
  urlretrieve(WORD2VEC_URL, filename = partial_path)
  os.replace(partial_path, WORD2VEC_PATH)

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:

# Vocabulary: every distinct space-separated token in the corpus.
# Sorting makes the token -> index mapping identical across kernel restarts;
# iterating a plain set of strings depends on Python's per-process hash seed.
words = sorted({w for line in (pos_reviews + neg_reviews) for w in line.split(" ")})
vocab_size = len(words)

# Index 0 is reserved for the pad token, so real tokens start at 1.
word2idx = {word: idx for idx, word in enumerate(words, start = 1)}
idx2word = {idx: word for word, idx in word2idx.items()}


# Embedding matrix: row 0 is the pad token, row word2idx[w] is the vector of w.
Embedding = np.zeros((vocab_size + 1, 300))    # (vocab_size + 1, 300)
Embedding[0] = np.random.uniform(-0.25, 0.25, size = 300)     # pad token's dense vector
for word in words:
  if word in word2vec_model:
    # Known to word2vec: copy the pre-trained vector.
    Embedding[word2idx[word]] = word2vec_model[word]
  else:
    # Unknown to word2vec: random initialisation in [-0.25, 0.25).
    Embedding[word2idx[word]] = np.random.uniform(-0.25, 0.25, size = 300)


In [None]:
MAX_LEN = 50   # reviews are truncated / zero-padded to this many tokens

all_reviews = pos_reviews + neg_reviews
num_samples = len(all_reviews)

# Token-index matrix; unused positions keep 0, which is the pad index.
X = np.zeros((num_samples, MAX_LEN), dtype = int)
for t, line in enumerate(all_reviews):
  token_ids = [word2idx[w] for w in line.split(" ")[:MAX_LEN]]
  X[t, :len(token_ids)] = token_ids

# Vectorised embedding lookup: indexing the matrix with the whole index array
# gives (num_samples, MAX_LEN, 300) without a Python-level double loop.
inputs = Embedding[X]

# Labels: positive reviews come first in `all_reviews`, so they get label 1.
y = np.zeros((num_samples, ))     # (num_samples, )
y[:len(pos_reviews)] = 1

# Shuffle negative and positive reviews together with one shared permutation.
rand_ind = np.arange(num_samples)
np.random.shuffle(rand_ind)

inputs = inputs[rand_ind]
y = y[rand_ind]

inputs = torch.Tensor(inputs).unsqueeze(1)  # (num_samples, 1, MAX_LEN, 300)
targets = torch.LongTensor(y)

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Training hyper-parameters
num_epochs, batch_size = 50, 50
learning_rate = 1e-3

In [None]:
# CNN - static Model
class textCNN(nn.Module):
  """Convolutional sentence classifier with max-over-time pooling.

  One convolution per filter window slides over the token axis of the input,
  each feature map is reduced to its maximum over time, and the concatenated
  features go through dropout and a linear layer.

  Args:
    filter_windows: Sequence of window heights (number of tokens per filter).
    num_filters: Number of feature maps produced for each window size.
    dropout_ratio: Dropout probability applied before the linear layer.
    embedding_dim: Width of each token vector (last input dimension).
    num_classes: Number of output logits.
  """

  def __init__(self, filter_windows, num_filters, dropout_ratio = 0.5, embedding_dim = 300, num_classes = 2):
    super(textCNN, self).__init__()
    self.filter_windows = filter_windows
    self.filter_list = nn.ModuleList([nn.Conv2d(1, num_filters, kernel_size = (window, embedding_dim)) for window in filter_windows])
    # Size the classifier from the actual number of windows; a fixed factor
    # of 3 only works when exactly three window sizes are given.
    self.fc = nn.Linear(len(filter_windows) * num_filters, num_classes)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p = dropout_ratio)

  def forward(self, inputs):
    """Compute class logits.

    Args:
      inputs: Tensor of shape (batch_size, 1, seq_len, embedding_dim), where
        seq_len is at least the largest filter window.

    Returns:
      Tensor of shape (batch_size, num_classes).
    """
    pooled_outputs = []
    for conv in self.filter_list:
      h = self.relu(conv(inputs))    # (batch_size, num_filters, seq_len - filter_window + 1, 1)
      # Max over the time axis. Equivalent to a MaxPool2d spanning the whole
      # feature map, but independent of the sequence length and without
      # constructing a new module on every call.
      pooled = h.squeeze(3).max(dim = 2)[0]     # (batch_size, num_filters)
      pooled_outputs.append(pooled)

    total_conv_output = torch.cat(pooled_outputs, dim = -1)      # (batch_size, len(filter_windows) * num_filters)
    outputs = self.fc(self.dropout(total_conv_output))

    return outputs


In [None]:
# Build the classifier (filter windows of 3, 4 and 5 tokens, 100 filters each)
# and move its parameters to the selected device.
model = textCNN(filter_windows = [3, 4, 5], num_filters = 100).to(device)

In [None]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

num_samples = inputs.size(0)

# Train the model
model.train()   # make sure dropout is active, even if eval() was called earlier
for epoch in range(num_epochs):
  # Step through the whole dataset. Stopping at `num_samples - batch_size`
  # would skip the final mini-batch in every epoch.
  for i in range(0, num_samples, batch_size):
    # Named batch_* so the builtin `input` is not shadowed.
    batch_inputs = inputs[i : i + batch_size].to(device)
    batch_targets = targets[i : i + batch_size].to(device)

    outputs = model(batch_inputs)
    loss = criterion(outputs, batch_targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i % 100 == 0:
      print(f"Epoch [{epoch + 1} / {num_epochs}], Step [{i} / {num_samples}], Loss : {loss.item()}")
  


Epoch [1 / 50], Step [0 / 3000], Loss : 0.6940584778785706
Epoch [1 / 50], Step [100 / 3000], Loss : 0.6963173747062683
Epoch [1 / 50], Step [200 / 3000], Loss : 0.6821169257164001
Epoch [1 / 50], Step [300 / 3000], Loss : 0.6876261830329895
Epoch [1 / 50], Step [400 / 3000], Loss : 0.7338733077049255
Epoch [1 / 50], Step [500 / 3000], Loss : 0.7095551490783691
Epoch [1 / 50], Step [600 / 3000], Loss : 0.6877036094665527
Epoch [1 / 50], Step [700 / 3000], Loss : 0.6826093196868896
Epoch [1 / 50], Step [800 / 3000], Loss : 0.68904709815979
Epoch [1 / 50], Step [900 / 3000], Loss : 0.6641324758529663
Epoch [1 / 50], Step [1000 / 3000], Loss : 0.7174859046936035
Epoch [1 / 50], Step [1100 / 3000], Loss : 0.6510122418403625
Epoch [1 / 50], Step [1200 / 3000], Loss : 0.7125987410545349
Epoch [1 / 50], Step [1300 / 3000], Loss : 0.6606968641281128
Epoch [1 / 50], Step [1400 / 3000], Loss : 0.7286843061447144
Epoch [1 / 50], Step [1500 / 3000], Loss : 0.6621492505073547
Epoch [1 / 50], Step [

In [None]:
# Test the model
review = "this movie is so fun"
ws = review.split(" ")[:50]   # keep at most 50 tokens, the length used for training

# Words missing from the training vocabulary map to the pad index 0 instead of
# raising KeyError; the rest of the sequence is padded with 0 up to length 50.
w_list = [word2idx.get(word, 0) for word in ws]
w_list += [0] * (50 - len(w_list))

# Look up all 50 vectors at once and add batch and channel axes: (1, 1, 50, 300)
sample = torch.Tensor(Embedding[w_list]).unsqueeze(0).unsqueeze(1).to(device)

# Switch dropout off for inference (otherwise the prediction is stochastic)
# and skip gradient tracking.
model.eval()
with torch.no_grad():
  output = model(sample)

predict = output.max(1, keepdim = True)[1]
if predict[0][0].item() == 0:
  print(review, " is negative Mean..")
else:
  print(review, "is positive Mean!!")

this movie is so fun is positive Mean!!
