## 필요한 모듈  import 

In [1]:
import pandas as pd
import numpy as np
import warnings
import os
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
SEED = 33

## 데이터 로드

In [2]:
DATA = './data'

In [3]:
train = pd.read_csv(os.path.join(DATA, 'train.tsv'), delimiter='\t')
test = pd.read_csv(os.path.join(DATA, 'test.tsv'), delimiter='\t')

In [4]:
unlabeled_train = pd.read_csv(os.path.join(DATA, 'unlabeled-train.tsv'), delimiter='\t', error_bad_lines=False)

b'Skipping line 43043: expected 2 fields, saw 3\n'


In [5]:
print(train.shape)
train.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
print(test.shape)
test.head()

(25000, 2)


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [7]:
print(unlabeled_train.shape)
unlabeled_train.head()

(49998, 2)


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


## 전처리

In [8]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

In [9]:
eng_stopwords = stopwords.words('english')

## Lemmatizer

In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
print(lemmatizer.lemmatize('runs'))
print(lemmatizer.lemmatize('ran'))
print(lemmatizer.lemmatize('run'))

print(lemmatizer.lemmatize('apple'))
print(lemmatizer.lemmatize('apples'))

run
ran
run
apple
apple


In [13]:
def process_lemma(sentence):        
    return [lemmatizer.lemmatize(word, 'v') for word in sentence]

In [14]:
def preprocessing(sentence):
    soup = BeautifulSoup(sentence, 'html.parser')
    cleaned = re.sub('[^a-zA-Z]', ' ', soup.text)
    cleaned = cleaned.lower()
    cleaned = [word for word in cleaned.split() if word not in eng_stopwords]
    cleaned = process_lemma(cleaned)
    return ' '.join(cleaned)

In [15]:
all_review = pd.concat([train['review'], unlabeled_train['review'], test['review']])

In [16]:
all_review_clean = all_review.apply(preprocessing)

In [17]:
all_review_clean.head()

0    stuff go moment mj start listen music watch od...
1    classic war worlds timothy hines entertain fil...
2    film start manager nicholas bell give welcome ...
3    must assume praise film greatest film opera ev...
4    superbly trashy wondrously unpretentious explo...
Name: review, dtype: object

## Tokenizer

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [19]:
tokenizer = Tokenizer(oov_token='<OOV>')

In [20]:
tokenizer.fit_on_texts(all_review_clean)

In [21]:
len(tokenizer.word_index)

126309

In [22]:
for i, word in enumerate(tokenizer.word_index):
    if i > 20:
        break
    print(word, tokenizer.word_index[word])

<OOV> 1
film 2
movie 3
one 4
make 5
like 6
see 7
get 8
time 9
good 10
character 11
go 12
watch 13
even 14
would 15
think 16
story 17
really 18
well 19
show 20
look 21


In [23]:
train_sentences = all_review_clean[:len(train)]
test_sentences = all_review_clean[-len(test):]

In [24]:
train_sentences.shape, test_sentences.shape

((25000,), (25000,))

In [25]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [26]:
train_sequences[0]

[397,
 12,
 463,
 11594,
 83,
 930,
 127,
 13,
 895,
 507,
 13,
 21106,
 13,
 19435,
 179,
 46,
 8,
 639,
 2250,
 66,
 16,
 18,
 469,
 3272,
 179,
 5,
 188,
 643,
 2110,
 1155,
 19435,
 58,
 4431,
 58,
 258,
 2,
 240,
 12,
 7,
 349,
 1643,
 255,
 1145,
 550,
 11594,
 59,
 773,
 2039,
 29,
 470,
 550,
 593,
 26,
 4231,
 1924,
 1032,
 175,
 420,
 1453,
 782,
 2209,
 6,
 11594,
 459,
 12,
 613,
 37,
 170,
 116,
 146,
 11594,
 34889,
 9295,
 5,
 3,
 11594,
 109,
 15,
 25,
 5,
 109,
 198,
 18,
 253,
 727,
 258,
 2,
 114,
 339,
 83,
 141,
 7788,
 3475,
 1502,
 311,
 781,
 6908,
 526,
 9123,
 785,
 593,
 1370,
 46,
 11594,
 242,
 26,
 558,
 11594,
 9786,
 505,
 12451,
 781,
 6908,
 11,
 3763,
 46,
 27,
 24,
 2666,
 593,
 413,
 8743,
 179,
 724,
 11594,
 127,
 64,
 469,
 94,
 6,
 11594,
 90,
 419,
 1904,
 130,
 1523,
 2147,
 311,
 29,
 68,
 113,
 3929,
 3388,
 36,
 2,
 22088,
 26,
 311,
 516,
 843,
 613,
 43,
 4,
 129,
 152,
 518,
 130,
 630,
 890,
 1120,
 423,
 55,
 1131,
 107,
 3,
 27,
 6,
 

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
MAX_LENGTH = 150

In [29]:
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, truncating='post', padding='post')
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, truncating='post', padding='post')

In [30]:
train_padded.shape, test_padded.shape

((25000, 150), (25000, 150))

In [31]:
train_labels = train['sentiment']

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
x_train, x_valid, y_train, y_valid = train_test_split(train_padded, train_labels, stratify=train_labels, test_size=0.1, random_state=SEED)

## Word2Vec

In [34]:
from gensim.models import KeyedVectors

In [37]:
word2vec = KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)

In [40]:
VOCAB_SIZE = len(tokenizer.word_index) + 1
EMBEDDING_DIM = 300

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

In [45]:
for word, idx in tokenizer.word_index.items():
    embedding_vector = word2vec[word] if word in word2vec else None
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [46]:
embedding_matrix.shape

(126310, 300)

In [54]:
tokenizer.word_index

{'<OOV>': 1,
 'film': 2,
 'movie': 3,
 'one': 4,
 'make': 5,
 'like': 6,
 'see': 7,
 'get': 8,
 'time': 9,
 'good': 10,
 'character': 11,
 'go': 12,
 'watch': 13,
 'even': 14,
 'would': 15,
 'think': 16,
 'story': 17,
 'really': 18,
 'well': 19,
 'show': 20,
 'look': 21,
 'much': 22,
 'end': 23,
 'know': 24,
 'say': 25,
 'bad': 26,
 'people': 27,
 'great': 28,
 'also': 29,
 'first': 30,
 'take': 31,
 'give': 32,
 'act': 33,
 'play': 34,
 'love': 35,
 'come': 36,
 'find': 37,
 'way': 38,
 'could': 39,
 'movies': 40,
 'seem': 41,
 'plot': 42,
 'work': 43,
 'two': 44,
 'many': 45,
 'want': 46,
 'never': 47,
 'life': 48,
 'try': 49,
 'best': 50,
 'little': 51,
 'ever': 52,
 'man': 53,
 'better': 54,
 'scene': 55,
 'still': 56,
 'scenes': 57,
 'part': 58,
 'feel': 59,
 'something': 60,
 'use': 61,
 'back': 62,
 'interest': 63,
 'lot': 64,
 'real': 65,
 'guy': 66,
 'thing': 67,
 'director': 68,
 'actors': 69,
 'funny': 70,
 'though': 71,
 'cast': 72,
 'star': 73,
 'years': 74,
 'live': 75,
 

In [55]:
embedding_matrix[3]

array([ 0.17480469, -0.10986328, -0.20019531,  0.26757812, -0.06396484,
        0.06689453,  0.07958984,  0.08398438,  0.12695312,  0.11621094,
        0.11523438, -0.13867188, -0.08203125, -0.00143433, -0.19824219,
        0.13574219, -0.03955078,  0.06933594, -0.2265625 , -0.20019531,
        0.03076172,  0.16015625, -0.04174805,  0.00427246,  0.09619141,
       -0.03320312,  0.02783203,  0.02124023,  0.13867188, -0.02075195,
       -0.31835938, -0.08837891, -0.23828125,  0.02490234,  0.06787109,
       -0.18066406,  0.27148438,  0.16210938,  0.04614258,  0.20410156,
        0.22949219, -0.03710938,  0.140625  ,  0.12890625, -0.22558594,
        0.03857422, -0.01300049,  0.00582886,  0.23144531,  0.1015625 ,
       -0.10351562, -0.10351562, -0.2578125 ,  0.16503906,  0.03686523,
       -0.32421875,  0.02893066, -0.11914062, -0.19238281,  0.00086594,
        0.06591797,  0.265625  , -0.15917969,  0.26171875, -0.18359375,
        0.13085938, -0.25      , -0.05541992,  0.27929688, -0.06

## Model

In [56]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

In [57]:
model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH, 
              weights=[embedding_matrix], 
              trainable=False,
             ), 
    Bidirectional(LSTM(128, return_sequences=True)), 
    Bidirectional(LSTM(128)), 
    Dropout(0.25), 
    Dense(32, activation='relu'), 
    Dense(1, activation='sigmoid')
])

In [58]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 300)          37893000  
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 256)          439296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 38,734,793
Trainable params: 841,793
Non-trainable params: 37,893,000
______________________________________

In [59]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [60]:
checkpoint_path = 'tmp/checkpoint.ckpt'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, 
                             save_best_only=True, 
                             save_weights_only=True, 
                             monitor='val_loss', 
                             verbose=1,
                            )

In [61]:
model.fit(x_train, y_train, 
          validation_data=(x_valid, y_valid), 
          batch_size=128, 
          epochs=20, 
          callbacks=[checkpoint]
         )

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.41215, saving model to tmp/checkpoint.ckpt
Epoch 2/20
Epoch 00002: val_loss improved from 0.41215 to 0.40678, saving model to tmp/checkpoint.ckpt
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.40678
Epoch 4/20
Epoch 00004: val_loss improved from 0.40678 to 0.34816, saving model to tmp/checkpoint.ckpt
Epoch 5/20
Epoch 00005: val_loss improved from 0.34816 to 0.34013, saving model to tmp/checkpoint.ckpt
Epoch 6/20
Epoch 00006: val_loss improved from 0.34013 to 0.31512, saving model to tmp/checkpoint.ckpt
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.31512
Epoch 8/20
Epoch 00008: val_loss improved from 0.31512 to 0.31192, saving model to tmp/checkpoint.ckpt
Epoch 9/20
Epoch 00009: val_loss improved from 0.31192 to 0.31036, saving model to tmp/checkpoint.ckpt
Epoch 10/20
Epoch 00010: val_loss did not improve from 0.31036
Epoch 11/20
Epoch 00011: val_loss did not improve from 0.31036
Epoch 12/20
Epoch 00012: val_loss

<tensorflow.python.keras.callbacks.History at 0x7f00f44e7668>

In [62]:
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f03081476a0>

In [63]:
model.evaluate(x_valid, y_valid)



[0.3103611469268799, 0.8655999898910522]

In [64]:
prediction = model.predict(test_padded)

In [65]:
prediction

array([[0.98802197],
       [0.02776259],
       [0.47275648],
       ...,
       [0.19362463],
       [0.99433774],
       [0.4398437 ]], dtype=float32)

In [66]:
prediction[prediction >= 0.5] = 1
prediction[prediction < 0.5] = 0

In [67]:
prediction

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [1.],
       [0.]], dtype=float32)

## Submission

In [68]:
submission = pd.read_csv(os.path.join(DATA, 'sampleSubmission.csv'))

In [69]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [70]:
submission['sentiment'] = prediction

In [71]:
submission['sentiment'].value_counts()

0.0    13750
1.0    11250
Name: sentiment, dtype: int64

In [72]:
submission['sentiment'] = submission['sentiment'].astype('int')

In [73]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [74]:
import datetime

In [75]:
timestring = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

In [76]:
filename = f'submission/submission-{timestring}.csv'

In [77]:
filename

'submission/submission-2020-10-01-15-05-39.csv'

In [78]:
submission.to_csv(filename, index=False)