In [2]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Data source: https://www.kaggle.com/aashita/nyt-comments
df = pd.read_csv('../file/ArticlesApril2018.csv')

# Keep every headline except the 'Unknown' placeholder entries.
headline = [title for title in df['headline'].values if title != 'Unknown']

def repreprocessing(raw_sentence):
    """Normalize one headline for tokenization.

    Non-ASCII characters are dropped, every character listed in
    ``string.punctuation`` is removed, and the result is lower-cased.

    Args:
        raw_sentence: The headline text as a ``str``.

    Returns:
        The cleaned, lower-case ASCII string.
    """
    # Round-trip through bytes so that anything outside ASCII is discarded.
    ascii_only = raw_sentence.encode('utf8').decode('ascii', 'ignore')
    strip_punctuation = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punctuation).lower()

preprocessed_headline = list(map(repreprocessing, headline))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
# One slot more than the number of known words (index 0 shows up as the
# padding value once the sequences are padded).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Each headline contributes every prefix of length >= 2 of its encoded form:
# the last token of a prefix is the target, the tokens before it the context.
sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(preprocessed_headline)
    for end in range(2, len(encoded) + 1)
]

# Reverse lookup: integer index -> word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

print('빈도수 상위 582번 단어:{}'.format(index_to_word[582]))


3494
빈도수 상위 582번 단어:offer


In [24]:
# Length of the longest prefix sequence; all samples are left-padded to it.
max_len = max(map(len, sequences))
print(max_len)

sequences = np.array(pad_sequences(sequences, maxlen=max_len, padding='pre'))

# The final token of each row is the label, everything before it is the input.
X, y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the labels over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)
print(y[:3])
print(X[:3])

24
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  99]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  99 269]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  99 269 371]]


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
embedding_dim = 10
hidden_units = 128

# Embedding -> LSTM -> softmax over the vocabulary (next-word classifier).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200, verbose=2)


Epoch 1/200
244/244 - 3s - loss: 7.6289 - accuracy: 0.0306
Epoch 2/200
244/244 - 3s - loss: 7.1133 - accuracy: 0.0293
Epoch 3/200
244/244 - 3s - loss: 6.9708 - accuracy: 0.0360
Epoch 4/200
244/244 - 3s - loss: 6.8403 - accuracy: 0.0418
Epoch 5/200
244/244 - 3s - loss: 6.6842 - accuracy: 0.0464
Epoch 6/200
244/244 - 3s - loss: 6.5084 - accuracy: 0.0493
Epoch 7/200
244/244 - 4s - loss: 6.3146 - accuracy: 0.0496
Epoch 8/200
244/244 - 4s - loss: 6.1118 - accuracy: 0.0595
Epoch 9/200
244/244 - 4s - loss: 5.9159 - accuracy: 0.0614
Epoch 10/200
244/244 - 3s - loss: 5.7320 - accuracy: 0.0666
Epoch 11/200
244/244 - 4s - loss: 5.5569 - accuracy: 0.0720
Epoch 12/200
244/244 - 4s - loss: 5.3933 - accuracy: 0.0789
Epoch 13/200
244/244 - 4s - loss: 5.2369 - accuracy: 0.0833
Epoch 14/200
244/244 - 4s - loss: 5.0884 - accuracy: 0.0870
Epoch 15/200
244/244 - 4s - loss: 4.9450 - accuracy: 0.0982
Epoch 16/200
244/244 - 4s - loss: 4.8068 - accuracy: 0.1096
Epoch 17/200
244/244 - 4s - loss: 4.6731 - accura

<tensorflow.python.keras.callbacks.History at 0x1a1f6ea85b0>

In [26]:
def sentence_generation(model, tokenizer, current_word, n):
    """Greedily extend a seed text by up to ``n`` predicted words.

    At every step the text generated so far is encoded, left-padded to
    ``max_len - 1`` tokens, and fed to ``model``; the index with the highest
    score is mapped back to a word and appended.

    Args:
        model: Trained model whose ``predict`` returns one row of scores per
            input row, indexed by word index.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text that starts the sentence.
        n: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``n`` words are appended if the model predicts an
        index that has no word (for example the padding index 0).

    Note:
        Reads the module-level ``max_len`` and ``pad_sequences``; the cell that
        defines ``max_len`` must have been run first.
    """
    # Build the reverse lookup once instead of scanning word_index every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    sentence = current_word

    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')
        scores = model.predict(encoded, verbose=0)
        # argmax over axis 1 yields a length-1 array; take the scalar index.
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary entry for this index. Prediction is deterministic,
            # so retrying with the same text would give the same index: stop
            # instead of appending a stale or wrong word.
            break

        sentence = sentence + ' ' + word

    return sentence


print(sentence_generation(model, tokenizer, 'i', 10))
# Output recorded from a previous run: i want to be rich and im not sorry with say

i want to be rich and im not sorry with say


In [29]:
# Generate up to 10 more words from the seed 'how'; as the last expression of
# the cell, the returned string is displayed as the cell output.
sentence_generation(model, tokenizer, 'how', 10)

'how to make facebook more accountable to live in a bleached'

In [20]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

# Download the Project Gutenberg text (e-text 11) next to the notebook.
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

sentences = []

# Read as bytes so that non-ASCII bytes can simply be dropped while decoding.
# The `with` block closes the file even if a line fails to process.
with open('11-0.txt', 'rb') as f:
    for sentence in f:
        sentence = sentence.strip().lower().decode('ascii', 'ignore')
        # Skip lines that are empty after stripping and decoding.
        if len(sentence) > 0:
            sentences.append(sentence)

# Join all non-empty lines into one long string separated by single spaces.
total_data = ' '.join(sentences)

# Sorted list of the distinct characters that occur in the corpus.
char_vocab = sorted(list(set(total_data)))