
Commit fb442f0

Add files via upload
1 parent ee85063 commit fb442f0

File tree

2 files changed: 139 additions & 0 deletions

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open(positive_data_file, "r", encoding="utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r", encoding="utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
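
For reference, a minimal sketch of how these helpers behave, assuming the file above is saved as data_helpers.py (the module name the training script below imports). The example sentence and toy arrays are invented for illustration and are not part of the commit:

import numpy as np
from data_helpers import clean_str, batch_iter

# clean_str splits off punctuation and contractions, then lowercases.
print(clean_str("It's a fine movie, isn't it"))
# -> "it 's a fine movie , is n't it"

# batch_iter walks a dataset in fixed-size slices, re-shuffling every epoch.
toy_data = list(zip(np.arange(10), np.arange(10) * 2))  # 10 (feature, label) pairs
for i, batch in enumerate(batch_iter(toy_data, batch_size=4, num_epochs=1, shuffle=False)):
    print("batch", i, batch.shape)
# -> batch 0 (4, 2), batch 1 (4, 2), batch 2 (2, 2)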
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
"""
Test the TextCNN class
2016/12/22
"""
import os
import sys
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn

from data_helpers import load_data_and_labels, batch_iter
from text_cnn import TextCNN


# Load original data
path = sys.path[0]
pos_filename = path + "/data/rt-polarity.pos"
neg_filename = path + "/data/rt-polarity.neg"

X_data, y_data = load_data_and_labels(pos_filename, neg_filename)
max_document_length = max([len(sen.split(" ")) for sen in X_data])
print("Max_document_length:", max_document_length)
# Create the vocabulary
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# Map each sentence to a sequence of word indices
x = np.array(list(vocab_processor.fit_transform(X_data)), dtype=np.float32)
y = np.array(y_data, dtype=np.int32)
vocabulary_size = len(vocab_processor.vocabulary_)
print("The size of vocabulary:", vocabulary_size)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1111)
print("X_train shape {0}, y_train shape {1}".format(X_train.shape, y_train.shape))
print("X_test shape {0}, y_test shape {1}".format(X_test.shape, y_test.shape))

# The parameters of the CNN
seq_len = X_train.shape[1]
vocab_size = vocabulary_size
embedding_size = 128
filter_sizes = [2, 3, 4]
num_filters = 128
num_classes = y_train.shape[1]
l2_reg_lambda = 0.0

# Construct the CNN model
text_cnn_model = TextCNN(seq_len=seq_len, vocab_size=vocab_size, embedding_size=embedding_size,
                         filter_sizes=filter_sizes, num_filters=num_filters, num_classes=num_classes)
loss = text_cnn_model.loss
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
accuracy = text_cnn_model.accuracy
# The parameters for training
batch_size = 64
training_epochs = 10
display_every = 1
dropout_keep_prob = 0.5

batch_num = int(X_train.shape[0] / batch_size)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print("Starting training...")
for epoch in range(training_epochs):
    avg_cost = 0
    for batch in range(batch_num):
        _, cost = sess.run([train_op, loss],
                           feed_dict={text_cnn_model.x: X_train[batch * batch_size:(batch + 1) * batch_size],
                                      text_cnn_model.y: y_train[batch * batch_size:(batch + 1) * batch_size],
                                      text_cnn_model.dropout_keep_prob: dropout_keep_prob})
        avg_cost += cost
    if epoch % display_every == 0:
        # Evaluate on the held-out split with dropout disabled
        cost, acc = sess.run([loss, accuracy],
                             feed_dict={text_cnn_model.x: X_test,
                                        text_cnn_model.y: y_test,
                                        text_cnn_model.dropout_keep_prob: 1.0})
        print("\nEpoch {0} : loss {1}, accuracy {2}".format(epoch, cost, acc))
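
The script above trains and evaluates the model but never persists the learned weights. A possible follow-up, not part of this commit, is to checkpoint the session with the same TF 1.x API the script already targets; the checkpoint directory below is an illustrative assumption, not a path from this repository:

# Optional checkpointing sketch (not in the original commit); the directory name is an assumption.
saver = tf.train.Saver()
checkpoint_dir = os.path.join(path, "runs")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
save_path = saver.save(sess, os.path.join(checkpoint_dir, "text_cnn.ckpt"))
print("Model checkpoint written to {0}".format(save_path))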
