working ABS code; TODO: insert EOS token for periods
bh2590 committed Dec 10, 2018
1 parent 11926bb commit 63615a6
Showing 10 changed files with 114,160 additions and 99,755 deletions.
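The TODO in the commit title — inserting an EOS token for periods — is not implemented in this commit. As a rough, hypothetical sketch only: the helper below shows one way sentence-final periods could be rewritten to an explicit end-of-sequence marker before tokenization. The <EOS> string, the regex rule, and the function name are assumptions, not code from this repository, and a real version would also have to add the marker to the tokenizer vocabulary.

import re

# Hypothetical EOS handling (not part of this commit): replace each
# sentence-ending period with an explicit marker so it survives word-level
# tokenization and the decoder can learn where a summary may stop.
EOS_TOKEN = '<EOS>'  # assumed token name

def add_eos_after_periods(text):
    # Naive rule: any period followed by whitespace or end-of-string ends a
    # sentence. Abbreviations such as 'e.g.' would need extra care.
    return re.sub(r'\.(\s+|$)', ' ' + EOS_TOKEN + r'\1', text)

print(add_eos_after_periods('Great phone. Battery lasts two days.'))
# -> 'Great phone <EOS> Battery lasts two days <EOS>'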
9,974 changes: 0 additions & 9,974 deletions abs_test_set_8.csv


11,396 changes: 11,396 additions & 0 deletions abs_test_set_8_all.csv


89,654 changes: 0 additions & 89,654 deletions abs_train_set_8.csv


102,553 changes: 102,553 additions & 0 deletions abs_train_set_8_all.csv


5 changes: 4 additions & 1 deletion config.py
@@ -82,7 +82,7 @@
help='whether to initialize from a pretrained embedding matrix')

parser.add_argument('--cold_start', type=ast.literal_eval,
default=True,
default=False,
help='whether to overwrite existing model_dir')

parser.add_argument('--prepare_embeddings', type=ast.literal_eval,
@@ -93,6 +93,9 @@
default=True,
help='whether embeddings have been preprocessed and dont need to be extracted again for all reviews')

parser.add_argument('--num_products_per_batch', type=int,
default=32,
help='the number of products per batch for ABS; actual batch size will be num_products_per_batch*abs_num_reviews')

args = parser.parse_args()

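The new --num_products_per_batch flag in config.py (default 32) notes in its help text that the actual batch size is num_products_per_batch * abs_num_reviews, because every product contributes a fixed number of reviews. A quick illustration of that arithmetic; the value 8 for abs_num_reviews is only inferred from the _8 suffix on the CSV file names and is not stated in this diff.

# Effective batch size implied by the new flag (illustrative values).
num_products_per_batch = 32   # default of the new --num_products_per_batch flag
abs_num_reviews = 8           # assumed value; see note above

effective_batch_size = num_products_per_batch * abs_num_reviews
print(effective_batch_size)   # 256 review sequences per training step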
53 changes: 37 additions & 16 deletions main_abs.py
@@ -13,10 +13,10 @@
import dill as pickle
import json

from model_data import train_input_fn
from model_data import train_input_fn, prepare_df
from modules import my_model
from config import args
os.environ['CUDA_VISIBLE_DEVICES']= "-1"
os.environ['CUDA_VISIBLE_DEVICES']= "0"

def make_config():
with open('cache/tokenizer.pkl', 'rb') as fi:
@@ -44,48 +44,68 @@ def make_config():
return params


def id_to_text(tokenizer, id_list):
max_lens= [len(seq) if 0 not in seq else seq.index(0) for seq in id_list]
words_list= tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(id_list)])
return words_list


def train_model(classifier, params, train_filename):
features_df, word_ids= prepare_df(asins2use_file= train_filename)
# Train the Model.
if args.debug == True:
maxsteps= None
else:
maxsteps= None
classifier.train(
input_fn=lambda: train_input_fn(asins2use_file= train_filename),
input_fn=lambda: train_input_fn(features_df, word_ids),
steps=maxsteps)


def test_model(classifier, params, test_filename):
features_df, word_ids= prepare_df(asins2use_file= test_filename)
# Test the Model.
predictions = classifier.predict(input_fn=lambda: train_input_fn(asins2use_file= test_filename))
predictions = classifier.predict(input_fn=lambda: train_input_fn(features_df, word_ids))
asin_list, summary_id_list=[], []
ae_ids_list, input_ids_list= [], []
for i, pred_dict in enumerate(predictions):
pdb.set_trace()
if i==0:
pdb.set_trace()
print ("Processing {}".format(i))
if args.debug == True and i > 50:
if args.debug == True and i > 500:
break
elif i == 100:
elif i == 500:
break
asin_list.append(pred_dict['asin'])
asin_list.append(pred_dict['asin'][0].decode())
summary_id_list.append(pred_dict['summary_ids'].tolist())
ae_ids_list.append(pred_dict['ae_word_ids'].tolist())
input_ids_list.append(pred_dict['input_word_ids'].tolist())

pdb.set_trace()
tokenizer= params['tokenizer']
max_lens= [len(seq) if 0 not in seq else seq.index(0) for seq in summary_id_list]
summary_words_list= tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(summary_id_list)])
summary_words_list= id_to_text(tokenizer, summary_id_list)
ae_words_list= [id_to_text(tokenizer, word_ids) for word_ids in ae_ids_list]
input_words_list= [id_to_text(tokenizer, word_ids) for word_ids in input_ids_list]

# out_dict= defaultdict(list)
ddict= defaultdict(list)
out_dict= OrderedDict()
for i, summary in enumerate(summary_words_list):
asin= asin_list[i].decode()
asin= asin_list[i]
out_dict[asin]= {}
out_dict[asin]['summary']= summary
# out_dict['asin'].append(asin_list[i].decode())
# out_dict['summary'].append(summary)
out_dict[asin]['ae_words_list']= ae_words_list[i]
out_dict[asin]['input_words_list']= input_words_list[i]
ddict['asin'].append(asin_list[i])
ddict['summary'].append(summary)
ddict['ae_words_list'].append(ae_words_list[i])
ddict['input_words_list'].append(input_words_list[i])

with open('results/abstractive_summaries.json', 'w') as fo:
json.dump(out_dict, fo, ensure_ascii=False, indent=2)

df= pd.DataFrame(ddict)
df.to_csv('results/abstractive_summaries.csv')


def safe_mkdir(directory):
if not os.path.exists(directory):
@@ -106,16 +126,17 @@ def run_model():
model_config= tf.estimator.RunConfig(model_dir=model_dir,
tf_random_seed=42,
log_step_count_steps=10,
save_checkpoints_steps=10,
save_checkpoints_steps=50,
keep_checkpoint_max=3)
classifier = tf.estimator.Estimator(
model_fn= my_model,
params= params,
config= model_config)

train_model(classifier, params, train_filename)
# train_model(classifier, params, train_filename)
pdb.set_trace()
# test_model(classifier, params, train_filename)
test_model(classifier, params, test_filename)


if args.debug == False:
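The main_abs.py changes move CSV parsing out of the Estimator input function into a single prepare_df call per train/test run, and factor the decoding of predicted id sequences into id_to_text, which truncates each sequence at its first padding id (0) before calling sequences_to_texts. The toy example below isolates that truncation step; it assumes the tokenizer pickled in cache/tokenizer.pkl is a Keras Tokenizer, which the sequences_to_texts call suggests, and the tiny corpus and padding length are invented for the example.

from tensorflow.keras.preprocessing.text import Tokenizer

# Toy tokenizer standing in for the one unpickled from cache/tokenizer.pkl.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(['great battery life', 'screen is sharp'])

# Two zero-padded id sequences, as the model predictions would contain.
id_list = tokenizer.texts_to_sequences(['great battery', 'screen is sharp'])
id_list = [ids + [0] * (5 - len(ids)) for ids in id_list]   # pad to length 5

# Same logic as id_to_text: cut each sequence at its first 0, then decode.
max_lens = [len(seq) if 0 not in seq else seq.index(0) for seq in id_list]
print(tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(id_list)]))
# -> ['great battery', 'screen is sharp']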
201 changes: 125 additions & 76 deletions main_cluster.py
@@ -144,27 +144,7 @@ def hub_encoder(features, labels, mode, params):
assert False


def main_preprocess_embeddings2(kwargs):
pdb.set_trace()
db_file= os.path.join(config.DATA_PATH, "embedding_db-{}.s3db".format(args.encoder_name))
conn= sqlite3.connect(db_file)
cur= conn.cursor()
cur.execute("drop table if exists reviews_embeddings;")
cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
"asin VARCHAR(255) PRIMARY KEY NOT NULL, "
"product_embs VARCHAR(255),"
"product_sentences VARCHAR(255),"
"sentence_parent VARCHAR(255))")

classifier = tf.estimator.Estimator(
model_fn= my_model,
params= params,
config= model_config)



preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
encoder= get_encoder()
def get_df_from_db(kwargs):
df= pd.read_csv('df2use_train.csv', encoding='latin1')
df_filt= df[df.num_reviews<=100].reset_index(drop=True)
if kwargs["products"] == "three":
@@ -175,63 +155,42 @@ def main_preprocess_embeddings2(kwargs):
asin_list = asin_list[0:1000]
else:
raise Exception ("Product group not recognized")
# reviews_indexer= SQLLiteIndexer(config.DATA_PATH)
try:
reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
values_to_insert= []
ddict= defaultdict(list)
write= True
for i, row_tup in enumerate(reviews_iterator):
asin, product_reviews= row_tup
product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
# product_sentences, product_embs, sentence_parent= product_reviews, np.random.rand(len(product_reviews)*100, 500), np.random.randint(0,1000, 100*len(product_reviews))
# insert_emb(cur, asin, product_embs.tolist(), product_sentences, sentence_parent)
values_to_insert.append((str(asin), str(product_embs.tolist()), str(product_sentences), str(sentence_parent)))

# ddict['asin'].append(asin)
# ddict['product_embs'].append(product_embs.tolist())
# ddict['product_sentences'].append(product_sentences)
# ddict['sentence_parent'].append(sentence_parent)
logging.info(i)
gc.collect()
reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
ddict= defaultdict(list)
for i, row_tup in enumerate(reviews_iterator):
asin, product_reviews= row_tup
ddict['asin'].append(asin)
ddict['product_reviews'].append(product_reviews)
ret_df= pd.DataFrame(ddict)
return ret_df

if i > 0 and i % 50 == 0:
insert_emb_many(cur, values_to_insert)
# insert_emb_csv(cur, values_to_insert, write)
write= False
ddict= defaultdict(list)
logging.info("Inserted {} total products to embeddings_db".format(i+1))
values_to_insert= []

if i > 0 and i % 500 == 0:
conn.commit()
logging.info("Commited {} total products to embeddings_db".format(i+1))
cur.execute('select count(*) from reviews_embeddings')
logging.info(cur.fetchone())
gc.collect()
sys.stdout.flush()
pdb.set_trace()
insert_emb_many(cur, values_to_insert)
conn.commit()
logging.info("Finished {} total products to embeddings_db".format(i+1))
cur.execute('select count(*) from reviews_embeddings')
logging.info(cur.fetchone())
pdb.set_trace()
test_indexer= SQLLiteEmbeddingsIndexer(args.encoder_name)
ddict= test_indexer[asin]
assert ddict['asin'] == asin
assert ddict['product_sentences'] == product_sentences
assert ddict['sentence_parent'] == sentence_parent
np.testing.assert_equal(ddict['product_embs'], product_embs)
except KeyboardInterrupt:
logging.info("Keyboard interrup, commiting remaining")
conn.commit()
except Exception as e:
logging.info("Error type: {}".format(type(e).__name__))
conn.rollback()
finally:
cur.close()
conn.close()
def encode_reviews(reviews_df, encoder, kwargs):
preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
ddict= defaultdict(list)
for i, row_tup in enumerate(reviews_df.itertuples()):
asin, product_reviews= row_tup.asin, row_tup.product_reviews
product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
ddict['asin'].append(asin)
ddict['product_sentences'].append(product_sentences)
ddict['product_embs'].append(product_embs)
ddict['sentence_parent'].append(sentence_parent)
return ddict


def feed_embeddings_to_db(kwargs, ddict):
db_file= os.path.join(config.DATA_PATH, "embedding_db-{}_1.s3db".format(args.encoder_name))
conn= sqlite3.connect(db_file)
cur= conn.cursor()
cur.execute("drop table if exists reviews_embeddings;")
cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
"asin VARCHAR(255) PRIMARY KEY NOT NULL, "
"product_embs VARCHAR(255),"
"product_sentences VARCHAR(255),"
"sentence_parent VARCHAR(255))")





def main_preprocess_embeddings(kwargs):
@@ -406,3 +365,93 @@ def main(kwargs):
main_preprocess_embeddings(vars(args))
else:
main(vars(args))


# def main_preprocess_embeddings2(kwargs):
# pdb.set_trace()
# db_file= os.path.join(config.DATA_PATH, "embedding_db-{}.s3db".format(args.encoder_name))
# conn= sqlite3.connect(db_file)
# cur= conn.cursor()
# cur.execute("drop table if exists reviews_embeddings;")
# cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
# "asin VARCHAR(255) PRIMARY KEY NOT NULL, "
# "product_embs VARCHAR(255),"
# "product_sentences VARCHAR(255),"
# "sentence_parent VARCHAR(255))")

# classifier = tf.estimator.Estimator(
# model_fn= my_model,
# params= params,
# config= model_config)



# preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
# encoder= get_encoder()
# df= pd.read_csv('df2use_train.csv', encoding='latin1')
# df_filt= df[df.num_reviews<=100].reset_index(drop=True)
# if kwargs["products"] == "three":
# asin_list= ['B00008OE43', 'B0007OWASE', 'B000EI0EB8']
# elif kwargs["products"] == "all":
# asin_list= df_filt.asin.tolist()[:]
# # Cap at 1000 products
# asin_list = asin_list[0:1000]
# else:
# raise Exception ("Product group not recognized")
# # reviews_indexer= SQLLiteIndexer(config.DATA_PATH)
# try:
# reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
# values_to_insert= []
# ddict= defaultdict(list)
# write= True
# for i, row_tup in enumerate(reviews_iterator):
# asin, product_reviews= row_tup
# product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
# # product_sentences, product_embs, sentence_parent= product_reviews, np.random.rand(len(product_reviews)*100, 500), np.random.randint(0,1000, 100*len(product_reviews))
# # insert_emb(cur, asin, product_embs.tolist(), product_sentences, sentence_parent)
# values_to_insert.append((str(asin), str(product_embs.tolist()), str(product_sentences), str(sentence_parent)))

# # ddict['asin'].append(asin)
# # ddict['product_embs'].append(product_embs.tolist())
# # ddict['product_sentences'].append(product_sentences)
# # ddict['sentence_parent'].append(sentence_parent)
# logging.info(i)
# gc.collect()

# if i > 0 and i % 50 == 0:
# insert_emb_many(cur, values_to_insert)
# # insert_emb_csv(cur, values_to_insert, write)
# write= False
# ddict= defaultdict(list)
# logging.info("Inserted {} total products to embeddings_db".format(i+1))
# values_to_insert= []

# if i > 0 and i % 500 == 0:
# conn.commit()
# logging.info("Commited {} total products to embeddings_db".format(i+1))
# cur.execute('select count(*) from reviews_embeddings')
# logging.info(cur.fetchone())
# gc.collect()
# sys.stdout.flush()
# pdb.set_trace()
# insert_emb_many(cur, values_to_insert)
# conn.commit()
# logging.info("Finished {} total products to embeddings_db".format(i+1))
# cur.execute('select count(*) from reviews_embeddings')
# logging.info(cur.fetchone())
# pdb.set_trace()
# test_indexer= SQLLiteEmbeddingsIndexer(args.encoder_name)
# ddict= test_indexer[asin]
# assert ddict['asin'] == asin
# assert ddict['product_sentences'] == product_sentences
# assert ddict['sentence_parent'] == sentence_parent
# np.testing.assert_equal(ddict['product_embs'], product_embs)
# except KeyboardInterrupt:
# logging.info("Keyboard interrup, commiting remaining")
# conn.commit()
# except Exception as e:
# logging.info("Error type: {}".format(type(e).__name__))
# conn.rollback()
# finally:
# cur.close()
# conn.close()
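In main_cluster.py the old monolithic main_preprocess_embeddings2 (preserved above only as the commented-out block) is split into smaller helpers: get_df_from_db loads the reviews for the selected ASINs, encode_reviews runs the sentence encoder over them, and feed_embeddings_to_db sets up the reviews_embeddings SQLite table. The rendered hunks do not show how these helpers are wired together, and feed_embeddings_to_db as shown only creates the table, so the driver below is merely a guess at the intended flow; get_encoder() is assumed to exist elsewhere in the file, as the old code used it.

# Hypothetical driver for the refactored helpers (not present in this diff).
def preprocess_embeddings(kwargs):
    reviews_df = get_df_from_db(kwargs)                  # ASINs + raw review text
    encoder = get_encoder()                              # sentence encoder
    ddict = encode_reviews(reviews_df, encoder, kwargs)  # embeddings per product
    feed_embeddings_to_db(kwargs, ddict)                 # create (and presumably populate) the SQLite table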
16 changes: 10 additions & 6 deletions model_data.py
@@ -30,9 +30,9 @@
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
OOV_TOKEN= '<OOV>'
BATCH_SIZE= 16
BATCH_SIZE= args.num_products_per_batch #number of products per batch
NUM_EPOCHS= 5
NUM_REVIEWS_K= args.abs_num_reviews
NUM_REVIEWS_K= args.abs_num_reviews #number of reviews per product


def db_cur_gen(cur):
@@ -114,8 +114,8 @@ def tf_data_gen(K= NUM_REVIEWS_K, maxlen= MAX_SEQUENCE_LENGTH):
return dataset


def train_input_fn(data_path= DATA_PATH, db_name= "reviews.s3db", asins2use_file= "abs_train_set_8.csv"):

def prepare_df(data_path= DATA_PATH, db_name= "reviews.s3db", asins2use_file= "abs_train_set_8.csv"):
with open('cache/tokenizer.pkl', 'rb') as fi:
tokenizer= pickle.load(fi)
review_iterator= TFReviewIterator(data_path, db_name, asins2use_file)
@@ -146,11 +146,15 @@ def tf_data_df(K= NUM_REVIEWS_K, maxlen= MAX_SEQUENCE_LENGTH):
ret_df= pd.DataFrame(ddict)
word_ids= np.vstack(data_batch_list)
return ret_df, word_ids

# pdb.set_trace()

features_df, word_ids= tf_data_df()
print("Features Dataframe shape: {}".format(features_df.shape))
print("Word IDs data batch Dataframe shape: {}".format(word_ids.shape))
return features_df, word_ids


def train_input_fn(features_df, word_ids):
pdb.set_trace()
features= dict(features_df)
features['data_batch']= word_ids
# pdb.set_trace()
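In model_data.py, train_input_fn no longer reads the ASIN CSV itself; that work moves into prepare_df, and the resulting dataframe and padded id matrix are passed in from the caller. Condensed, the training path after this commit reads roughly as below (debug branches and pdb traces dropped); prepare_df and train_input_fn are the project's own symbols, imported as the new main_abs.py does.

from model_data import prepare_df, train_input_fn   # imported as in the new main_abs.py

def train(classifier, train_filename):
    # Heavy CSV/tokenizer work happens once, outside the Estimator's input_fn ...
    features_df, word_ids = prepare_df(asins2use_file=train_filename)
    # ... which now only closes over the precomputed dataframe and id matrix.
    classifier.train(input_fn=lambda: train_input_fn(features_df, word_ids),
                     steps=None)   # None = train until the input dataset is exhausted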