working ABS code; TODO: insert EOS token for periods
bh2590 committed Dec 10, 2018
1 parent 11926bb commit 63615a6
Showing 10 changed files with 114,160 additions and 99,755 deletions.
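The TODO in the commit title — inserting an EOS token for periods — is not implemented in this commit. As a rough, hypothetical sketch only: the helper below shows one way sentence-final periods could be rewritten to an explicit end-of-sequence marker before tokenization. The <EOS> string, the regex rule, and the function name are assumptions, not code from this repository, and a real version would also have to add the marker to the tokenizer vocabulary.

import re

# Hypothetical EOS handling (not part of this commit): replace each
# sentence-ending period with an explicit marker so it survives word-level
# tokenization and the decoder can learn where a summary may stop.
EOS_TOKEN = '<EOS>'  # assumed token name

def add_eos_after_periods(text):
    # Naive rule: any period followed by whitespace or end-of-string ends a
    # sentence. Abbreviations such as 'e.g.' would need extra care.
    return re.sub(r'\.(\s+|$)', ' ' + EOS_TOKEN + r'\1', text)

print(add_eos_after_periods('Great phone. Battery lasts two days.'))
# -> 'Great phone <EOS> Battery lasts two days <EOS>'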
9,974 changes: 0 additions & 9,974 deletions abs_test_set_8.csv


11,396 changes: 11,396 additions & 0 deletions abs_test_set_8_all.csv


89,654 changes: 0 additions & 89,654 deletions abs_train_set_8.csv


102,553 changes: 102,553 additions & 0 deletions abs_train_set_8_all.csv


5 changes: 4 additions & 1 deletion config.py
@@ -82,7 +82,7 @@
help='whether to initialize from a pretrained embedding matrix')

parser.add_argument('--cold_start', type=ast.literal_eval,
default=True,
default=False,
help='whether to overwrite existing model_dir')

parser.add_argument('--prepare_embeddings', type=ast.literal_eval,
@@ -93,6 +93,9 @@
default=True,
help='whether embeddings have been preprocessed and dont need to be extracted again for all reviews')

parser.add_argument('--num_products_per_batch', type=int,
default=32,
help='the number of products per batch for ABS; actual batch size will be num_products_per_batch*abs_num_reviews')

args = parser.parse_args()

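The new --num_products_per_batch flag in config.py (default 32) notes in its help text that the actual batch size is num_products_per_batch * abs_num_reviews, because every product contributes a fixed number of reviews. A quick illustration of that arithmetic; the value 8 for abs_num_reviews is only inferred from the _8 suffix on the CSV file names and is not stated in this diff.

# Effective batch size implied by the new flag (illustrative values).
num_products_per_batch = 32   # default of the new --num_products_per_batch flag
abs_num_reviews = 8           # assumed value; see note above

effective_batch_size = num_products_per_batch * abs_num_reviews
print(effective_batch_size)   # 256 review sequences per training step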
53 changes: 37 additions & 16 deletions main_abs.py
@@ -13,10 +13,10 @@
import dill as pickle
import json

from model_data import train_input_fn
from model_data import train_input_fn, prepare_df
from modules import my_model
from config import args
os.environ['CUDA_VISIBLE_DEVICES']= "-1"
os.environ['CUDA_VISIBLE_DEVICES']= "0"

def make_config():
with open('cache/tokenizer.pkl', 'rb') as fi:
@@ -44,48 +44,68 @@ def make_config():
return params


def id_to_text(tokenizer, id_list):
max_lens= [len(seq) if 0 not in seq else seq.index(0) for seq in id_list]
words_list= tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(id_list)])
return words_list


def train_model(classifier, params, train_filename):
features_df, word_ids= prepare_df(asins2use_file= train_filename)
# Train the Model.
if args.debug == True:
maxsteps= None
else:
maxsteps= None
classifier.train(
input_fn=lambda: train_input_fn(asins2use_file= train_filename),
input_fn=lambda: train_input_fn(features_df, word_ids),
steps=maxsteps)


def test_model(classifier, params, test_filename):
features_df, word_ids= prepare_df(asins2use_file= test_filename)
# Test the Model.
predictions = classifier.predict(input_fn=lambda: train_input_fn(asins2use_file= test_filename))
predictions = classifier.predict(input_fn=lambda: train_input_fn(features_df, word_ids))
asin_list, summary_id_list=[], []
ae_ids_list, input_ids_list= [], []
for i, pred_dict in enumerate(predictions):
pdb.set_trace()
if i==0:
pdb.set_trace()
print ("Processing {}".format(i))
if args.debug == True and i > 50:
if args.debug == True and i > 500:
break
elif i == 100:
elif i == 500:
break
asin_list.append(pred_dict['asin'])
asin_list.append(pred_dict['asin'][0].decode())
summary_id_list.append(pred_dict['summary_ids'].tolist())
ae_ids_list.append(pred_dict['ae_word_ids'].tolist())
input_ids_list.append(pred_dict['input_word_ids'].tolist())

pdb.set_trace()
tokenizer= params['tokenizer']
max_lens= [len(seq) if 0 not in seq else seq.index(0) for seq in summary_id_list]
summary_words_list= tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(summary_id_list)])
summary_words_list= id_to_text(tokenizer, summary_id_list)
ae_words_list= [id_to_text(tokenizer, word_ids) for word_ids in ae_ids_list]
input_words_list= [id_to_text(tokenizer, word_ids) for word_ids in input_ids_list]

# out_dict= defaultdict(list)
ddict= defaultdict(list)
out_dict= OrderedDict()
for i, summary in enumerate(summary_words_list):
asin= asin_list[i].decode()
asin= asin_list[i]
out_dict[asin]= {}
out_dict[asin]['summary']= summary
# out_dict['asin'].append(asin_list[i].decode())
# out_dict['summary'].append(summary)
out_dict[asin]['ae_words_list']= ae_words_list[i]
out_dict[asin]['input_words_list']= input_words_list[i]
ddict['asin'].append(asin_list[i])
ddict['summary'].append(summary)
ddict['ae_words_list'].append(ae_words_list[i])
ddict['input_words_list'].append(input_words_list[i])

with open('results/abstractive_summaries.json', 'w') as fo:
json.dump(out_dict, fo, ensure_ascii=False, indent=2)

df= pd.DataFrame(ddict)
df.to_csv('results/abstractive_summaries.csv')


def safe_mkdir(directory):
if not os.path.exists(directory):
@@ -106,16 +126,17 @@ def run_model():
model_config= tf.estimator.RunConfig(model_dir=model_dir,
tf_random_seed=42,
log_step_count_steps=10,
save_checkpoints_steps=10,
save_checkpoints_steps=50,
keep_checkpoint_max=3)
classifier = tf.estimator.Estimator(
model_fn= my_model,
params= params,
config= model_config)

train_model(classifier, params, train_filename)
# train_model(classifier, params, train_filename)
pdb.set_trace()
# test_model(classifier, params, train_filename)
test_model(classifier, params, test_filename)


if args.debug == False:
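The main_abs.py changes move CSV parsing out of the Estimator input function into a single prepare_df call per train/test run, and factor the decoding of predicted id sequences into id_to_text, which truncates each sequence at its first padding id (0) before calling sequences_to_texts. The toy example below isolates that truncation step; it assumes the tokenizer pickled in cache/tokenizer.pkl is a Keras Tokenizer, which the sequences_to_texts call suggests, and the tiny corpus and padding length are invented for the example.

from tensorflow.keras.preprocessing.text import Tokenizer

# Toy tokenizer standing in for the one unpickled from cache/tokenizer.pkl.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(['great battery life', 'screen is sharp'])

# Two zero-padded id sequences, as the model predictions would contain.
id_list = tokenizer.texts_to_sequences(['great battery', 'screen is sharp'])
id_list = [ids + [0] * (5 - len(ids)) for ids in id_list]   # pad to length 5

# Same logic as id_to_text: cut each sequence at its first 0, then decode.
max_lens = [len(seq) if 0 not in seq else seq.index(0) for seq in id_list]
print(tokenizer.sequences_to_texts([ids[:max_lens[i]] for i, ids in enumerate(id_list)]))
# -> ['great battery', 'screen is sharp']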
201 changes: 125 additions & 76 deletions main_cluster.py
@@ -144,27 +144,7 @@ def hub_encoder(features, labels, mode, params):
assert False


def main_preprocess_embeddings2(kwargs):
pdb.set_trace()
db_file= os.path.join(config.DATA_PATH, "embedding_db-{}.s3db".format(args.encoder_name))
conn= sqlite3.connect(db_file)
cur= conn.cursor()
cur.execute("drop table if exists reviews_embeddings;")
cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
"asin VARCHAR(255) PRIMARY KEY NOT NULL, "
"product_embs VARCHAR(255),"
"product_sentences VARCHAR(255),"
"sentence_parent VARCHAR(255))")

classifier = tf.estimator.Estimator(
model_fn= my_model,
params= params,
config= model_config)



preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
encoder= get_encoder()
def get_df_from_db(kwargs):
df= pd.read_csv('df2use_train.csv', encoding='latin1')
df_filt= df[df.num_reviews<=100].reset_index(drop=True)
if kwargs["products"] == "three":
@@ -175,63 +155,42 @@ def main_preprocess_embeddings2(kwargs):
asin_list = asin_list[0:1000]
else:
raise Exception ("Product group not recognized")
# reviews_indexer= SQLLiteIndexer(config.DATA_PATH)
try:
reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
values_to_insert= []
ddict= defaultdict(list)
write= True
for i, row_tup in enumerate(reviews_iterator):
asin, product_reviews= row_tup
product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
# product_sentences, product_embs, sentence_parent= product_reviews, np.random.rand(len(product_reviews)*100, 500), np.random.randint(0,1000, 100*len(product_reviews))
# insert_emb(cur, asin, product_embs.tolist(), product_sentences, sentence_parent)
values_to_insert.append((str(asin), str(product_embs.tolist()), str(product_sentences), str(sentence_parent)))

# ddict['asin'].append(asin)
# ddict['product_embs'].append(product_embs.tolist())
# ddict['product_sentences'].append(product_sentences)
# ddict['sentence_parent'].append(sentence_parent)
logging.info(i)
gc.collect()
reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
ddict= defaultdict(list)
for i, row_tup in enumerate(reviews_iterator):
asin, product_reviews= row_tup
ddict['asin'].append(asin)
ddict['product_reviews'].append(product_reviews)
ret_df= pd.DataFrame(ddict)
return ret_df

if i > 0 and i % 50 == 0:
insert_emb_many(cur, values_to_insert)
# insert_emb_csv(cur, values_to_insert, write)
write= False
ddict= defaultdict(list)
logging.info("Inserted {} total products to embeddings_db".format(i+1))
values_to_insert= []

if i > 0 and i % 500 == 0:
conn.commit()
logging.info("Commited {} total products to embeddings_db".format(i+1))
cur.execute('select count(*) from reviews_embeddings')
logging.info(cur.fetchone())
gc.collect()
sys.stdout.flush()
pdb.set_trace()
insert_emb_many(cur, values_to_insert)
conn.commit()
logging.info("Finished {} total products to embeddings_db".format(i+1))
cur.execute('select count(*) from reviews_embeddings')
logging.info(cur.fetchone())
pdb.set_trace()
test_indexer= SQLLiteEmbeddingsIndexer(args.encoder_name)
ddict= test_indexer[asin]
assert ddict['asin'] == asin
assert ddict['product_sentences'] == product_sentences
assert ddict['sentence_parent'] == sentence_parent
np.testing.assert_equal(ddict['product_embs'], product_embs)
except KeyboardInterrupt:
logging.info("Keyboard interrup, commiting remaining")
conn.commit()
except Exception as e:
logging.info("Error type: {}".format(type(e).__name__))
conn.rollback()
finally:
cur.close()
conn.close()
def encode_reviews(reviews_df, encoder, kwargs):
preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
ddict= defaultdict(list)
for i, row_tup in enumerate(reviews_df.itertuples()):
asin, product_reviews= row_tup.asin, row_tup.product_reviews
product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
ddict['asin'].append(asin)
ddict['product_sentences'].append(product_sentences)
ddict['product_embs'].append(product_embs)
ddict['sentence_parent'].append(sentence_parent)
return ddict


def feed_embeddings_to_db(kwargs, ddict):
db_file= os.path.join(config.DATA_PATH, "embedding_db-{}_1.s3db".format(args.encoder_name))
conn= sqlite3.connect(db_file)
cur= conn.cursor()
cur.execute("drop table if exists reviews_embeddings;")
cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
"asin VARCHAR(255) PRIMARY KEY NOT NULL, "
"product_embs VARCHAR(255),"
"product_sentences VARCHAR(255),"
"sentence_parent VARCHAR(255))")





def main_preprocess_embeddings(kwargs):
@@ -406,3 +365,93 @@ def main(kwargs):
main_preprocess_embeddings(vars(args))
else:
main(vars(args))


# def main_preprocess_embeddings2(kwargs):
# pdb.set_trace()
# db_file= os.path.join(config.DATA_PATH, "embedding_db-{}.s3db".format(args.encoder_name))
# conn= sqlite3.connect(db_file)
# cur= conn.cursor()
# cur.execute("drop table if exists reviews_embeddings;")
# cur.execute("CREATE TABLE IF NOT EXISTS reviews_embeddings ("
# "asin VARCHAR(255) PRIMARY KEY NOT NULL, "
# "product_embs VARCHAR(255),"
# "product_sentences VARCHAR(255),"
# "sentence_parent VARCHAR(255))")

# classifier = tf.estimator.Estimator(
# model_fn= my_model,
# params= params,
# config= model_config)



# preprocess_module= PreprocessEncoder(kwargs['summary_length'], embeddings_preprocessed=False)
# encoder= get_encoder()
# df= pd.read_csv('df2use_train.csv', encoding='latin1')
# df_filt= df[df.num_reviews<=100].reset_index(drop=True)
# if kwargs["products"] == "three":
# asin_list= ['B00008OE43', 'B0007OWASE', 'B000EI0EB8']
# elif kwargs["products"] == "all":
# asin_list= df_filt.asin.tolist()[:]
# # Cap at 1000 products
# asin_list = asin_list[0:1000]
# else:
# raise Exception ("Product group not recognized")
# # reviews_indexer= SQLLiteIndexer(config.DATA_PATH)
# try:
# reviews_iterator= SQLLiteAsinAttrIterator(asin_list)
# values_to_insert= []
# ddict= defaultdict(list)
# write= True
# for i, row_tup in enumerate(reviews_iterator):
# asin, product_reviews= row_tup
# product_sentences, product_embs, sentence_parent= preprocess_module(asin, product_reviews, encoder)
# # product_sentences, product_embs, sentence_parent= product_reviews, np.random.rand(len(product_reviews)*100, 500), np.random.randint(0,1000, 100*len(product_reviews))
# # insert_emb(cur, asin, product_embs.tolist(), product_sentences, sentence_parent)
# values_to_insert.append((str(asin), str(product_embs.tolist()), str(product_sentences), str(sentence_parent)))

# # ddict['asin'].append(asin)
# # ddict['product_embs'].append(product_embs.tolist())
# # ddict['product_sentences'].append(product_sentences)
# # ddict['sentence_parent'].append(sentence_parent)
# logging.info(i)
# gc.collect()

# if i > 0 and i % 50 == 0:
# insert_emb_many(cur, values_to_insert)
# # insert_emb_csv(cur, values_to_insert, write)
# write= False
# ddict= defaultdict(list)
# logging.info("Inserted {} total products to embeddings_db".format(i+1))
# values_to_insert= []

# if i > 0 and i % 500 == 0:
# conn.commit()
# logging.info("Commited {} total products to embeddings_db".format(i+1))
# cur.execute('select count(*) from reviews_embeddings')
# logging.info(cur.fetchone())
# gc.collect()
# sys.stdout.flush()
# pdb.set_trace()
# insert_emb_many(cur, values_to_insert)
# conn.commit()
# logging.info("Finished {} total products to embeddings_db".format(i+1))
# cur.execute('select count(*) from reviews_embeddings')
# logging.info(cur.fetchone())
# pdb.set_trace()
# test_indexer= SQLLiteEmbeddingsIndexer(args.encoder_name)
# ddict= test_indexer[asin]
# assert ddict['asin'] == asin
# assert ddict['product_sentences'] == product_sentences
# assert ddict['sentence_parent'] == sentence_parent
# np.testing.assert_equal(ddict['product_embs'], product_embs)
# except KeyboardInterrupt:
# logging.info("Keyboard interrup, commiting remaining")
# conn.commit()
# except Exception as e:
# logging.info("Error type: {}".format(type(e).__name__))
# conn.rollback()
# finally:
# cur.close()
# conn.close()
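In main_cluster.py the old monolithic main_preprocess_embeddings2 (preserved above only as the commented-out block) is split into smaller helpers: get_df_from_db loads the reviews for the selected ASINs, encode_reviews runs the sentence encoder over them, and feed_embeddings_to_db sets up the reviews_embeddings SQLite table. The rendered hunks do not show how these helpers are wired together, and feed_embeddings_to_db as shown only creates the table, so the driver below is merely a guess at the intended flow; get_encoder() is assumed to exist elsewhere in the file, as the old code used it.

# Hypothetical driver for the refactored helpers (not present in this diff).
def preprocess_embeddings(kwargs):
    reviews_df = get_df_from_db(kwargs)                  # ASINs + raw review text
    encoder = get_encoder()                              # sentence encoder
    ddict = encode_reviews(reviews_df, encoder, kwargs)  # embeddings per product
    feed_embeddings_to_db(kwargs, ddict)                 # create (and presumably populate) the SQLite table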
16 changes: 10 additions & 6 deletions model_data.py
@@ -30,9 +30,9 @@
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
OOV_TOKEN= '<OOV>'
BATCH_SIZE= 16
BATCH_SIZE= args.num_products_per_batch #number of products per batch
NUM_EPOCHS= 5
NUM_REVIEWS_K= args.abs_num_reviews
NUM_REVIEWS_K= args.abs_num_reviews #number of reviews per product


def db_cur_gen(cur):
@@ -114,8 +114,8 @@ def tf_data_gen(K= NUM_REVIEWS_K, maxlen= MAX_SEQUENCE_LENGTH):
return dataset


def train_input_fn(data_path= DATA_PATH, db_name= "reviews.s3db", asins2use_file= "abs_train_set_8.csv"):

def prepare_df(data_path= DATA_PATH, db_name= "reviews.s3db", asins2use_file= "abs_train_set_8.csv"):
with open('cache/tokenizer.pkl', 'rb') as fi:
tokenizer= pickle.load(fi)
review_iterator= TFReviewIterator(data_path, db_name, asins2use_file)
@@ -146,11 +146,15 @@ def tf_data_df(K= NUM_REVIEWS_K, maxlen= MAX_SEQUENCE_LENGTH):
ret_df= pd.DataFrame(ddict)
word_ids= np.vstack(data_batch_list)
return ret_df, word_ids

# pdb.set_trace()

features_df, word_ids= tf_data_df()
print("Features Dataframe shape: {}".format(features_df.shape))
print("Word IDs data batch Dataframe shape: {}".format(word_ids.shape))
return features_df, word_ids


def train_input_fn(features_df, word_ids):
pdb.set_trace()
features= dict(features_df)
features['data_batch']= word_ids
# pdb.set_trace()
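In model_data.py, train_input_fn no longer reads the ASIN CSV itself; that work moves into prepare_df, and the resulting dataframe and padded id matrix are passed in from the caller. Condensed, the training path after this commit reads roughly as below (debug branches and pdb traces dropped); prepare_df and train_input_fn are the project's own symbols, imported as the new main_abs.py does.

from model_data import prepare_df, train_input_fn   # imported as in the new main_abs.py

def train(classifier, train_filename):
    # Heavy CSV/tokenizer work happens once, outside the Estimator's input_fn ...
    features_df, word_ids = prepare_df(asins2use_file=train_filename)
    # ... which now only closes over the precomputed dataframe and id matrix.
    classifier.train(input_fn=lambda: train_input_fn(features_df, word_ids),
                     steps=None)   # None = train until the input dataset is exhausted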