In [18]:
import config_etienne as cf
import sys
import codecs
import random
import time
import imp
import os
import tensorflow.compat.v1 as tf
import numpy as np
import gensim.models as g


In [2]:
from sonnet_model import SonnetModel
from RUN_epoch import run_epoch

In [3]:
from util import *

In [4]:
#constants
pad_symbol = "<pad>"
end_symbol = "<eos>"
unk_symbol = "<unk>"
dummy_symbols = [pad_symbol, end_symbol, unk_symbol]

#globals
wordxid = None
idxword = None
charxid = None
idxchar = None
wordxchar = None #word id to [char ids]
rhyme_thresholds = [0.9, 0.8, 0.7, 0.6]
stress_acc_threshold = 0.4
reset_scale = 1.05


sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

#set the seeds
random.seed(cf.seed)
np.random.seed(cf.seed)

In [5]:
#load word embedding model if given and set word embedding size
if cf.word_embedding_model:
    #print("\nLoading word embedding model...")
    mword = g.Word2Vec.load(cf.word_embedding_model)
    cf.word_embedding_dim= mword.vector_size

In [6]:
#load vocab
print("\n", "First pass to collect word and character vocabulary...")
idxword, wordxid, idxchar, charxid, wordxchar = load_vocab(cf.train_data, cf.word_minfreq, dummy_symbols)
print("\nWord type size =", len(idxword))
print("\nChar type size =", len(idxchar))

#load train and valid data
print("\n Loading train and valid data...")
train_word_data, train_char_data, train_nwords, train_nchars, train_rhyme_data = \
    load_data(cf.train_data, wordxid, idxword, charxid, idxchar, dummy_symbols)
# a rajouter


valid_word_data, valid_char_data, valid_rhyme_data, valid_nwords, valid_nchars = \
    load_data(cf.valid_data, wordxid, idxword, charxid, idxchar, dummy_symbols)
print_stats("\nTrain", train_word_data, train_nwords, train_nchars, train_rhyme_data)
print_stats("\nValid", valid_word_data, valid_rhyme_data, valid_nwords, valid_nchars)

#load test data if it's given
if cf.test_data:
    test_word_data, test_char_data, test_rhyme_data, test_nwords, test_nchars = \
        load_data(cf.test_data, wordxid, idxword, charxid, idxchar, dummy_symbols)
    print_stats("\nTest", test_word_data, test_rhyme_data, test_nwords, test_nchars)



 First pass to collect word and character vocabulary...

Word type size = 8398

Char type size = 77

 Loading train and valid data...

Train statistics:
  Number of documents         = 2685
  Number of rhyme examples    = 32220
  Total number of word tokens = 367281
  Mean/min/max words per line = 9.77/5/16
  Total number of char tokens = 1552659
  Mean/min/max chars per line = 41.31/25/59

Valid statistics:
  Number of documents         = 335
  Number of rhyme examples    = 4020
  Total number of word tokens = 45922
  Mean/min/max words per line = 9.79/6/16
  Total number of char tokens = 193929
  Mean/min/max chars per line = 41.35/28/58

Test statistics:
  Number of documents         = 335
  Number of rhyme examples    = 4020
  Total number of word tokens = 45904
  Mean/min/max words per line = 9.79/6/15
  Total number of char tokens = 193851
  Mean/min/max chars per line = 41.33/24/57


# Appliquons notre word2vec à ça 

In [7]:
if cf.word_embedding_model:
            word_emb = init_embedding(mword, idxword)

In [8]:
word_emb

array([[ 4.88135039e-04,  2.15189366e-03,  1.02763376e-03, ...,
        -4.79892454e-03,  3.28940029e-03, -4.95304524e-03],
       [ 6.85335398e-02, -1.17467120e-01,  8.59429240e-02, ...,
         8.32471997e-02, -4.60354239e-02,  9.11140069e-02],
       [ 1.77816537e-03, -2.29992027e-03,  2.35194022e-03, ...,
        -2.45643518e-03, -4.41970840e-03, -6.55833744e-04],
       ...,
       [-4.57953304e-01, -3.02415699e-01,  8.37981403e-02, ...,
        -1.64184749e-01,  5.43451726e-01, -7.18330026e-01],
       [-4.62094545e-01, -6.43977761e-01, -1.61367968e-01, ...,
        -2.93693751e-01,  1.80092514e-01, -1.05382828e-02],
       [ 3.86683221e-03,  3.30908798e-03, -4.68394557e-03, ...,
         2.24330979e-03, -3.08428987e-03,  1.62174234e-03]])

In [9]:
word_emb.shape

(8398, 100)

voir les mots les plus proches

In [10]:
word_emb_dict = dict(zip(idxword, word_emb))

In [11]:
from collections import OrderedDict

def closest_word(word):
    
    dico = {}
    
    def cosinus_similarity(word1, word2):
        score =  np.dot(word_emb_dict[word1], word_emb_dict[word2])/(np.linalg.norm(word_emb_dict [word1])* \
                                                                         np.linalg.norm(word_emb_dict [word2]))
        return {word2 : score}
    
    
    for ele in word_emb_dict.keys():
        dico.update(cosinus_similarity(word, ele))
    
    sorted_dico = sorted(dico.items(), key=lambda kv: kv[1], reverse=True)
        
    return OrderedDict(sorted_dico)

In [12]:
closest_word('never')

OrderedDict([('never', 1.0000000000000002),
             ('but', 0.8872969579076875),
             ('be', 0.8605657911692786),
             ('ever', 0.8586669032095533),
             ('only', 0.855969927570251),
             ('i', 0.8529641917978267),
             ('have', 0.8514867820190137),
             ('though', 0.8422111132862984),
             ('if', 0.8349231537072904),
             ('no', 0.8319912872366985),
             ('would', 0.8272255179027386),
             ('for', 0.8257566329991046),
             ('not', 0.8251743488821022),
             ('it', 0.8191344258633185),
             ('that', 0.8131676557411432),
             ('how', 0.8095799999441514),
             ('can', 0.8029688803312166),
             ('think', 0.7972848102320682),
             ('yet', 0.7958998979159211),
             ('so', 0.79547931998751),
             ('should', 0.7886724855119821),
             ('to', 0.7875519964498003),
             ('could', 0.7862320806658817),
             ('more', 0.780

## maintenant faut faire un language model

In [13]:
create_word_batch(data=train_word_data, batch_size=32, lines_per_doc=14, nlines_per_batch=2, pad_symbol=pad_symbol,\
                  end_symbol=end_symbol, unk_symbol=unk_symbol, shuffle_data=True)[0]


#ça ça marche

([['<eos>',
   308,
   7,
   60,
   25,
   447,
   19,
   3,
   1044,
   7,
   4090,
   5,
   1,
   18,
   298,
   983,
   3,
   774,
   13,
   6549,
   30,
   '<pad>',
   '<pad>',
   '<pad>'],
  ['<eos>',
   215,
   206,
   172,
   11,
   37,
   2428,
   172,
   206,
   11,
   1,
   46,
   54,
   3,
   35,
   13,
   3,
   694,
   5,
   112,
   3,
   1262,
   30,
   '<pad>'],
  ['<eos>',
   18,
   183,
   7,
   165,
   72,
   440,
   16,
   637,
   5,
   1,
   308,
   451,
   14,
   555,
   4,
   27,
   128,
   3,
   711,
   8,
   '<pad>',
   '<pad>',
   '<pad>'],
  ['<eos>',
   3126,
   203,
   1228,
   493,
   2,
   6,
   2,
   9,
   1,
   6006,
   120,
   8,
   2,
   4,
   242,
   11,
   3,
   186,
   5,
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>'],
  ['<eos>',
   176,
   18,
   54,
   71,
   509,
   7,
   7643,
   4,
   1,
   124,
   6260,
   40,
   82,
   75,
   59,
   89,
   201,
   5,
   1750,
   392,
   11,
   '<pad>',
   '<pad>'],
  ['<eos>',
   3642,
   66,
   28,
   6129,

## Le modèle

In [14]:
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

In [15]:
with tf.Graph().as_default(), tf.Session() as sess:

        tf.set_random_seed(cf.seed)

        with tf.variable_scope("model", reuse=None):
            mtrain = SonnetModel(True, cf.batch_size, len(idxword), len(idxchar),
                charxid[" "], charxid[pad_symbol], cf)
        with tf.variable_scope("model", reuse=True):
            mvalid = SonnetModel(False, cf.batch_size, len(idxword), len(idxchar),
                charxid[" "], charxid[pad_symbol], cf)
        with tf.variable_scope("model", reuse=True):
            mgen = SonnetModel(False, 1, len(idxword), len(idxchar), charxid[" "], charxid[pad_symbol], cf)

        tf.global_variables_initializer().run()

        #initialise word embedding
        if cf.word_embedding_model:
            word_emb = init_embedding(mword, idxword)
            sess.run(mtrain.word_embedding.assign(word_emb))
            

        if cf.save_model:
            if not os.path.exists(cf.output_dir):
                os.makedirs(cf.output_dir)
            #create saver object to save model
            saver = tf.compat.v1.train.Saver(max_to_keep=0)
            

        #train model
        prev_lm_loss = None 
        
        for i in range(cf.epoch_size):

            print("\nEpoch =", i+1)

            #create batches for language model
            train_word_batch = create_word_batch(train_word_data, cf.batch_size,
                cf.doc_lines, cf.bptt_truncate, wordxid[pad_symbol], wordxid[end_symbol], wordxid[unk_symbol], True)
            
            valid_word_batch = create_word_batch(valid_word_data, cf.batch_size,
                cf.doc_lines, cf.bptt_truncate, wordxid[pad_symbol], wordxid[end_symbol], wordxid[unk_symbol], False)

            #train an epoch
            _ = run_epoch(sess, train_word_batch, mtrain, "TRAIN", True)
            lm_loss = run_epoch(sess, valid_word_batch, mvalid, "VALID", False)

            #create batch for test model and run an epoch if it's given
            if cf.test_data:
                test_word_batch = create_word_batch(test_word_data, cf.batch_size,
                    cf.doc_lines, cf.bptt_truncate, wordxid[pad_symbol], wordxid[end_symbol], wordxid[unk_symbol], False)
                run_epoch(sess, test_word_batch, mvalid, "TEST", False)
                
          
            #We save
            if cf.save_model:
                    if prev_lm_loss == None  or lm_loss <= prev_lm_loss : #or not train_lm :
                        saver.save(sess, os.path.join(cf.output_dir, "model.ckpt"))
                        prev_lm_loss = lm_loss

                    else:
                        saver.restore(sess, os.path.join(cf.output_dir, "model.ckpt"))
                        print("New valid performance is worse; restoring previous parameters...")
                        print("  lm loss: %.5f --> %.5f" % (prev_lm_loss, lm_loss))
                
            


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


AttributeError: module 'tensorflow_core._api.v2.nn' has no attribute 'dynamic_rnn'

## Génération de poèmes mtn

In [16]:
import os
import sys
import random
import codecs
import numpy as np
import tensorflow as tf
from collections import namedtuple
from sonnet_model import SonnetModel
from nltk.corpus import stopwords as nltk_stopwords
from util import *


#constants
seed =  2
num_samples = 1
#save_pickle = os.path(output_dir)
temp_min = 0.6
temp_max = 0.8
sent_sample = 10
verbose = False
pad_symbol = "<pad>"
end_symbol = "<eos>"
unk_symbol = "<unk>"
dummy_symbols = [pad_symbol, end_symbol, unk_symbol]
custom_stopwords = [ "thee", "thou", "thy", "'d", "'s", "'ll", "must", "shall" ]

###########
#functions#
###########

def reverse_dic(idxvocab):
    vocabxid = {}
    for vi, v in enumerate(idxvocab):
        vocabxid[v] = vi

    return vocabxid

######
#main#
######

def main():

    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    #set the seeds
    random.seed(seed)
    np.random.seed(seed)


    #symbols to avoid for generation
    avoid_symbols = ["(", ")", "“", "‘", "”", "’", "[", "]"]
    avoid_symbols = [ wordxid[item] for item in avoid_symbols ]
    stopwords = set([ wordxid[item] for item in (nltk_stopwords.words("english") + custom_stopwords) if item in wordxid ])

    quatrains = []
    #initialise and load model parameters
    with tf.Graph().as_default(), tf.Session() as sess:
        tf.set_random_seed(seed)

        with tf.variable_scope("model", reuse=None):
            mtest = SonnetModel(False, cf.batch_size, len(idxword), len(idxchar), charxid[" "], charxid[pad_symbol], cf)

        with tf.variable_scope("model", reuse=True):
            mgen = SonnetModel(False, 1, len(idxword), len(idxchar), charxid[" "], charxid[pad_symbol], cf)

        #load tensorflow model
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(cf.output_dir, "model.ckpt"))

        #quatrain generation
        for _ in range(num_samples):

            #generate some random sentences
            #print("\nTemperature =", temp_min, "-", temp_max)
            
            
            q, probs = mgen.generate(sess, idxword, wordxid[pad_symbol],
                wordxid[end_symbol], wordxid[unk_symbol], charxid[" "], avoid_symbols, stopwords,
                temp_min, temp_max, 12, 400, sent_sample, verbose)
                
            
            quatrains.append(q)
            
        return quatrains


In [19]:
main()

AttributeError: module 'tensorflow_core._api.v2.nn' has no attribute 'dynamic_rnn'