In [1]:
import os
import numpy as np
import re
import shutil
import tensorflow as tf
DATA_DIR = './data'
CHEKPOINT_DIR = os.path.join(DATA_DIR,"chekpoints")


In [3]:
def download_and_read(urls):
    texts = []
    for i, url in enumerate(urls):
        p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url,cache_dir=".")
        text = open(p,"r", encoding="utf8").read()
        text = text.replace("\ufeff","")
        text = text.replace("\n","")
        text = re.sub(r'\s+',"",text)
        texts.extend(text)
        
    return texts
texts = download_and_read([
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
    "https://www.gutenberg.org/files/12/12-0.txt"
])
        

Downloading data from https://www.gutenberg.org/files/12/12-0.txt


In [5]:
vocab = sorted(set(texts))
print("vocab size: {:d}".format(len(vocab)))

char2idx = {c:i for i,c in enumerate(vocab)}
idx2char = {i:c for c,i in char2idx.items()}

vocab size: 92


In [8]:
texts_as_ints = np.array([char2idx[c] for c in texts], )
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

In [14]:
seq_length = 100
sequences = data.batch(seq_length+1,drop_remainder=True)
def split_train_labels(sequence,):
    input_seq = sequence[0:-1]
    output_seq = sequence[1:]
    return input_seq,output_seq

sequences = sequences.map(split_train_labels)

batch_size = 64
steps_per_epoch = len(texts) // seq_length // batch_size

dataset = sequences.shuffle(10000).batch(batch_size, drop_remainder=True)
    

In [15]:
sequences

<MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int32, name=None), TensorSpec(shape=(100,), dtype=tf.int32, name=None))>

In [16]:
class CharGenModel(tf.keras.Model):
    def __init__(self,vocab_size,num_timesteps,embedding_dim,**kwargs):
        super(CharGenModel, self).__init__(**kwargs)
        self.embedding_layer = tf.keras.layers.Embedding(
            vocab_size, embedding_dim
        )
        self.rnn_layer = tf.keras.layers.GRU(
            num_timesteps,
            recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid",
            stateful=True,
            return_sequences=True
        )
        self.dense_layer = tf.keras.layers.Dense(vocab_size)
    
    def call(self,x):
        x = self.embedding_layer(x)
        x = self.rnn_layer(x)
        x = self.dense_layer(x)
        return x
    
vocab_size = len(vocab)
embedding_dim = 256

model = CharGenModel(vocab_size,seq_length,embedding_dim)
model.build(input_shape=(batch_size,seq_length))

def loss(labels,predictions):
    return tf.losses.sparse_categorical_crossentropy(
        labels,
        predictions,from_logits=True
    )

model.compile(optimizer=tf.optimizers.Adam(),loss=loss)

In [17]:
def generate_text(model,prefix_string,char2idx,idx2char,num_chars_to_generate=1000,temperature=1.0):
    input = [char2idx[s] for s in prefix_string]
    input = tf.expand_dims(input,0)
    
    text_generated = []
    model.reset_states()
    
    for i in range(num_chars_to_generate):
        preds = model(input)
        preds = tf.squeeze(preds,0)/temperature
        pred_id = tf.random.categorical(preds,num_samples=1)[-1,0].numpy()
        text_generated.append(idx2char[pred_id])
        input = tf.expand_dims([pred_id],0)
    return prefix_string+"".join(text_generated)



In [20]:
num_epochs = 150
for i in range(num_epochs//10):
    model.fit(
        dataset.repeat(),
        epochs = 10,
        steps_per_epoch=steps_per_epoch
    )
    checkpoint_file = os.path.join(CHEKPOINT_DIR,"model_epoch_{:d}".format(i+1))
    model.save_weights(checkpoint_file)
    gen_model = CharGenModel(vocab_size,seq_length,embedding_dim)
    gen_model.load_weights(checkpoint_file)
    gen_model.build(input_shape=(1,seq_length))
    print("After epoch: {:d} ".format(i+1)*10)
    print(generate_text(gen_model,"Alice",char2idx,idx2char))
    print("---")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 After epoch: 1 
Alicescameagain.“You’relajectGutenberg™perforgertothedon’tbegivetheeptakeedagain?Ifyouuse!youreactryinamoundinthehandsnidectedthesockthakinghishemdaquessmyatthegarmshepailinstattodallsomehampthurlyatwiththebrown,allthepart.1.E.voicewoundsay."Youmustbixilenchshehadnotchivequitesaidofaysoormiddle.Doyouw,andwas,we_hedobeonecansolosetistGutenhadfeelingofherchesking,andmy_!"criedAllbelittlealittlemany-trimplikedownsoIcouldkiedlookpillusoot;andafitwasheadthatpartilformind_kno,nomaybesitsattheDropedbythisslygotinthalsardstreamingyour-pingmaricwasor_happhanfortherrese,forhisHelatterwnottitedagain,theywouldcall:sheitupleaustheseeactsselfAverourwouldersomeofit!Oh!’HumptyDumptygots!"butyou_thanquitepaininamistofinstandmourtonthanintherewas

In [1]:
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score , confusion_matrix


In [11]:
def download_and_read(url):
    # local_file = url.split('/')[-1]
    # local_file = local_file.replace('%20',' ')
    # p = tf.keras.utils.get_file(local_file, origin=url,extract=True,cache_dir=".")
    # local_folder = os.path.join("datasets",local_file.split('.')[0])
    local_folder = 'datasets/sentiment labelled sentences'
    labeled_sentences = []
    
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith('_labelled.txt'):
            with open(os.path.join(local_folder,labeled_filename),"r") as f:
                for line in f:
                    sentence,label = line.strip().split('\t')
                    labeled_sentences.append((sentence,label))
                    
    return labeled_sentences

labeled_sentences = download_and_read("https://archive.ics.uci.edu/ml/machine-learning-databases/"+
                                      "00331/sentiment%20labelled%20sentences.zip")

sentences = [s for (s,l) in labeled_sentences]
labels = [int(l) for (s,l) in labeled_sentences]

In [13]:
sentences

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.',
 'And the sound quality is great.',
 'He was very impressed when going from the original battery to the extended battery.',
 'If the two were seperated by a mere 5+ ft I started to notice excessive static and garbled sound from the headset.',
 'Very good quality though',
 'The design is very odd, as the ear "clip" is not very comfortable at all.',
 'Highly recommend for any one who has a blue tooth phone.',
 'I advise EVERYO