In [3]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import os

class RNN_ENCODER(nn.Module):
    def __init__(self,
                 ntoken,
                 ninput=300,
                 drop_prob=0.5,
                 nhidden=128,
                 nlayers=1,
                 bidirectional=True):
        super(RNN_ENCODER, self).__init__()
        self.n_steps = 100
        self.ntoken = ntoken
        self.ninput = ninput
        self.drop_prob = drop_prob
        self.nlayers = nlayers
        self.bidirectional = bidirectional
        self.rnn_type = "LSTM"
        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        self.nhidden = nhidden // self.num_directions

        self.define_module()
        self.init_weights()

    def define_module(self):
        self.encoder = nn.Embedding(self.ntoken, self.ninput)
        self.drop = nn.Dropout(self.drop_prob)
        if self.rnn_type == 'LSTM':
            self.rnn = nn.LSTM(self.ninput,
                               self.nhidden,
                               self.nlayers,
                               batch_first=True,
                               dropout=self.drop_prob,
                               bidirectional=self.bidirectional)
        else:
            raise NotImplementedError("Not implemented .")

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(
                weight.new(self.nlayers * self.num_directions, bsz,
                           self.nhidden).zero_()),
                    Variable(
                        weight.new(self.nlayers * self.num_directions, bsz,
                                   self.nhidden).zero_()))
        else:
            raise NotImplementedError("Not implemented.")

    def forward(self, captions, cap_lens, hidden, mask=None):
        try:
            emb = self.drop(self.encoder(captions))
            cap_lens = cap_lens.data.tolist()
            emb = pack_padded_sequence(emb, cap_lens, batch_first=True)
            output, hidden = self.rnn(emb, hidden)
            output = pad_packed_sequence(output, batch_first=True)[0]
            words_emb = output.transpose(1, 2)
            if self.rnn_type == 'LSTM':
                sent_emb = hidden[0].transpose(0, 1).contiguous()
            else:
                sent_emb = hidden.transpose(0, 1).contiguous()
            sent_emb = sent_emb.view(-1, self.nhidden * self.num_directions)
            return words_emb, sent_emb
        except Exception as e:
            print(f"An error occurred in the forward pass: {e}")
            return None, None


In [4]:
from nltk.tokenize import RegexpTokenizer
import torch

class TextProcessor:
    def __init__(self, rnn_encoder):
        self.vocabulary = None
        self.max_seq_length = 30  # Adjust this based on your dataset
        self.rnn_encoder = rnn_encoder

    def tokenize(self, input_text):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(input_text.lower())
        return tokens

    def build_vocabulary(self, all_captions):
        all_words = [word for caption in all_captions for word in caption]
        unique_words = list(set(all_words))
        self.vocabulary = {word: idx + 1 for idx, word in enumerate(unique_words)}
        self.vocabulary['<PAD>'] = 0
        self.reverse_vocabulary = {idx: word for word, idx in self.vocabulary.items()}

    def convert_to_indices(self, tokens):
        indices = [self.vocabulary.get(word, self.vocabulary['<PAD>']) for word in tokens]
        if len(indices) < self.max_seq_length:
            indices.extend([self.vocabulary['<PAD>']] * (self.max_seq_length - len(indices)))
        else:
            indices = indices[:self.max_seq_length]
        return indices

    def get_embeddings(self, indices):
        try:
            # Convert indices to PyTorch tensor
            indices_tensor = torch.tensor(indices).unsqueeze(0)  # Add batch dimension
            # Initialize hidden state
            hidden = self.rnn_encoder.init_hidden(1)

            # Forward pass through the RNN_ENCODER
            words_emb, sent_emb = self.rnn_encoder(indices_tensor, torch.tensor([len(indices)]), hidden)

            return words_emb
        except Exception as e:
            print(f"An error occurred in get_embeddings: {e}")
            return None

    def print_vocabulary(self):
        if self.vocabulary is not None:
            print("Total Vocabulary Size:", len(self.vocabulary))
            print("Actual Vocabulary:")
            for word, index in self.vocabulary.items():
                print(f"{word}: {index}")
        else:
            print("Vocabulary is not built yet. Call build_vocabulary method first.")


In [5]:
class TextFileLoader:
    def __init__(self, folder_path, rnn_encoder, text_processor):
        self.folder_path = folder_path
        self.rnn_encoder = rnn_encoder
        self.text_processor = text_processor  # obj of text processor class+

    def load_text_files(self):
        try:
            if os.path.exists(self.folder_path) and os.path.isdir(self.folder_path):  # checking the path
                text_files = [file for file in os.listdir(self.folder_path) if file.endswith(".txt")]  # list comprehension
                all_captions = []
                for file_name in text_files:
                    file_path = os.path.join(self.folder_path, file_name)
                    with open(file_path, "r") as file:
                        file_content = file.read()
                    tokens = self.text_processor.tokenize(file_content)
                    all_captions.append(tokens)

                self.text_processor.build_vocabulary(all_captions)

                for tokens in all_captions:
                    indices = self.text_processor.convert_to_indices(tokens)
                    embeddings = self.text_processor.get_embeddings(indices)
                    print("Tokens:", tokens)
                    print("Embeddings:", embeddings)
                    print("-" * 50)

            else:
                print(f"The folder path '{self.folder_path}' is not valid.")
        except Exception as e:
            print(f"An error occurred in load_text_files: {e}")

# Example usage:
folder_path ="/media/osama/26D25BC6D25B993F1/Seven Semester/Fyp/LAGAN-main/data/sketches/sketches/text"
rnn_encoder = RNN_ENCODER(ntoken=53, ninput=5)
text_processor = TextProcessor(rnn_encoder)
loader = TextFileLoader(folder_path, rnn_encoder, text_processor)
loader.load_text_files()




Tokens: ['this', 'woman', 'has', 'a', 'rectangular', 'face', 'with', 'long', 'hair', 'she', 'has', 'a', 'pair', 'of', 'big', 'wide', 'eyes', 'with', 'dense', 'thin', 'and', 'flat', 'eyebrows', 'her', 'mouth', 'is', 'thick', 'and', 'wide', 'with', 'a', 'big', 'long', 'nose', 'and', 'her', 'ears', 'are', 'normal', 'she', 'hasn', 't', 'glasses', 'and', 'hasn', 't', 'beard']
Embeddings: tensor([[[ 0.0020,  0.0033,  0.0107,  ...,  0.0091,  0.0090,  0.0051],
         [-0.0136, -0.0167, -0.0205,  ..., -0.0089, -0.0074, -0.0147],
         [ 0.0204,  0.0319,  0.0392,  ...,  0.0604,  0.0540,  0.0530],
         ...,
         [-0.0714, -0.0693, -0.0662,  ..., -0.0610, -0.0517, -0.0309],
         [-0.1057, -0.1066, -0.1074,  ..., -0.0786, -0.0645, -0.0400],
         [-0.0144, -0.0175, -0.0194,  ..., -0.0204, -0.0158, -0.0100]]],
       grad_fn=<TransposeBackward0>)
--------------------------------------------------
Tokens: ['this', 'woman', 'has', 'an', 'inverted', 'triangle', 'face', 'with', 'shor

In [30]:
text_processor.print_vocabulary()

Total Vocabulary Size: 53
Actual Vocabulary:
wide: 1
oval: 2
flat: 3
hasn: 4
his: 5
thick: 6
heart: 7
down: 8
round: 9
an: 10
long: 11
ears: 12
big: 13
glasses: 14
short: 15
diamond: 16
small: 17
nose: 18
narrow: 19
up: 20
with: 21
pair: 22
of: 23
normal: 24
rectangular: 25
inverted: 26
arched: 27
beard: 28
he: 29
thin: 30
is: 31
dense: 32
man: 33
a: 34
mouth: 35
medium: 36
has: 37
t: 38
square: 39
are: 40
eyebrows: 41
face: 42
and: 43
woman: 44
she: 45
eyes: 46
this: 47
hair: 48
her: 49
triangle: 50
sparse: 51
triangular: 52
<PAD>: 0
