In [30]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import os
import warnings
warnings.filterwarnings('ignore')

In [63]:
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN_ENCODER(nn.Module):
    def __init__(self,
                 ntoken,
                 ninput=300,
                 drop_prob=0.5,
                 nhidden=5,  # number of embeddings
                 nlayers=1,
                 bidirectional=True):
        super(RNN_ENCODER, self).__init__()
        self.n_steps = 100
        self.ntoken = ntoken
        self.ninput = ninput
        self.drop_prob = drop_prob
        self.nlayers = nlayers
        self.bidirectional = bidirectional
        self.rnn_type = "LSTM"
        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        self.nhidden = nhidden // self.num_directions

        self.define_module()
        self.init_weights()

    def define_module(self):
        self.encoder = nn.Embedding(self.ntoken, self.ninput)
        self.drop = nn.Dropout(self.drop_prob)
        self.rnn = nn.LSTM(self.ninput,
                           self.nhidden,
                           self.nlayers,
                           batch_first=True,
                           dropout=self.drop_prob,
                           bidirectional=self.bidirectional)

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

        def init_hidden(self, bsz):
            weight = next(self.parameters()).data
            return (Variable(
                weight.new(self.nlayers * self.num_directions, bsz,
                           self.nhidden).zero_()),
                    Variable(
                        weight.new(self.nlayers * self.num_directions, bsz,
                                   self.nhidden).zero_()))

    def forward(self, captions, cap_lens, hidden, mask=None):
        emb = self.drop(self.encoder(captions))
        cap_lens = cap_lens.data.tolist()
        emb = pack_padded_sequence(emb, cap_lens, batch_first=True)
        output, hidden = self.rnn(emb, hidden)
        output = pad_packed_sequence(output, batch_first=True)[0]
        words_emb = output.transpose(1, 2)
        sent_emb = hidden[0].transpose(0, 1).contiguous()
        sent_emb = sent_emb.view(-1, self.nhidden * self.num_directions)
        return words_emb, sent_emb


In [75]:
class TextProcessor:
    def __init__(self, rnn_encoder):
        self.vocabulary = None
        self.max_seq_length = 100  
        self.rnn_encoder = rnn_encoder

    def tokenize(self, input_text):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(input_text.lower())
        return tokens

    def build_vocabulary(self, all_captions):
        all_words = [word for caption in all_captions for word in caption]
n           self.vocabulary = {word: idx + 1 for idx, word in enumerate(unique_words)}
        self.vocabulary['<PAD>'] = 0
        self.reverse_vocabulary = {idx: word for word, idx in self.vocabulary.items()}

    def convert_to_indices(self, tokens):
        indices = [self.vocabulary.get(word, self.vocabulary['<PAD>']) for word in tokens]
        if len(indices) < self.max_seq_length:
            indices.extend([self.vocabulary['<PAD>']] * (self.max_seq_length - len(indices)))
        else:
            indices = indices[:self.max_seq_length]
        return indices

    def get_embeddings(self, indices):
        # Convert indices to PyTorch tensor
        indices_tensor = torch.tensor(indices).unsqueeze(0)  # Add batch dimension
        # Initialize hidden state
        hidden = self.rnn_encoder.init_hidden(1)

        # Forward pass through the RNN_ENCODER
        words_emb, sent_emb = self.rnn_encoder(indices_tensor, torch.tensor([len(indices)]), hidden)

        return sent_emb

    def print_vocabulary(self):
        if self.vocabulary is not None:
            print("Total Vocabulary Size:", len(self.vocabulary))
            print("Actual Vocabulary:")
            for word, index in self.vocabulary.items():
                print(f"{word}: {index}")
        else:
            print("Vocabulary is not built yet. Call build_vocabulary method first.")


In [None]:
class TextFileLoader:
    def __init__(self, folder_path, rnn_encoder, text_processor):
        self.folder_path = folder_path
        self.rnn_encoder = rnn_encoder
        self.text_processor = text_processor

    def load_text_files(self):
        try:
            if os.path.exists(self.folder_path) and os.path.isdir(self.folder_path):
                text_files = [file for file in os.listdir(self.folder_path) if file.endswith(".txt")]
                all_captions = []
                for file_name in text_files:
                    file_path = os.path.join(self.folder_path, file_name)
                    try:
                        with open(file_path, "r", encoding="utf-8") as file:
                            file_content = file.read()
                        tokens = self.text_processor.tokenize(file_content)
                        all_captions.append(tokens)
                    except Exception as file_error:
                        print(f"Error reading file {file_name}: {file_error}")

                if all_captions:
                    self.text_processor.build_vocabulary(all_captions)

                    for tokens in all_captions:
                        indices = self.text_processor.convert_to_indices(tokens)
                        embeddings = self.text_processor.get_embeddings(indices)
                        num_embedding_vectors = embeddings.size(0)  # Number of embedding vectors
                        num_embeddings = embeddings.size(1)  # Number of embeddings per vector
                        print("Tokens:", tokens)
                        print("Embeddings:", embeddings)
                        print("Number of Embedding Vectors:", num_embedding_vectors)
                        print("Number of Embeddings per Vector:", num_embeddings)
                        print("-" * 50)
                else:
                    print("No valid text files found in the specified folder.")
            else:
                print(f"The folder path '{self.folder_path}' is not valid.")
        except Exception as e:
            print(f"An error occurred in load_text_files: {e}")

# Example usage:
folder_path = r"C:\Users\Laptop Land\dataset"
rnn_encoder = RNN_ENCODER(ntoken=53, nhidden=128)
text_processor = TextProcessor(rnn_encoder)
loader = TextFileLoader(folder_path, rnn_encoder, text_processor)
loader.load_text_files()


Tokens: ['this', 'man', 'has', 'a', 'round', 'face', 'with', 'medium', 'hair', 'he', 'has', 'a', 'pair', 'of', 'medium', 'normal', 'eyes', 'with', 'sparse', 'thin', 'and', 'flat', 'eyebrows', 'his', 'mouth', 'is', 'thin', 'and', 'wide', 'with', 'a', 'medium', 'long', 'nose', 'and', 'his', 'ears', 'are', 'big', 'he', 'has', 'glasses', 'and', 'hasn', 't', 'beard']
Embeddings: tensor([[ 0.0858,  0.0198, -0.0457, -0.0384,  0.0449, -0.1049, -0.0555, -0.0746,
          0.0528,  0.0116,  0.1010, -0.0085, -0.0262,  0.0066,  0.0078, -0.0472,
          0.0119, -0.0920,  0.0581,  0.1046, -0.0958, -0.1544,  0.0044, -0.0407,
         -0.0918,  0.0771,  0.1121,  0.0028, -0.0042, -0.0706,  0.0897, -0.0241,
          0.0541, -0.0849, -0.1179,  0.1272, -0.0066,  0.0585,  0.0961, -0.0401,
         -0.1447,  0.0233,  0.0262, -0.0161, -0.0312,  0.0898, -0.0132, -0.0596,
         -0.0586, -0.0129,  0.0189, -0.0350, -0.0821, -0.0053, -0.0256,  0.0238,
         -0.0267,  0.1039, -0.0168, -0.0716,  0.0156,  0

In [44]:
text_processor.print_vocabulary()

Total Vocabulary Size: 53
Actual Vocabulary:
an: 1
t: 2
has: 3
round: 4
heart: 5
big: 6
is: 7
are: 8
normal: 9
small: 10
flat: 11
arched: 12
this: 13
triangle: 14
short: 15
sparse: 16
nose: 17
rectangular: 18
and: 19
ears: 20
dense: 21
thick: 22
of: 23
narrow: 24
square: 25
triangular: 26
eyebrows: 27
pair: 28
woman: 29
mouth: 30
he: 31
thin: 32
she: 33
her: 34
a: 35
inverted: 36
up: 37
his: 38
man: 39
long: 40
with: 41
eyes: 42
hasn: 43
beard: 44
face: 45
diamond: 46
glasses: 47
oval: 48
medium: 49
hair: 50
down: 51
wide: 52
<PAD>: 0


Tokens: ['this', 'man', 'has', 'a', 'oval', 'face', 'with', 'short', 'hair', 'he', 'has', 'a', 'pair', 'of', 'medium', 'normal', 'eyes', 'with', 'dense', 'thick', 'and', 'arched', 'eyebrows', 'his', 'mouth', 'is', 'thick', 'and', 'wide', 'with', 'a', 'medium', 'normal', 'nose', 'and', 'his', 'ears', 'are', 'normal', 'he', 'hasn', 't', 'glasses', 'and', 'has', 'beard']
Embeddings: tensor([[-0.0386,  0.0549,  0.0872,  0.0673, -0.0150,  0.0265,  0.1076,  0.0659,
         -0.0053, -0.0420, -0.0215,  0.0587, -0.0893, -0.0266,  0.0249,  0.0589,
          0.0099, -0.0159, -0.1148, -0.0167, -0.0291,  0.0883, -0.0238,  0.0712,
          0.0218,  0.0136, -0.0684,  0.0204,  0.0041,  0.0235, -0.0612, -0.0484,
          0.0290, -0.0707, -0.0807, -0.0459,  0.0226,  0.0123, -0.0454,  0.0904,
          0.0999, -0.0563, -0.0851,  0.0255, -0.0090, -0.0532,  0.0345,  0.0310,
         -0.0508, -0.0384, -0.0235, -0.0646,  0.0239,  0.1215,  0.0533,  0.0632,
          0.0475, -0.0436,  0.0161,  0.0503,  0.02