In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import random

# Reproducibility: seed every RNG used in this notebook (Python's `random`,
# NumPy, PyTorch) so that a fresh "Restart & Run All" repeats the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load data
df = pd.read_csv('/content/drive/MyDrive/names.csv')
# Drop missing entries *before* the cast: astype(str) would otherwise turn NaN
# into the literal string "nan", which would then be learned as a real name.
names = df['Name'].dropna().astype(str).str.lower().values
if len(names) == 0:
    raise ValueError("The 'Name' column contains no usable names")

# Character mapping: index 0 is reserved for padding, real characters start at 1
chars = sorted(set(''.join(names)))
char_to_int = {c: i for i, c in enumerate(chars, start=1)}
char_to_int['<PAD>'] = 0  # Padding token
int_to_char = {i: c for c, i in char_to_int.items()}

# Parameters
max_length = max(len(name) for name in names)  # longest name, in characters
vocab_size = len(char_to_int)                  # distinct characters + the padding token

# Custom Dataset
class NamesDataset(Dataset):
    """Next-character prediction samples built from a collection of names.

    Each item is a pair ``(x, y)`` of ``LongTensor``s of length ``max_length``:

    * ``x`` holds the character indices of the name, left-padded with PAD.
    * ``y`` holds, for every position of ``x``, the index of the character
      that follows it. The target after the last character is PAD, so PAD
      doubles as the end-of-name signal; padded positions also target PAD.

    Example with ``max_length=4`` and ``a=1, b=2, c=3``: the name ``"abc"``
    yields ``x = [0, 1, 2, 3]`` and ``y = [0, 2, 3, 0]``.

    Args:
        names: indexable collection of strings.
        char_to_int: mapping from character to integer index.
        max_length: number of time steps in every returned tensor.

    Raises:
        KeyError: from ``__getitem__`` if a name contains a character that is
            missing from ``char_to_int``.
    """

    PAD_IDX = 0  # index reserved for the padding token

    def __init__(self, names, char_to_int, max_length):
        self.names = names
        self.char_to_int = char_to_int
        self.max_length = max_length

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        # Clip over-long names so every sample has exactly max_length steps;
        # tensors of differing length cannot be stacked into a batch.
        name = self.names[idx][:self.max_length]
        encoded = [self.char_to_int[c] for c in name]

        # Shift by one and finish with PAD so the model sees where a name ends.
        following = encoded[1:] + [self.PAD_IDX] if encoded else []

        padding = [self.PAD_IDX] * (self.max_length - len(encoded))
        x = torch.tensor(padding + encoded, dtype=torch.long)
        y = torch.tensor(padding + following, dtype=torch.long)
        return x, y

# Split the dataset into training, validation, and testing sets
def train_val_test_split(names, val_size=0.1, test_size=0.1, seed=None):
    """Shuffle `names` and cut it into train, validation and test parts.

    The input is never modified: a copy is shuffled and sliced.

    Args:
        names: sequence of names (list, 1-D NumPy array, ...).
        val_size: fraction of the data used for validation.
        test_size: fraction of the data used for testing.
        seed: optional seed for a private RNG, giving a reproducible split.
            When ``None`` the module-level ``random`` state is used.

    Returns:
        A tuple ``(train, val, test)``. Inputs that provide ``.copy()``
        (lists, NumPy arrays) keep their type; anything else is returned as
        lists.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so that fractions which sum to 1 only up to float
    # rounding are still accepted.
    if val_size < 0 or test_size < 0 or val_size + test_size > 1 + 1e-9:
        raise ValueError(
            "val_size and test_size must be non-negative and sum to at most 1"
        )

    # Work on a copy so the caller's collection keeps its original order.
    shuffled = names.copy() if hasattr(names, 'copy') else list(names)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    test_split_idx = int(total * (1 - test_size))
    val_split_idx = int(total * (1 - test_size - val_size))
    return (
        shuffled[:val_split_idx],
        shuffled[val_split_idx:test_split_idx],
        shuffled[test_split_idx:],
    )

# Partition the corpus into its three subsets
train_names, val_names, test_names = train_val_test_split(names)

# Wrap every subset in a dataset that shares the vocabulary and sequence length
train_dataset, val_dataset, test_dataset = (
    NamesDataset(subset, char_to_int, max_length)
    for subset in (train_names, val_names, test_names)
)

# Batch loaders: only the training data is reshuffled on every epoch,
# validation and test batches keep a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# RNN Model
class NameGeneratorRNN(nn.Module):
    """Character-level model: embedding -> single-layer LSTM -> linear layer.

    Args:
        input_size: number of distinct input indices (embedding rows).
        hidden_size: width of both the embedding and the LSTM state.
        output_size: number of classes scored at every time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Score the next character for every position of `x`.

        Args:
            x: integer tensor of shape (seq_len, batch) -- the LSTM is built
                with its default sequence-first layout.
            hidden: ``(h, c)`` tuple, each of shape (1, batch, hidden_size).

        Returns:
            ``(out, hidden)``: unnormalised scores of shape
            (seq_len, batch, output_size) and the updated ``(h, c)`` state.
        """
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` state for `batch_size` sequences.

        The tensors are allocated with the device and dtype of the model's
        own weights, so the state stays usable after ``model.to(...)`` or
        ``model.double()``.
        """
        reference = self.fc.weight
        return (
            reference.new_zeros(1, batch_size, self.hidden_size),
            reference.new_zeros(1, batch_size, self.hidden_size),
        )

# Hyperparameters: the network reads and predicts over the same vocabulary,
# so its input and output widths are both the vocabulary size.
hidden_size = 128
input_size = output_size = vocab_size

# Build the network
model = NameGeneratorRNN(input_size, hidden_size, output_size)

# Objective (cross-entropy over next-character scores) and Adam optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20


def _sequence_loss(inputs, targets):
    """Return the loss of `model` on one batch, summed over the time steps.

    Args:
        inputs: integer tensor of shape (batch, seq_len) from a dataloader.
        targets: integer tensor of the same shape with the next-character
            index for every input position.

    Returns:
        A scalar tensor: the batch-averaged cross-entropy of every time step,
        added up over the sequence.
    """
    hidden = model.init_hidden(inputs.size(0))

    # The LSTM expects sequence-first input: (batch, seq_len) -> (seq_len, batch)
    inputs = inputs.transpose(0, 1)
    targets = targets.transpose(0, 1)

    # Run the whole sequence through the LSTM in one call rather than one
    # Python-level step per character.
    output, _ = model(inputs, hidden)
    mean_loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))

    # `criterion` averages over every (step, sample) pair; multiplying by the
    # number of steps gives the sum over steps of the per-step batch mean.
    return mean_loss * inputs.size(0)


for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_train_loss = 0
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        loss = _sequence_loss(inputs, targets)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when a loader holds no batches
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss}')

    model.eval()  # Evaluation mode; no gradients are needed for validation
    total_val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            total_val_loss += _sequence_loss(inputs, targets).item()

    avg_val_loss = total_val_loss / max(len(val_dataloader), 1)
    print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}')

# Generate names
def generate_similar_names(seed, max_length=8, num_samples=5):
    """Sample names that start with `seed` from the trained model.

    The LSTM state is first conditioned on every seed character, then one
    character at a time is sampled from the model's output distribution
    until the name reaches `max_length` characters or the padding token is
    drawn.

    Args:
        seed: prefix that every generated name starts with. It is lower-cased
            for the vocabulary lookup only, because the vocabulary is built
            from lower-cased names; characters still unknown map to padding.
            An empty seed is allowed and starts from the padding token.
        max_length: maximum total length of a name, seed included.
        num_samples: number of names to generate.

    Returns:
        A list of `num_samples` strings, each beginning with `seed`.
    """
    model.eval()  # Set model to evaluation mode
    pad_idx = char_to_int['<PAD>']
    input_sequence = [char_to_int.get(char, pad_idx) for char in seed.lower()]
    generated_names = []

    with torch.no_grad():  # sampling needs no gradient tracking
        for _ in range(num_samples):
            current_name = seed
            hidden = model.init_hidden(1)  # Initialize hidden for a single sequence

            # Feed all seed characters except the last so the state reflects
            # the whole prefix, not just its final character.
            if len(input_sequence) > 1:
                prefix = torch.tensor(input_sequence[:-1]).unsqueeze(1)  # (seq_len, 1)
                _, hidden = model(prefix, hidden)
            last_idx = input_sequence[-1] if input_sequence else pad_idx

            for _step in range(len(seed), max_length):
                input_char = torch.tensor([[last_idx]])  # (seq_len=1, batch=1)
                output, hidden = model(input_char, hidden)
                probabilities = torch.softmax(output[-1, 0], dim=0)
                last_idx = torch.multinomial(probabilities, 1).item()

                predicted_char = int_to_char[last_idx]
                if predicted_char == '<PAD>':
                    break
                current_name += predicted_char

            generated_names.append(current_name)

    return generated_names





Epoch 1/20, Train Loss: 16.123420727033558
Epoch 1/20, Validation Loss: 12.14051950545538
Epoch 2/20, Train Loss: 11.746169570033535
Epoch 2/20, Validation Loss: 11.501066707429432
Epoch 3/20, Train Loss: 11.29504296238437
Epoch 3/20, Validation Loss: 11.224675950549898
Epoch 4/20, Train Loss: 11.020053892779204
Epoch 4/20, Validation Loss: 11.043415887015206
Epoch 5/20, Train Loss: 10.786331709177215
Epoch 5/20, Validation Loss: 10.90342916761126
Epoch 6/20, Train Loss: 10.589723826917403
Epoch 6/20, Validation Loss: 10.832428114754814
Epoch 7/20, Train Loss: 10.417540971486847
Epoch 7/20, Validation Loss: 10.747848329089937
Epoch 8/20, Train Loss: 10.258356954422466
Epoch 8/20, Validation Loss: 10.652052334376744
Epoch 9/20, Train Loss: 10.13851047001002
Epoch 9/20, Validation Loss: 10.616367067609515
Epoch 10/20, Train Loss: 9.989524648233425
Epoch 10/20, Validation Loss: 10.56714271363758
Epoch 11/20, Train Loss: 9.902490879129047
Epoch 11/20, Validation Loss: 10.493872914995466
Ep

In [None]:
# Example usage: draw three names that start with the seed "aryan"
samples = generate_similar_names('aryan', num_samples=3)
print(samples)

['aryant)t', 'aryanaze', 'aryandvn']
