<a href="https://colab.research.google.com/github/taaha3244/NLP-a-day-keeps-doctors-away/blob/main/Names_Classification_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip '/content/drive/MyDrive/data.zip'

Archive:  /content/drive/MyDrive/data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [2]:
import os

# Load all names from the files in the 'names' folder and include their nationality
Path = '/content/data/names/'
file_names = os.listdir(Path)
all_names = []
nationalities = []
for filename in file_names:
    nationality = os.path.splitext(filename)[0]  # Nationality is the file name without extension
    with open(Path + filename, 'r', encoding='utf-8') as file:
        names = file.read().splitlines()
        all_names.extend(names)
        nationalities.extend([nationality] * len(names))  # Extend with the nationality for each name

# Example: Checking the total number of names and the first few names with their nationalities
total_names = len(all_names)
sample_data = list(zip(all_names[:5], nationalities[:5]))  # Pairing names with nationalities
print(total_names, sample_data)


20074 [('Adamidis', 'Greek'), ('Adamou', 'Greek'), ('Agelakos', 'Greek'), ('Akrivopoulos', 'Greek'), ('Alexandropoulos', 'Greek')]


In [3]:
def create_char_mapping(names):
    # Identify unique characters
    unique_chars = set(''.join(names))
    # Create a mapping from characters to integers
    char_to_int = {char: i for i, char in enumerate(unique_chars, 1)}  # Starting from 1 for zero-indexing
    return char_to_int

# Apply the function to your dataset
char_to_int = create_char_mapping(all_names)

# Example: Display the character-to-integer mapping
print(char_to_int)


{'ń': 1, 'ö': 2, 'k': 3, 'ß': 4, 'ê': 5, 'õ': 6, 's': 7, 'C': 8, 'ú': 9, 'ì': 10, 'G': 11, 'F': 12, 'ł': 13, 'i': 14, 'y': 15, 'ü': 16, 'I': 17, 'à': 18, 'H': 19, 'd': 20, 'ó': 21, 'w': 22, 'm': 23, 'q': 24, 'ç': 25, 'W': 26, 'f': 27, 'U': 28, 'ã': 29, 'A': 30, ':': 31, 'a': 32, 'ż': 33, 'R': 34, 'M': 35, '1': 36, 'l': 37, 'h': 38, 'g': 39, 'í': 40, 'Á': 41, 'u': 42, 'N': 43, 'T': 44, 'J': 45, 'S': 46, 'Y': 47, 'x': 48, 'D': 49, 'X': 50, 'V': 51, 'ä': 52, 'è': 53, 'O': 54, 'j': 55, "'": 56, 'Z': 57, 'P': 58, ',': 59, 'á': 60, 'Q': 61, 'o': 62, 'z': 63, 'r': 64, 'L': 65, 'ò': 66, 't': 67, 'Ś': 68, 'Ż': 69, 'e': 70, 'É': 71, 'é': 72, 'b': 73, 'c': 74, '/': 75, 'n': 76, ' ': 77, 'E': 78, 'ą': 79, 'B': 80, 'v': 81, '-': 82, 'K': 83, 'p': 84, '\xa0': 85, 'ñ': 86, 'ù': 87}


In [4]:
def create_nationality_mapping(nationalities):
    # Identify unique nationalities
    unique_nationalities = set(nationalities)
    # Create a mapping from nationalities to integers
    nationality_to_int = {nationality: i for i, nationality in enumerate(unique_nationalities)}
    return nationality_to_int

# Apply the function to your list of nationalities
nationality_to_int = create_nationality_mapping(nationalities)

# Example: Display the nationality-to-integer mapping
print(nationality_to_int)


{'German': 0, 'French': 1, 'Czech': 2, 'Korean': 3, 'Italian': 4, 'Chinese': 5, 'Scottish': 6, 'Irish': 7, 'Greek': 8, 'Russian': 9, 'Polish': 10, 'Arabic': 11, 'English': 12, 'Japanese': 13, 'Spanish': 14, 'Vietnamese': 15, 'Portuguese': 16, 'Dutch': 17}


In [5]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# Function to encode the names and nationalities
def encode_data(names, nationalities, char_to_int, nationality_to_int):
    # Encode names
    encoded_names = [[char_to_int[char] for char in name] for name in names]
    # Encode nationalities
    encoded_nationalities = [nationality_to_int[nationality] for nationality in nationalities]
    return encoded_names, encoded_nationalities

# Applying the encoding function
encoded_names, encoded_nationalities = encode_data(all_names, nationalities, char_to_int, nationality_to_int)

# Padding sequences to have the same length
max_name_length = max([len(name) for name in encoded_names])
X = pad_sequences(encoded_names, maxlen=max_name_length, padding='post')

# Converting labels to numpy array for use with PyTorch
y = np.array(encoded_nationalities)

# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Verifying the shape of the processed data
X_train.shape, X_val.shape, len(y_train), len(y_val)


((16059, 20), (4015, 20), 16059, 4015)

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class NameDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataset and dataloader
train_dataset = NameDataset(X_train, y_train)
val_dataset = NameDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [7]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, num_chars, embedding_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(num_chars, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last time step
        out = self.fc(lstm_out)
        return out

# Instantiate the model with hidden dimensions
num_chars=len(char_to_int)+1
hidden_dim = 128  # This is a hyperparameter you can tune
num_classes = len(nationality_to_int)
model = LSTMClassifier(num_chars, embedding_dim=64, hidden_dim=hidden_dim, num_classes=num_classes)


In [8]:
# Setup device-agnostic code
if torch.cuda.is_available():
    device = "cuda" # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = "mps" # Apple GPU
else:
    device = "cpu" # Defaults to CPU if NVIDIA GPU/Apple GPU aren't available

print(f"Using device: {device}")

Using device: cpu


In [9]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Checking if GPU is available and moving the model to GPU if it is
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


LSTMClassifier(
  (embedding): Embedding(88, 64)
  (lstm): LSTM(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=18, bias=True)
)

In [10]:
# Function to perform a training epoch
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in data_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

# Function to perform a validation epoch
def validate_epoch(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
    return total_loss / len(data_loader)

# Training loop
num_epochs = 10  # Number of epochs can be adjusted
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = validate_epoch(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1: Train Loss: 1.8514, Val Loss: 1.5622
Epoch 2: Train Loss: 1.4266, Val Loss: 1.3001
Epoch 3: Train Loss: 1.1393, Val Loss: 0.9955
Epoch 4: Train Loss: 0.9040, Val Loss: 0.8153
Epoch 5: Train Loss: 0.7607, Val Loss: 0.7639
Epoch 6: Train Loss: 0.6628, Val Loss: 0.6879
Epoch 7: Train Loss: 0.5997, Val Loss: 0.7001
Epoch 8: Train Loss: 0.5422, Val Loss: 0.6479
Epoch 9: Train Loss: 0.4978, Val Loss: 0.6379
Epoch 10: Train Loss: 0.4617, Val Loss: 0.6166


In [None]:

with torch.nograd():
  output=model