In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


# Step 1: Data acquisition + cleanup

In [3]:
def transform_data(df):

    # Extract labels
    Ys = df['m_label'].values

    # Extract pixel values and normalize
    pixel_columns = [f'r{r}c{c}' for r in range(20) for c in range(20)]
    pixels = df[pixel_columns].values

    # Reshape pixels to (num_samples, 20, 20) and normalize
    Xs = pixels.reshape(-1, 20, 20) / 256.0
    
    # Create dictionaries for label preprocessing
    unique_labels = sorted(set(Ys))
    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
    index_to_label = {idx: label for idx, label in enumerate(unique_labels)}
    
    # Map labels to indices
    Ys = np.array([label_to_index[label] for label in Ys])
    
    return Xs, Ys, label_to_index, index_to_label

# Load data
df = pd.read_csv('RICHARD.csv')
Xs1, Ys1, label_to_index1, index_to_label1 = transform_data(df)

## Step 2: Build a Pytorch network

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define the Convolutional Neural Network
class CharCNN(nn.Module):
    def __init__(self, num_classes):
        super(CharCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=128)  # Ensure this size is correct
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(-1, 64 * 5 * 5)  # Flatten the tensor
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Prepare the dataset and dataloader
Xs1_tensor = torch.tensor(Xs1, dtype=torch.float32).reshape(-1, 1, 20, 20)
Ys1_tensor = torch.tensor(Ys1, dtype=torch.long)

dataset = TensorDataset(Xs1_tensor, Ys1_tensor)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, loss function, and optimizer
num_classes = len(label_to_index1)
model = CharCNN(num_classes=num_classes)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch_images, batch_labels in data_loader:
        optimizer.zero_grad()
        predictions = model(batch_images)
        loss = loss_function(predictions, batch_labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')



Epoch [1/10], Average Loss: 5.4842
Epoch [2/10], Average Loss: 5.4336
Epoch [3/10], Average Loss: 4.8965
Epoch [4/10], Average Loss: 3.1641
Epoch [5/10], Average Loss: 1.7690
Epoch [6/10], Average Loss: 1.0553
Epoch [7/10], Average Loss: 0.7979
Epoch [8/10], Average Loss: 0.5929
Epoch [9/10], Average Loss: 0.5355
Epoch [10/10], Average Loss: 0.4259


## Step 3: Exploration and Evaluation

In [11]:
#Evaluate the network using cross validation (splitting data into training/testing). What is its accuracy?


# Define the Convolutional Neural Network
class CharCNN(nn.Module):
    def __init__(self, num_classes):
        super(CharCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(-1, 64 * 5 * 5)  # Flatten the tensor
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Prepare data
Xs1 = Xs1.reshape(-1, 1, 20, 20)  # Reshape for PyTorch
dataset1 = TensorDataset(torch.tensor(Xs1, dtype=torch.float32), torch.tensor(Ys1, dtype=torch.long))
data_loader1 = DataLoader(dataset1, batch_size=32, shuffle=True)

# Initialize model, loss function, and optimizer
num_classes1 = len(label_to_index1)
model = CharCNN(num_classes=num_classes1)
criterion = nn.CrossEntropyLoss()  # Define the loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for images, labels in data_loader1:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(data_loader1)
    print(f'Epoch [{epoch+1}/{epochs}], Average Loss: {average_loss:.4f}')

# Model evaluation
model.eval()
correct_preds = 0
total_samples = 0
with torch.no_grad():
    for images, labels in data_loader1:
        predictions = model(images)
        _, predicted_labels = torch.max(predictions, 1)
        total_samples += labels.size(0)
        correct_preds += (predicted_labels == labels).sum().item()

accuracy = 100 * correct_preds / total_samples
print(f'Accuracy: {accuracy:.2f}%')


Epoch [1/10], Average Loss: 5.4851
Epoch [2/10], Average Loss: 5.4529
Epoch [3/10], Average Loss: 5.0396
Epoch [4/10], Average Loss: 3.2411
Epoch [5/10], Average Loss: 1.6356
Epoch [6/10], Average Loss: 1.0601
Epoch [7/10], Average Loss: 0.7713
Epoch [8/10], Average Loss: 0.6204
Epoch [9/10], Average Loss: 0.4805
Epoch [10/10], Average Loss: 0.3788
Accuracy: 88.60%
