In [1]:
# Install the third-party packages imported by the next cell (torch, datasets,
# nltk). The %pip magic is used so the install targets the running kernel.
%pip install torch datasets nltk

Collecting torch
  Using cached torch-2.6.0-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Using cached torch-2.6.0-cp310-cp310-win_amd64.whl (204.2 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
U

In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tokenize import word_tokenize
import re
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# read the "punkt" package, newer ones (including the 3.9.x line) look up
# "punkt_tab" instead, so fetch both to work on a fresh machine either way.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource)

# Tokenizer-Funktion
def preprocess_text(text):
    """Clean a raw string and split it into word tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, and the remainder is passed to
    NLTK's ``word_tokenize``.

    Args:
        text: The raw input string.

    Returns:
        The token list produced by ``word_tokenize`` for the cleaned text.
    """
    # Lower-case first, then drop punctuation in a single substitution.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return word_tokenize(cleaned)

# Padding-Funktion für dynamisches Padding innerhalb eines Batches
def pad_sequences(sequences, pad_token="<PAD>"):
    """Right-pad token sequences to the length of the longest one.

    Args:
        sequences: Iterable of token sequences (lists or tuples). It is
            materialized once, so one-shot iterables such as generators
            are handled correctly.
        pad_token: Filler appended to every sequence that is shorter than
            the longest one.

    Returns:
        A list of new lists that all share the same length. The input
        sequences are not modified. An empty input yields an empty list.
    """
    # Copy up front: the data is traversed twice (once to find the maximum,
    # once to pad), which would silently exhaust a generator argument.
    # list() also lets tuple sequences be concatenated with the padding.
    rows = [list(seq) for seq in sequences]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_length = max((len(row) for row in rows), default=0)
    return [row + [pad_token] * (max_length - len(row)) for row in rows]

class CommonsenseQADataset(Dataset):
    """Tokenized, in-memory view of one split of ``commonsense_qa``.

    Every example of the split is preprocessed once at construction time
    and stored as a ``(question_tokens, choices_tokens, answer_index)``
    tuple in ``processed_data``.
    """

    # Answer index stored for examples whose ``answerKey`` is empty, i.e.
    # unlabeled examples. Previously such an example crashed ``ord``.
    UNLABELED = -1

    def __init__(self, split="train"):
        """Load the requested split and preprocess all of its examples.

        Args:
            split: Name of the dataset split to load, e.g. ``"train"``.
        """
        self.dataset = load_dataset("commonsense_qa")[split]
        self.processed_data = self.process_data()

    @staticmethod
    def _answer_index(answer_key):
        """Map an answer letter to a zero-based index (``"A"`` -> 0).

        Args:
            answer_key: Single upper-case letter, or an empty value when
                the example carries no label.

        Returns:
            The zero-based index of the letter, or ``UNLABELED`` when
            ``answer_key`` is empty or ``None``.
        """
        if not answer_key:
            return CommonsenseQADataset.UNLABELED
        return ord(answer_key) - ord('A')

    def process_data(self):
        """Tokenize the question and answer choices of every example.

        Returns:
            A list with one ``(question_tokens, choices_tokens,
            answer_index)`` tuple per example in ``self.dataset``.
        """
        processed = []
        for item in self.dataset:
            question = preprocess_text(item["question"])
            choices = [preprocess_text(choice) for choice in item["choices"]["text"]]
            answer = self._answer_index(item["answerKey"])
            processed.append((question, choices, answer))
        return processed

    def __len__(self):
        """Return the number of preprocessed examples."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the preprocessed example stored at position ``idx``."""
        return self.processed_data[idx]

# Collate-Funktion für den DataLoader
def collate_fn(batch):
    """Collate dataset items into one padded batch.

    Args:
        batch: Sequence of ``(question_tokens, choices_tokens,
            answer_index)`` tuples, one per example.

    Returns:
        A tuple ``(questions_padded, choices_padded, answers)`` where
        ``questions_padded`` is a list of equally long token lists,
        ``choices_padded`` holds, per example, a list of token lists that
        share one length across the whole batch, and ``answers`` is a
        tensor built from the answer indices.
    """
    questions, choices, answers = zip(*batch)
    questions_padded = pad_sequences(questions)

    # Pad the choices against the longest choice of the whole batch, not
    # just of their own example; otherwise choice lengths differ between
    # examples and the batch cannot be turned into a rectangular tensor.
    flat_choices = [choice for choice_list in choices for choice in choice_list]
    flat_padded = pad_sequences(flat_choices) if flat_choices else []

    # Regroup the flat padded list into one sub-list per example.
    choices_padded = []
    start = 0
    for choice_list in choices:
        stop = start + len(choice_list)
        choices_padded.append(flat_padded[start:stop])
        start = stop

    return questions_padded, choices_padded, torch.tensor(answers)

# Datenlade-Funktion mit Padding
def load_data(batch_size=16):
    """Build the training and validation data loaders.

    Args:
        batch_size: Number of examples per batch for both loaders.

    Returns:
        A ``(train_loader, val_loader)`` tuple. The training loader
        shuffles its examples, the validation loader keeps their order;
        both use ``collate_fn`` to pad their batches.
    """
    # Build both datasets first, then wrap them in loaders.
    datasets_by_split = {
        name: CommonsenseQADataset(name) for name in ("train", "validation")
    }
    shared_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
    return (
        DataLoader(datasets_by_split["train"], shuffle=True, **shared_kwargs),
        DataLoader(datasets_by_split["validation"], shuffle=False, **shared_kwargs),
    )

# Testlauf
if __name__ == "__main__":
    # Smoke test: build the loaders and show the first training batch only.
    train_loader, val_loader = load_data()
    try:
        batch = next(iter(train_loader))
    except StopIteration:
        # Empty loader: there is nothing to show.
        pass
    else:
        print("Beispiel-Batch:", batch)


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jonas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Beispiel-Batch: ([['the', 'air', 'conditioning', 'went', 'out', 'during', 'a', 'film', 'and', 'the', 'clientele', 'walked', 'out', 'due', 'to', 'discomfort', 'what', 'were', 'they', 'leaving', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['yesterday', 'there', 'was', 'heavy', 'rain', 'and', 'theres', 'water', 'standing', 'everywhere', 'but', 'its', 'coming', 'down', 'hard', 'again', 'and', 'this', 'time', 'its', 'cold', 'what', 'sort', 'of', 'storm', 'is', 'it', 'now'], ['where', 'is', 'a', 'good', 'place', 'to', 'sore', 'a', 'wind', 'instrument', 'in', 'you', 'home', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['what', 'would', 'killing', 'people', 'make', 'one', 'of', 'your', 'victims', 'do', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['where', 'do

In [3]:
# Rebuild the loaders and print the first training batch as a sanity check.
train_loader, val_loader = load_data()
try:
    batch = next(iter(train_loader))
except StopIteration:
    # Empty loader: there is nothing to show.
    pass
else:
    print("Beispiel-Batch:", batch)


Beispiel-Batch: ([['older', 'stars', 'collect', 'in', 'groups', 'called', 'what', 'on', 'the', 'outer', 'regions', 'of', 'a', 'galaxy', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['what', 'is', 'served', 'with', 'dinner', 'at', 'a', 'french', 'restaurant', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['denny', 'couldnt', 'find', 'his', 'hairbrush', 'he', 'looked', 'everywhere', 'for', 'it', 'under', 'and', 'over', 'up', 'and', 'down', 'it', 'was', 'not', 'where', 'he', 'thought', 'it', 'would', 'be', 'so', 'he', '