In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from ndlinear import NdLinear
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# 1. Load the WineQT dataset
data = pd.read_csv("WineQT.csv")

# 2. Preprocess the dataset
# Print column names and dataset shape to debug
print("Columns in dataset:", data.columns.tolist())
print("Dataset shape:", data.shape)

# Verify dataset size (WineQT should have ~1143 rows)
expected_rows = 1143
if abs(len(data) - expected_rows) > 10:  # Allow small tolerance for variations
    raise ValueError(f"Unexpected dataset size: {len(data)}. Expected ~{expected_rows} rows for WineQT. Please check the file 'C:/Users/vibhu/Download/archive/WineQT.csv'.")

# Drop irrelevant columns (e.g., 'Id') if present
if 'Id' in data.columns:
    data = data.drop(['Id'], axis=1)

# Check for missing values
if data.isnull().any().any():
    print("Warning: Missing values detected. Dropping rows with missing values.")
    data = data.dropna()

# Features and target (predict 'quality')
if 'quality' not in data.columns:
    raise KeyError("'quality' column not found. Available columns: " + str(data.columns.tolist()))
X = data.drop(['quality'], axis=1).values
y = data['quality'].values

# Split and standardize
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# Print tensor shapes to debug
print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)

# 3. NdLinear Model (Ensemble AI)
class NdLinearModel(nn.Module):
    def __init__(self, input_dim):
        super(NdLinearModel, self).__init__()
        self.ndlinear = NdLinear(input_dims=(input_dim,), hidden_size=(64,))
        self.output = nn.Linear(64, 1)
    
    def forward(self, x):
        x = x.unsqueeze(-1)  # Shape: (batch_size, input_dim, 1)
        x = self.ndlinear(x)  # Shape: (batch_size, 64, 1)
        x = x.squeeze(-1)  # Shape: (batch_size, 64)
        x = torch.relu(x)
        x = self.output(x)  # Shape: (batch_size, 1)
        return x

# Train NdLinear model
input_dim = X_train_scaled.shape[1]
ndlinear_model = NdLinearModel(input_dim=input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(ndlinear_model.parameters(), lr=0.001)

for epoch in range(200):
    ndlinear_model.train()
    optimizer.zero_grad()
    outputs = ndlinear_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Evaluate NdLinear
ndlinear_model.eval()
with torch.no_grad():
    y_pred_ndlinear = ndlinear_model(X_test_tensor).numpy().flatten()
    y_pred_ndlinear = np.clip(y_pred_ndlinear, 3, 8)  # Clip to valid quality range
mse_ndlinear = mean_squared_error(y_test, y_pred_ndlinear)

# 4. nn.Linear Model
class LinearModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_dim, 64)
        self.output = nn.Linear(64, 1)
    
    def forward(self, x):
        x = self.linear(x)
        x = torch.relu(x)
        x = self.output(x)
        return x

# Train nn.Linear model
linear_model = LinearModel(input_dim=input_dim)
optimizer = optim.Adam(linear_model.parameters(), lr=0.001)

for epoch in range(200):
    linear_model.train()
    optimizer.zero_grad()
    outputs = linear_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Evaluate nn.Linear
linear_model.eval()
with torch.no_grad():
    y_pred_linear = linear_model(X_test_tensor).numpy().flatten()
    y_pred_linear = np.clip(y_pred_linear, 3, 8)  # Clip to valid quality range
mse_linear = mean_squared_error(y_test, y_pred_linear)

# 6. Compare results
print(f"NdLinear MSE: {mse_ndlinear:.4f}")
print(f"nn.Linear MSE: {mse_linear:.4f}")
if mse_ndlinear < mse_linear:
    print("NdLinear outperforms nn.Linear!")
else:
    print("nn.Linear outperforms NdLinear.")

Columns in dataset: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'Id']
Dataset shape: (1143, 13)
X_train_tensor shape: torch.Size([914, 11])
y_train_tensor shape: torch.Size([914, 1])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (10054x1 and 11x64)