In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [7]:
x_train = pd.read_csv('X_train_NHkHMNU.csv')
y_train = pd.read_csv('y_train_ZAN5mwg.csv')
x_test = pd.read_csv('X_test_final.csv')

## Data cleaning (Yiru's code)

In [17]:
x_train.interpolate(method='polynomial',order=3, inplace=True)
x_train_clean = x_train.drop(['COUNTRY'], axis=1).fillna(0)

x_test.interpolate(method='polynomial',order=3, inplace=True)
x_test_clean = x_test.drop(['COUNTRY'], axis=1).fillna(0)

y_train_clean = y_train['TARGET']


missing_values_x_train_clean = x_train_clean.isnull().sum()
print(missing_values_x_train_clean)
print("========================================")

missing_values_y_train_clean = y_train_clean.isnull().sum()
print(missing_values_y_train_clean)


print("\nDataFrame Info:\n", x_train.info())

shape = x_train.shape


print("DataFrame Shape:", shape)

ID                  0
DAY_ID              0
DE_CONSUMPTION      0
FR_CONSUMPTION      0
DE_FR_EXCHANGE      0
FR_DE_EXCHANGE      0
DE_NET_EXPORT       0
FR_NET_EXPORT       0
DE_NET_IMPORT       0
FR_NET_IMPORT       0
DE_GAS              0
FR_GAS              0
DE_COAL             0
FR_COAL             0
DE_HYDRO            0
FR_HYDRO            0
DE_NUCLEAR          0
FR_NUCLEAR          0
DE_SOLAR            0
FR_SOLAR            0
DE_WINDPOW          0
FR_WINDPOW          0
DE_LIGNITE          0
DE_RESIDUAL_LOAD    0
FR_RESIDUAL_LOAD    0
DE_RAIN             0
FR_RAIN             0
DE_WIND             0
FR_WIND             0
DE_TEMP             0
FR_TEMP             0
GAS_RET             0
COAL_RET            0
CARBON_RET          0
dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494 entries, 0 to 1493
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1494 non-null

## Fitting the model

### Run this for calibration

In [126]:
#Splitting the data into training and test sets
x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(x_train_clean, y_train_clean, test_size=0.2, random_state=42)

#For model calibration
X_train_array = x_train_final.values
y_train_array = y_train_final.values

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32)
y_train_tensor = y_train_tensor.unsqueeze(1)

### Run this for official testing

In [119]:
#Convert dataframes to numpy arrays
X_train_array = x_train_clean.values
y_train_array = y_train_clean.values

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32)
y_train_tensor = y_train_tensor.unsqueeze(1)

In [67]:
X_train_array.shape

(1195, 34)

In [127]:
#Defining the model
class FeedForwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [128]:
# Shuffle data before splitting (optional but recommended)
#indices = np.random.permutation(len(x_train_clean))
#X_train_shuffled = x_train_clean.iloc[indices]  # Use iloc to access rows by integer location
#y_train_shuffled = x_train_clean.iloc[indices]

# Define hyperparameters
input_dim = x_train_clean.shape[1]
hidden_dim = 64
output_dim = 1  # Number of output neurons (for regression)
learning_rate = 0.001
num_epochs = 30
batch_size = 32

# Initialize k-fold cross-validation
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True)

# Initialize lists to store evaluation metrics
mse_scores = []

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(kf.split(x_train_clean)):
    print(f"Fold {fold + 1}/{k_folds}")
    
    # Shuffle data before splitting
    indices = np.random.permutation(len(x_train_clean))
    x_train_shuffled = x_train_clean.iloc[indices]
    y_train_shuffled = y_train_clean.iloc[indices]
    
    # Split data into training and validation sets
    X_train_fold, X_val_fold = x_train_shuffled.iloc[train_indices], x_train_shuffled.iloc[val_indices]
    y_train_fold, y_val_fold = y_train_shuffled.iloc[train_indices], y_train_shuffled.iloc[val_indices]
    
    # Convert data to compatible data types
    X_train_fold = X_train_fold.astype('float32')
    y_train_fold = y_train_fold.astype('float32')
    X_val_fold = X_val_fold.astype('float32')
    y_val_fold = y_val_fold.astype('float32')
    
    # Convert data to PyTorch tensors
    train_data = TensorDataset(torch.tensor(X_train_fold.values, dtype=torch.float32),
                               torch.tensor(y_train_fold.values, dtype=torch.float32))
    val_data = TensorDataset(torch.tensor(X_val_fold.values, dtype=torch.float32),
                             torch.tensor(y_val_fold.values, dtype=torch.float32))
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)

    # Initialize model
    model = FeedForwardNN(input_dim, hidden_dim, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.view(-1, 1))  # Reshape labels to match output dimension
            loss.backward()
            optimizer.step()
    
    # Evaluate model
    model.eval()
    mse = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            mse += criterion(outputs, labels.view(-1, 1)).item() / len(val_loader)
    mse_scores.append(mse)
    
# Calculate average MSE across all folds
average_mse = sum(mse_scores) / len(mse_scores)
print(f"Average MSE: {average_mse}")


Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
Average MSE: 1.2670537364482881


### TESTING USING THE ARTIFICIAL TEST SET

In [129]:
# Convert the test set to PyTorch tensor
X_test_tensor = torch.tensor(x_test_final.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_final.values, dtype=torch.float32)

# Convert data to PyTorch tensor
test_data = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader for the test set
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Evaluate model on test set
model.eval()
mse_test = 0.0
num_samples = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Reshape the output tensor if needed
        outputs = outputs.view(-1)  # Remove the extra dimension
        # Reshape the labels tensor to match the shape of the outputs
        labels = labels.view(-1)   # Ensure labels have the same shape as outputs
        mse_test += criterion(outputs, labels).item() * len(inputs)
        num_samples += len(inputs)
mse_test /= num_samples

# Calculate RMSE
rmse_test = np.sqrt(mse_test)

print(f"Out-of-Sample MSE: {mse_test}")
print(f"Out-of-Sample RMSE: {rmse_test}")

Out-of-Sample MSE: 1.270296509847992
Out-of-Sample RMSE: 1.1270743142526103


### CREATING ESTIMATES FOR THE OFFICIAL TESTING

In [123]:
# Convert the test set to PyTorch tensor
X_test_tensor = torch.tensor(x_test_clean.values, dtype=torch.float32)

# Make predictions using the trained model
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)

# Convert the tensor predictions to a NumPy array
y_pred = y_pred_tensor.numpy()

# Load the "ID" column from the original test data
id_column = x_test['ID']

# Create a DataFrame to store the predictions along with the "ID" column
predictions_df = pd.DataFrame({'ID': id_column, 'TARGET': y_pred.flatten()})

# Save the predictions DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)
predictions_df.head()

Unnamed: 0,ID,TARGET
0,1115,0.503993
1,1202,0.866817
2,1194,0.466617
3,1084,0.436126
4,1135,0.281413


In [131]:
# Function to calculate feature importance
def calculate_feature_importance(model, X_val, y_val):
    feature_importance = {}
    model.eval()
    with torch.no_grad():
        baseline_preds = model(X_val)
        baseline_mse = mean_squared_error(y_val, baseline_preds)
        for i, col in enumerate(X_val.columns):
            X_val_shuffled = X_val.copy()
            X_val_shuffled[col] = shuffle(X_val[col])
            shuffled_preds = model(X_val_shuffled)
            shuffled_mse = mean_squared_error(y_val, shuffled_preds)
            feature_importance[col] = baseline_mse - shuffled_mse
    return feature_importance

# Assuming you have X_val and y_val as your validation data
# Assuming your model is already trained and stored in the variable `model`

# Convert validation data to PyTorch tensor
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

# Calculate feature importance
feature_importance = calculate_feature_importance(model, X_val_tensor, y_val_tensor)

# Rank features based on importance
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

NameError: name 'X_val' is not defined

# CODE WITHOUT CROSS VALIDATION

In [51]:
# Define hyperparameters
input_dim = X_train_tensor.shape[1]  # Number of input features  # Number of input features
hidden_dim = 64  # Number of neurons in the hidden layer
output_dim = 1  # Number of output neurons (for regression)

learning_rate = 0.001
num_epochs = 10
batch_size = 32


# Create DataLoader for batching and shuffling the data
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Instantiate the model
model = FeedForwardNN(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    y_pred = model(X_train_tensor).numpy()

# Make predictions (if needed)
# y_pred = model(new_data_tensor).numpy()

# Now you have y_pred containing the predicted values

## Running the model on test data

In [52]:
X_test_tensor = torch.tensor(x_test_clean.values, dtype=torch.float32)

# Make predictions using the trained model
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)

# Convert the tensor predictions to a NumPy array
y_pred = y_pred_tensor.numpy()

# Load the "ID" column from the original test data
id_column = x_test['ID']

# Create a DataFrame to store the predictions along with the "ID" column
predictions_df = pd.DataFrame({'ID': id_column, 'TARGET': y_pred.flatten()})

# Save the predictions DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)