In [28]:
!pip install scikit-learn



In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [30]:
path="C:\\backup\\internship\\day 7\\online+retail\\Online Retail.xlsx"

In [31]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [32]:
# Load the dataset
df = pd.read_excel(path)

# Preprocess data
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df = df[~df['InvoiceNo'].str.contains('C', case=False, na=False)]
df['TotalPurchaseValue'] = df['UnitPrice'] * df['Quantity']

df['YearMonth'] = df['InvoiceDate'].dt.to_period('M')
monthly_sales = df.groupby(['CustomerID', 'YearMonth'])['TotalPurchaseValue'].sum().reset_index()
monthly_sales.columns = ['CustomerID', 'YearMonth', 'TotalMonthlySales']

invoice_summary = df.groupby(['CustomerID', 'YearMonth', 'InvoiceNo']).agg(
    TotalInvoiceValue=('TotalPurchaseValue', 'sum')
).reset_index()

customer_monthly_summary = invoice_summary.groupby(['CustomerID', 'YearMonth']).agg(
    TotalMonthlySales=('TotalInvoiceValue', 'sum'),
    NumberOfPurchases=('InvoiceNo', 'count')
).reset_index()

customer_monthly_summary['AvgPurchaseValue'] = customer_monthly_summary['TotalMonthlySales'] / customer_monthly_summary['NumberOfPurchases']

data = pd.get_dummies(customer_monthly_summary[['CustomerID', 'YearMonth', 'AvgPurchaseValue']], columns=['CustomerID', 'YearMonth'])
data['TotalMonthlySales'] = customer_monthly_summary['TotalMonthlySales']


In [33]:
# Split data into features and target
X = data.drop('TotalMonthlySales', axis=1).values
y = data['TotalMonthlySales'].values

# Scale the data
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Use smaller sequence length and batch size to reduce memory usage
sequence_length = 6  # Reduced sequence length
batch_size = 2  # Reduced batch size

# Prepare the sequences for RNN input
def create_sequences(data, target, seq_length):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[i:i + seq_length]
        target_seq = target[i + seq_length]
        sequences.append(seq)
        targets.append(target_seq)
    return np.array(sequences), np.array(targets)

X_train_seq, y_train_seq = create_sequences(X_train, y_train, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, sequence_length)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test_seq, dtype=torch.float32).to(device)

# Create DataLoader for training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


MemoryError: Unable to allocate 434. MiB for an array with shape (4353, 13055) and data type object

In [None]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_size).to(device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Only take the output from the last time step
        return out

# Hyperparameters
input_size = X_train_tensor.shape[2]
hidden_size = 50
output_size = 1
num_epochs = 20

# Initialize model, loss function, and optimizer
model = RNNModel(input_size, hidden_size, output_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(num_epochs):
    for i, (seq, target) in enumerate(train_loader):
        # Forward pass
        outputs = model(seq)
        loss = criterion(outputs.squeeze(), target)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


In [None]:
# Predict future sales
def predict_future_sales(model, last_sequence, num_months, scaler_X, scaler_y):
    model.eval()
    predictions = []
    current_sequence = torch.tensor(last_sequence, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        for _ in range(num_months):
            predicted_scaled = model(current_sequence.unsqueeze(0))
            predicted_value = scaler_y.inverse_transform(predicted_scaled.cpu().numpy())
            predictions.append(predicted_value.flatten()[0])
            
            # Update the sequence by appending the latest prediction and removing the oldest value
            new_sequence = np.append(current_sequence.cpu().numpy()[:, 1:, :], predicted_scaled.cpu().numpy().reshape(1, 1, -1), axis=1)
            current_sequence = torch.tensor(new_sequence, dtype=torch.float32).to(device)

    return predictions

# Use the last sequence from the training data for prediction
last_sequence = X_scaled[-sequence_length:]
last_sequence = last_sequence.reshape((1, sequence_length, X_scaled.shape[1]))

# Predict future sales
predicted_1_month = predict_future_sales(model, last_sequence, 1, scaler_X, scaler_y)
print(f'Predicted sales for the next month: {predicted_1_month[0]}')

predicted_3_months = predict_future_sales(model, last_sequence, 3, scaler_X, scaler_y)
print(f'Predicted sales for the next 3 months: {predicted_3_months}')

predicted_1_year = predict_future_sales(model, last_sequence, 12, scaler_X, scaler_y)
print(f'Predicted sales for the next 12 months: {predicted_1_year}')
