# MSDS 434 Pytorch Final Project Model
This notebook processes the training data, runs predictions on a given dataset, and uploads the predictions to an S3 bucket. The upload triggers an AWS Lambda function, which combines all CSVs in that bucket and stores the result in a separate bucket that is subsequently used for visualizations in Tableau.


In [None]:
# ! pip install boto3

import pandas as pd
import numpy as np
import logging
import boto3
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim

from botocore.exceptions import ClientError
from google.colab import userdata

Collecting boto3
  Downloading boto3-1.29.3-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.33.0,>=1.32.3 (from boto3)
  Downloading botocore-1.32.3-py3-none-any.whl (11.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.8.0,>=0.7.0 (from boto3)
  Downloading s3transfer-0.7.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.29.3 botocore-1.32.3 jmespath-1.0.1 s3transfer-0.7.0


In [None]:
# Fetch the training set from the public S3 bucket

train_url = 'https://bankmarketingkt.s3.us-west-2.amazonaws.com/train/train_df.csv'
df = pd.read_csv(train_url)

# 'y' is the column to predict; every other column is used as a feature

y = df['y']
X = df.drop(columns='y')

# Map the target labels to integer class ids; the fitted encoder is kept
# because its classes_ attribute is read later to size the output layer

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build PyTorch tensors: float32 for the features, long for the class labels

X_train_tensor, X_test_tensor = (
    torch.tensor(frame.values, dtype=torch.float32)
    for frame in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (y_train, y_test)
)

In [None]:
class NeuralNetwork(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the single hidden layer.
        num_classes: Number of output scores produced per input row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Sub-modules are registered in the same order they are applied.
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)

# Size the network from the data: one input per column of X and one output
# score per class known to the label encoder.

input_size = X.shape[1]
hidden_size = 64
num_classes = len(label_encoder.classes_)

# Create the model

model = NeuralNetwork(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Learning Rate

num_epochs = 100  # Number of Epochs

# Full-batch training: every epoch is a single optimizer step computed on the
# whole training tensor. Training mode is set explicitly so that re-running
# this cell after an evaluation/prediction cell starts from the right mode.

model.train()

for epoch in range(num_epochs):
    # Clear gradients left over from the previous step before the new pass
    optimizer.zero_grad()

    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    loss.backward()
    optimizer.step()

# Score the held-out split in evaluation mode and without tracking gradients,
# matching how the prediction cell runs the model.

model.eval()

with torch.no_grad():
    outputs = model(X_test_tensor)
    # The index of the largest score in each row is the predicted class
    _, predicted = torch.max(outputs, 1)

    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f'Test Accuracy: {np.round(accuracy * 100, decimals=2)}%')

Test Accuracy: 95.37%


In [None]:
# Load the new data for prediction

predict_csv = '6'
new_data_url = f'https://bankmarketingkt.s3.us-west-2.amazonaws.com/predictions/predict_df_{predict_csv}.csv'
new_data = pd.read_csv(new_data_url)
# Drop the 'y' column so only the feature columns are fed to the model
new_data = new_data.drop('y', axis=1)

# Convert new_data to PyTorch tensor

new_data_tensor = torch.tensor(new_data.values, dtype=torch.float32)

# Ensure your model is in evaluation mode

model.eval()

# Make predictions using the PyTorch model (no gradients needed for inference)

with torch.no_grad():
    new_predictions = model(new_data_tensor)

# The index of the largest score in each row is the predicted class
_, predicted_classes = torch.max(new_predictions, 1)
predicted_classes = predicted_classes.numpy()

# Add the predicted values to the dataset

new_data['Predicted_y'] = predicted_classes

# Create a CSV of the dataset with predictions. The name is built once and
# reused for the local file, the S3 object key and the status message.

output_name = f'predict_df_{predict_csv}_convert.csv'
new_data.to_csv(output_name, index=False)

# Upload csv to bankmarketingktconverts S3 bucket

client = boto3.client('s3',
                      aws_access_key_id = userdata.get('AWS_KEY_ID'),
                      aws_secret_access_key = userdata.get('AWS_SECRET_ID'))
bucket = 'bankmarketingktconverts'
cur_path = os.getcwd()
filename = os.path.join(cur_path, output_name)

# upload_file takes the path and opens the file itself, so no file handle is
# opened here (the previous unused open() call was never closed).
client.upload_file(filename, bucket, output_name)

print(f'{output_name} has been uploaded successfully to the {bucket} bucket')

predict_df_6_convert.csv has been uploaded successfully to the bankmarketingktconverts bucket
