In [None]:
%pip install transformers torch 

In [2]:

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pickle
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#from sentence_transformers import SentenceTransformer

import os

# Path to input and output
input_path = "/Users/dn10/Downloads/Bsub_dataset/data.jsonl.gz"
# NOTE(review): the file name says "under_1GB" but the limit enforced below is 5 GB.
# The name is kept unchanged because later cells read `output_path`.
output_path = "/Users/dn10/Downloads/Bsub_dataset/filtered_under_1GB.jsonl"

# Target size limit (in bytes) - 5GB
size_limit = 5 * 1024 * 1024 * 1024  # 5 GB

# Stream the gzipped JSONL input in chunks so the whole file never has to fit in memory,
# keep only rows whose 'Command' contains "#BSUB", and append them to the output file.
with open(output_path, "w") as output_file:
    for chunk in pd.read_json(input_path,
                              lines=True,
                              compression='gzip',
                              chunksize=100000):

        # Filter rows that contain #BSUB in 'Command' column (case-insensitive; NaN counts as no match)
        filtered_chunk = chunk[chunk['Command'].str.contains('#BSUB', case=False, na=False)]

        # Write to file in JSONL format
        filtered_chunk.to_json(output_file, orient='records', lines=True)

        # Push buffered text to disk first; otherwise getsize() misses whatever is
        # still sitting in the file object's write buffer.
        output_file.flush()

        # Check file size after writing
        current_size = os.path.getsize(output_path)
        print(f"Written so far: {round(current_size / (1024 ** 2), 2)} MB")

        # The check runs after each chunk, so the final file can exceed the limit by up to one chunk.
        if current_size >= size_limit:
            print("File size limit reached. Stopping.")
            break


Written so far: 319.74 MB
Written so far: 573.07 MB
Written so far: 772.61 MB
Written so far: 911.79 MB
Written so far: 1070.62 MB
Written so far: 1425.34 MB
Written so far: 1822.23 MB
Written so far: 2215.33 MB
Written so far: 2578.79 MB
Written so far: 2912.58 MB
Written so far: 3231.09 MB
Written so far: 3483.26 MB
Written so far: 3629.16 MB
Written so far: 3750.34 MB
Written so far: 3784.89 MB
Written so far: 4043.36 MB
Written so far: 4348.55 MB
Written so far: 4673.26 MB
Written so far: 4811.32 MB
Written so far: 4970.51 MB
Written so far: 5093.94 MB
Written so far: 5171.45 MB
File size limit reached. Stopping.


In [None]:
# Load the filtered JSONL file written by the previous cell into a single DataFrame.
df = pd.read_json(output_path, lines=True)
# Row count, shown as the cell output.
len(df)

In [None]:
# Preview the first five rows to check the columns loaded as expected.
df.head(5)

In [None]:
# Structural overview: df.info() prints directly to stdout.
df.info()
# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be displayed explicitly or they are silently discarded.
display(df.describe())
# Missing-value count per column (rendered as the cell output).
df.isna().sum()

In [None]:
# Plot a 50-bin histogram of the MAX_MEM_USAGE_MB column on the raw (unsampled) frame.
plt.hist(df["MAX_MEM_USAGE_MB"], bins=50)

In [None]:
#%pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the MAX_MEM_USAGE_MB column to inspect its spread and outliers.
sns.boxplot(x=df['MAX_MEM_USAGE_MB'])
plt.xscale('log')  # Optional: helps if values vary a lot
plt.title("Boxplot of Memory Usage")
plt.show()

In [None]:
# Selected quantiles of MAX_MEM_USAGE_MB; preprocess_data below takes a quantile cut-off for the same column.
df['MAX_MEM_USAGE_MB'].quantile([0.25, 0.5, 0.75, 0.90, 0.95, 0.99, 0.999])


In [None]:
def preprocess_data(df, column= "MAX_MEM_USAGE_MB", min_mem_mb = 1.0, quantile = 0.99, bins =100, samples_per_bin=1000, random_state=42):
    """
    Filter a DataFrame on one numeric column and down-sample it so that no
    equal-width value bin contributes more than ``samples_per_bin`` rows.

    Steps: drop rows below ``min_mem_mb``; drop rows above the ``quantile``
    of the remaining values; cut the column into ``bins`` equal-width
    intervals; sample at most ``samples_per_bin`` rows from each interval.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to filter and bin on.
        min_mem_mb (float): Rows with ``df[column]`` below this are dropped.
        quantile (float): Quantile (0-1) of the remaining values used as the upper cut-off.
        bins (int): Number of equal-width bins passed to ``pd.cut``.
        samples_per_bin (int): Maximum number of rows kept per bin.
        random_state (int): Seed passed to ``DataFrame.sample`` for every bin.

    Returns:
        pd.DataFrame: Sampled rows with the original columns and a fresh
        0..n-1 index, ordered by bin. Empty (same columns) when no row
        passes the minimum filter.
    """
    # Filter jobs with low memory (NaN values also fail this comparison and are dropped)
    kept = df[df[column] >= min_mem_mb]

    # pd.cut raises on empty input, so return early with the columns intact.
    if kept.empty:
        return kept.reset_index(drop=True)

    # Calculate the upper bound using quantile, then trim the long tail
    upper_bound = kept[column].quantile(quantile)
    kept = kept[kept[column] <= upper_bound].reset_index(drop=True)

    # Bin the data. The labels live in a separate Series rather than a temporary
    # column, so nothing has to be dropped afterwards and the result does not
    # depend on whether groupby.apply keeps grouping columns.
    bin_labels = pd.cut(kept[column], bins=bins, duplicates='drop')

    # Sample from each non-empty bin; groups are iterated in ascending bin order.
    sampled_parts = [
        group.sample(min(len(group), samples_per_bin), random_state=random_state)
        for _, group in kept.groupby(bin_labels, observed=True)
    ]
    if not sampled_parts:
        return kept.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_parts).reset_index(drop=True)

In [None]:
# Build the balanced sample used by the rest of the notebook: at most 1000 rows from each of 100 bins,
# after dropping rows below 1.0 and above the 0.99 quantile of MAX_MEM_USAGE_MB.
df_balanced = preprocess_data(df, column="MAX_MEM_USAGE_MB", min_mem_mb=1.0, quantile=0.99, bins=100, samples_per_bin=1000, random_state=42)
print(f"Number of rows after preprocessing: {len(df_balanced)}")
df_balanced.head(5)

In [None]:
# Histogram of the sampled frame, to compare its distribution with the raw histogram above.
df_balanced['MAX_MEM_USAGE_MB'].hist(bins=50, edgecolor='black')
# df_balanced was produced by preprocess_data: group rows into bins, sample each bin, then reset the index.
plt.show()

In [None]:
# Row count of the unsampled frame, for comparison with len(df_balanced) printed above.
len(df)

In [None]:
# Mean pooling: average token embeddings over the sequence axis, ignoring masked-out positions
def mean_pooling(model_output, attention_mask):
    """
    Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: Indexable model result whose element 0 holds the token embeddings.
        attention_mask (torch.Tensor): Mask that is unsqueezed on the last axis and
            expanded to the token-embedding shape; positions with 0 do not contribute.

    Returns:
        torch.Tensor: Sum of the masked token embeddings over dimension 1 divided
        by the mask sum over dimension 1 (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_counts

# Load model from HuggingFace Hub
# The tokenizer and the model are loaded from the same checkpoint name; both are used as globals by get_embedding.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2')
# Tokenise the first command as a quick check of the tokenizer.
encoded_input = tokenizer(df['Command'][0], padding=True, truncation=True, return_tensors='pt')
# NOTE(review): `tokens` is computed but never displayed or used in this cell.
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
# NOTE(review): integer indexing of the tokenizer output presumably returns the encoding object of the
# first sequence - confirm this is the intended output rather than `tokens`.
print(encoded_input[0])




In [None]:
# Embed a single command string with the globally loaded tokenizer and model
def get_embedding(command):
    """
    Compute a mean-pooled sentence embedding for one command.

    Relies on the module-level ``tokenizer``, ``model`` and ``mean_pooling``.

    Args:
        command (str): Command text to embed; it is tokenised with truncation enabled.

    Returns:
        torch.Tensor: The output of ``mean_pooling``, returned as-is without
        flattening. Later cells index it as ``x[0][0]`` and call ``.size(1)``,
        so it is presumably of shape (1, hidden_size) - TODO confirm.
    """
    batch = tokenizer(command, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])

In [None]:
# Apply the embedding function to each row in the DataFrame
# One tokenizer + model call is made per command, and the result is stored as a tensor in each cell.
# NOTE(review): this adds a column to df_balanced in place and is likely the slowest cell of the notebook.
df_balanced["Embeddings"] = df_balanced["Command"].apply(lambda x: get_embedding(x))


In [None]:
import pickle
# Store the sampled frame (including the tensor-valued Embeddings column) in a pickle file
# in the current working directory, so the embeddings do not have to be recomputed.
with open('df_embeddings.pickle','wb') as file:
    pickle.dump(df_balanced,file)



In [None]:
# Reload the pickled frame written by the previous cell.
# NOTE(review): this rebinds `df`, which until now held the full filtered dataset, to the sampled frame
# with embeddings; the cells below keep using `df_balanced`.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this notebook.
with open ('df_embeddings.pickle', 'rb') as file:
    df = pickle.load(file)

In [None]:
# Plain-text preview of the sampled frame, now including the Embeddings column.
print(df_balanced.head(5))

In [None]:
# Median of MAX_MEM_USAGE_MB in the sampled frame.
df_balanced.MAX_MEM_USAGE_MB.median()

In [None]:
# Number of sampled rows whose MAX_MEM_USAGE_MB exceeds 10000.
(df_balanced['MAX_MEM_USAGE_MB'] > 10000).sum()

In [None]:
def testing(df):
    """
    Scatter-plot the first embedding component against memory usage.

    Side effects: adds (or overwrites) the column ``emb_0`` on ``df``, prints
    that column and ``MAX_MEM_USAGE_MB``, and shows a matplotlib figure.

    Args:
        df (pd.DataFrame): Frame with an ``Embeddings`` column whose values
            support ``x[0][0].item()`` and a ``MAX_MEM_USAGE_MB`` column.
    """
    # Element [0][0] of every embedding becomes a scalar feature
    first_component = df["Embeddings"].apply(lambda emb: emb[0][0].item())
    df["emb_0"] = first_component

    for shown_column in ("emb_0", "MAX_MEM_USAGE_MB"):
        print(df[shown_column])

    # Feature on the x axis, target on the y axis
    plt.scatter(df["emb_0"], df["MAX_MEM_USAGE_MB"], alpha=0.5)
    plt.xlabel("First Embedding dimension")
    plt.ylabel("Memory")
    plt.title("Scatter plot of first embedding dimension vs Memory")
    plt.show()
testing(df_balanced)


In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np


def testing(df):
    """
    Fit a one-feature linear regression from the first embedding component
    to log1p(MAX_MEM_USAGE_MB) and plot predictions against actual values.

    This redefines the ``testing`` function from the earlier scatter-plot cell.
    ``LinearRegression`` is looked up when the function is called and must be
    scikit-learn's estimator at that point.

    Side effects: adds (or overwrites) the column ``emb_0`` on ``df``, prints
    diagnostics, and shows a matplotlib figure. The fit is evaluated on the
    same data it was trained on.

    Args:
        df (pd.DataFrame): Frame with an ``Embeddings`` column whose values
            support ``x[0][0].item()`` and a ``MAX_MEM_USAGE_MB`` column.
    """
    # getting the first embedding element
    df["emb_0"] = df["Embeddings"].apply(lambda x: x[0][0].item())
    print(df["emb_0"])
    X = df[["emb_0"]].values
    # Transform the target variable using natural log plus one
    y = np.log1p(df["MAX_MEM_USAGE_MB"].values)

    print(X.shape)
    print(y.shape)

    # Local name chosen so it is not confused with the notebook-level `model`.
    regressor = LinearRegression()
    regressor.fit(X, y)

    # Predict on training data
    y_pred = regressor.predict(X)

    # Plot prediction vs actual. Both axes are on the log1p scale, because the
    # regression target was transformed above; the labels say so explicitly.
    plt.figure(figsize=(8, 5))
    plt.scatter(y_pred, y, alpha=0.3, c='royalblue', label='Training samples', marker='o')

    plt.legend()
    plt.xlabel("Predicted log1p(memory used) from emb_0")
    plt.ylabel("Actual log1p(memory used)")
    plt.title("Predicted vs Actual (using emb_0 only)")

    plt.tight_layout()
    plt.show()

    print("Done testing(). Model coefficient:", regressor.coef_, "Intercept:", regressor.intercept_)
testing(df_balanced)

In [None]:
# Convert tensors in the Embeddings column to lists
# Each cell value is iterated and .tolist() is called on every element, giving nested plain lists
# that can be serialised to JSON.
# NOTE(review): the column is overwritten in place, so running this cell a second time would fail
# because the elements are then already lists without a .tolist() method.
df["Embeddings"] = df["Embeddings"].apply(lambda x: [tensor.tolist() for tensor in x])

# Save to new JSON file
df.to_json("/Users/dn10/Downloads/Bsub_dataset/data_with_embeddings.jsonl.gz", orient='records', lines=True, compression='gzip')


In [None]:
# Load the DataFrame with embeddings
# Reads back the gzipped JSONL written by the previous cell; the Embeddings values were saved as
# nested lists, not tensors.
#Cleaning the data
df = pd.read_json("/Users/dn10/Downloads/Bsub_dataset/data_with_embeddings.jsonl.gz", lines=True, compression='gzip')
# The numeric coercion / null filtering below is currently disabled.
#df["MAX_MEM_USAGE_MB"] = pd.to_numeric(df["MAX_MEM_USAGE_MB"], errors = "coerce") # Convert to numeric, set errors to coerce to replace invalid parsing with NaN
#df = df[df["MAX_MEM_USAGE_MB"].notnull()].reset_index(drop=True)
#df["NUM_EXEC_PROCS"] = pd.to_numeric(df["NUM_EXEC_PROCS"], errors = "coerce")
#df = df[df["NUM_EXEC_PROCS"].notnull()].reset_index(drop=True)
print(df.head(5))

In [None]:
## Creating a Pytorch Dataset class for the embeddings

class CommandDataset(Dataset):
    """
    Map-style PyTorch dataset over a DataFrame of commands, their resource
    figures, and precomputed embeddings.

    Every lookup is positional (0 .. len-1), so the DataFrame may carry any index.
    """

    def __init__(self, df):
        """
        Args:
            df(pd.DataFrame): DataFrame containing entire dataset with embeddings.
                Must provide the columns "Command", "MEM_REQUESTED_MB",
                "MAX_MEM_USAGE_MB", "NUM_EXEC_PROCS" and "Embeddings".
        """
        self.commands = df["Command"]

        # For efficiency, the numeric columns are converted once to float32 tensors.
        self.memory_requested = torch.tensor(df["MEM_REQUESTED_MB"].values, dtype=torch.float32)
        self.memory_used = torch.tensor(df["MAX_MEM_USAGE_MB"].values, dtype=torch.float32)
        self.num_of_processors = torch.tensor(df["NUM_EXEC_PROCS"].values,dtype=torch.float32)

        # Embeddings stay in the Series; each value is converted to a tensor on access,
        # so both tensor-valued and list-valued columns are accepted.
        self.embeddings = df["Embeddings"]

        print(f"Initialized CommandDataset with {len(self)}")

    def __len__(self):
        """Returns the length of the dataset / number of commands."""
        return len(self.commands)

    def __getitem__(self, idx):
        """
        Returns the item at the given position.

        Args:
            idx(int): position of the item to return
        Returns:
            dict: Dictionary containing the command, memory requested, memory used,
            number of processors, and embeddings (as a float32 tensor)
        """
        return {
            "command": self.commands.iloc[idx],
            "memory_requested": self.memory_requested[idx],
            "memory_used": self.memory_used[idx],
            "num_of_processors": self.num_of_processors[idx],
            # .iloc keeps this positional like `commands`; plain [] would look up by index label.
            # as_tensor is a no-op for float32 tensors and converts nested lists.
            "embeddings": torch.as_tensor(self.embeddings.iloc[idx], dtype=torch.float32),
        }
        

In [None]:
# Inspect the frame reloaded from JSON: the target column, the available columns, and summary statistics.
print(df["MAX_MEM_USAGE_MB"])
print(df.columns)
print(df['MAX_MEM_USAGE_MB'].describe())

In [None]:
# Build the dataset from df_balanced (tensor-valued Embeddings), not from the `df` reloaded from JSON.
data = CommandDataset(df_balanced)
# Show one sample to check the dictionary layout.
print(data[0])

In [None]:
def split_dataset_with_labels(dataset, test_size=0.2, seed=None):
    """
    Randomly split a dataset into training and testing subsets.

    Args:
        dataset: Object supporting ``len()`` and integer indexing.
        test_size (float): Fraction of samples assigned to the test subset (0 to 1).
        seed (int, optional): When given, the split is drawn from a dedicated
            generator seeded with this value, so it is reproducible across runs.
            When None (default), the global torch random state is used.

    Returns:
        tuple: ``(train_dataset, test_dataset)`` as returned by
        ``torch.utils.data.random_split``.

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # Calculate the number of samples for train and test; the test subset takes
    # the remainder so the two sizes always add up to len(dataset).
    n_total = len(dataset)
    n_train = int(n_total * (1 - test_size))
    n_test = n_total - n_train

    # Only pass a generator when a seed was requested, keeping the default call unchanged.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)

    # Perform the random split
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test], **split_kwargs)

    print(f"Dataset split into {len(train_dataset)} training samples and {len(test_dataset)} testing samples")
    return train_dataset, test_dataset

# Split the dataset
# 80% of the samples go to training; the remainder goes to testing.
train_dataset, test_dataset = split_dataset_with_labels(data, test_size=0.2)

# Print the object types to verify
print(type(train_dataset))
print(type(test_dataset))


In [None]:
# Create a DataLoader for the training set
# batch_size equals the dataset length, so each pass over the loader yields one batch with every sample.
# shuffle=True means the sample order can differ between passes.
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
# Create a DataLoader for the testing set
# The test set is also served as one full batch, in a fixed order.
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

In [None]:
# Train a scikit-learn model
from sklearn.linear_model import LinearRegression

# extract the embeddings and memory used from the training set
# train_loader shuffles on every pass, so features and targets must be taken from the
# SAME pass; iterating the loader twice would pair each embedding with another row's target.
train_batches = list(train_loader)
X_train = torch.cat([batch["embeddings"] for batch in train_batches]).squeeze(1)
print(X_train.shape)
Y_train = torch.cat([batch["memory_used"] for batch in train_batches])
print(Y_train.shape)

#train the model
# LinearRegression here is the scikit-learn estimator imported at the top of this cell.
sk_model = LinearRegression()
sk_model.fit(X_train.numpy(), Y_train.numpy())  # Convert tensors to numpy arrays

#get weights and bias
sk_weights = sk_model.coef_
sk_bias = sk_model.intercept_


In [None]:
#prediction on the training set
# The model is evaluated on the data it was fitted on, so this measures fit quality, not generalisation.
Y_pred = sk_model.predict(X_train.numpy())
print(Y_pred[:10])
print(Y_train[:10])
# Calculate the Mean Squared Error
from sklearn.metrics import mean_squared_error
mse_train = mean_squared_error(Y_train.numpy(), Y_pred)
print(f"Training MSE: {mse_train}")
# Scatter of actual (x) against predicted (y) values.
plt.figure(figsize=(8, 6))
plt.scatter(Y_train, Y_pred, alpha=0.6, color='teal', edgecolor='k')
plt.xlabel("Actual Memory Used")
plt.ylabel("Predicted Memory Used")
plt.title("Sklearn Linear Regression: Predictions vs Actual on Training Set")
plt.tight_layout()
plt.show()

# Compare the mean of the targets with the mean of the predictions.
print(Y_train.mean())

print(Y_pred.mean())

In [None]:
# Shape check: collect the embeddings of every training batch.
all_batches = [batch["embeddings"] for batch in train_loader]
print(all_batches[0].shape)  # Access the shape of the first tensor directly

# Concatenating and squeezing dimension 1 gives the 2-D feature matrix used for sklearn above.
batches = torch.cat(all_batches).squeeze(1)  # Squeeze the second dimension after concatenation
print(batches.shape)

In [None]:
# Check that every sample has the same embedding width. Printing one line per row floods
# the output for a large dataset, so the distinct sizes are counted and summarised instead.
embedding_size_counts = {}
for i in range(len(data)):
    embedding_size = data[i]["embeddings"].size(1)
    embedding_size_counts[embedding_size] = embedding_size_counts.get(embedding_size, 0) + 1

for embedding_size, row_count in sorted(embedding_size_counts.items()):
    print(f"Embedding size {embedding_size}: {row_count} rows")

In [None]:
#Define a Linear Regression Model

class LinearRegression(nn.Module):
    """
    Single affine layer mapping ``input_dim`` features to ``output_dim`` outputs.

    Defining this class rebinds the name ``LinearRegression``, which earlier
    cells imported from scikit-learn.
    """

    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim (int): Number of input features.
            output_dim (int): Number of output values.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
#Instantiate the model
# The input width is read from dimension 1 of the first sample's embedding.
# NOTE: `model` is rebound here; until this cell it referred to the HuggingFace model used by get_embedding.
input_dim = data[0]["embeddings"].size(1)
output_dim = 1
model = LinearRegression(input_dim,output_dim)

#Define the loss function
criterion = nn.MSELoss()

#Define the optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(),lr = learning_rate)

In [None]:
#assign weights and bias to the model
# copy_ writes into the existing parameters and checks shapes, so the weight stays
# (1, input_dim) and the bias stays shape (1,). Rebinding `.data` to a 0-dim tensor
# would silently change the bias shape stored in the state dict.
with torch.no_grad():
    model.linear.weight.copy_(torch.as_tensor(sk_weights, dtype=torch.float32).reshape(1, -1))
    model.linear.bias.copy_(torch.as_tensor(sk_bias, dtype=torch.float32).reshape(1))

In [None]:
#Training the model
model.train()
losses = []
for epoch in range(50):
    for batch in train_loader:
        #Forward pass
        # Reshape embeddings to ensure they are 2D: other cells squeeze dimension 1 of the
        # collated embeddings, which suggests a (batch, 1, dim) layout. squeeze(1) is a no-op
        # if that dimension is not of size 1.
        features = batch["embeddings"].squeeze(1)
        # Drop the trailing output dimension so predictions match the target shape (batch,).
        # Without this, MSELoss broadcasts the two tensors against each other and averages
        # over every prediction/target pair instead of matching samples.
        pred_y = model(features).squeeze(-1)

        #Loss calculation
        loss = criterion(pred_y, batch["memory_used"])
        #append the loss
        losses.append(loss.item())

        #Backward pass
        #Clear the gradients
        optimizer.zero_grad()
        loss.backward()

        #Update the weights
        optimizer.step()
        print (f" Epoch { epoch} Loss: {loss.item()}")


In [None]:
# Initialize the model, loss function, and optimizer
# NOTE: this creates a fresh model and optimizer, discarding the parameters set or trained in the preceding cells.
input_dim = data[0]["embeddings"].size(1) # Size of the embeddings # getting the number of input features 
output_dim = 1 # asking for a single output value (memory used)
model = LinearRegression(input_dim, output_dim) # model is an instance of the LinearRegression class with the input and output dimensions
#Instantiate the loss function
criterion = nn.MSELoss()  # Mean Squared Error 

#Instantiate the optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


In [None]:
model.train()   # Set the model to training mode
loss_list = []  # store all the calculated loss values
for epoch in range(500):   # 500 epochs
    for batch in train_loader:

        # Forward pass: Compute predicted y by passing x to the model.
        # Other cells squeeze dimension 1 of the collated embeddings, which suggests a
        # (batch, 1, dim) layout; squeeze(1) is a no-op if that dimension is not of size 1.
        features = batch["embeddings"].squeeze(1)
        # Drop the trailing output dimension so predictions match the target shape (batch,).
        # Otherwise MSELoss broadcasts the tensors and mixes every prediction with every target.
        pred_y = model(features).squeeze(-1)      #calls the forward method of the model(an instance of the LinearRegression class)

        # Compute  loss
        loss = criterion(pred_y, batch["memory_used"])  # criterion is an instance of the MSELoss class
        # storing the calculated loss in a list
        loss_list.append(loss.item())

        # Zero gradients, perform a backward pass,
        # and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('epoch {}, loss {}'.format(epoch, loss.item()))

In [None]:
# Show what optimizer.zero_grad() does to the stored gradient of the weight.
print(model.linear.weight.grad)  # View gradients before zeroing
optimizer.zero_grad()
# NOTE(review): after zero_grad() this is presumably None or a zero tensor depending on the PyTorch version - confirm.
print(model.linear.weight.grad) 

In [None]:
#Get the weights and bias
# Reads the current parameter values of the linear layer for inspection.
weights = model.linear.weight.data
bias = model.linear.bias.data
print(f"Weights: {weights}")
print(f"Bias: {bias}")

In [None]:
# Print the stored gradient of every model parameter.
# NOTE(review): the label says "After step", but an earlier cell already called optimizer.zero_grad(),
# so these are not the gradients of the last training step when cells are run in order.
for name, param in model.named_parameters():
    print(f"After step - {name}: {param.grad}")

In [None]:
# save the model
# Only the state dict (parameter tensors) is written, not the model class itself.
torch.save(model.state_dict(), '/Users/dn10/Downloads/Bsub_dataset/model.pth') # pytorch function like pickle which saves an object to a file
print(model.state_dict())

In [None]:
import numpy 
# Load the saved model
model.load_state_dict(torch.load('/Users/dn10/Downloads/Bsub_dataset/model.pth'))
# Set the model to evaluation mode
model.eval()

# Initialize the loss function
criterion = nn.MSELoss()

# Evaluate the model on the test set
test_loss = 0
total_predictions = numpy.array([])
total_actual = numpy.array([])

with torch.no_grad():  # Disable gradient calculation
    for batch in test_loader:
        # Forward pass: Compute predicted y by passing x to the model.
        # Shapes are aligned BEFORE the loss is computed: squeeze(1) removes the singleton
        # dimension of the collated embeddings (no-op if absent) and squeeze(-1) drops the
        # output dimension, so predictions and targets are both (batch,). Computing the loss
        # on mismatched shapes makes MSELoss broadcast and report a meaningless value.
        pred_y = model(batch["embeddings"].squeeze(1)).squeeze(-1)

        # Compute loss
        loss = criterion(pred_y, batch["memory_used"])

        # Accumulate the loss
        test_loss += loss.item()
        print(f"Predicted: {pred_y}, Actual: {batch['memory_used']}")
        print(pred_y.shape)
        print(batch['memory_used'].shape)
        total_predictions = numpy.append(total_predictions, pred_y.numpy())
        total_actual = numpy.append(total_actual,batch['memory_used'].numpy())
# Average test loss (mean of the per-batch losses)
test_loss /= len(test_loader)
print(f'Test Loss: {test_loss:.4f}')
print(total_predictions)
print(total_actual)

In [None]:

# Plot a scatter plot for total prediction and total actual
# Predictions are on the x axis and actual values on the y axis (test set, collected in the previous cell).
plt.scatter(total_predictions, total_actual, alpha=0.5, s= 20)
plt.xlabel('Predicted memory used')
plt.ylabel('Actual memory used')
plt.title('Predictions vs Actual')
plt.show()