In [None]:
%pip install transformers torch 

In [None]:

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pickle
import matplotlib.pyplot as plt


In [None]:
#from sentence_transformers import SentenceTransformer

import os

# Path to input and output
# NOTE: these are absolute, machine-specific paths; adjust them before running elsewhere.
input_path = "/Users/dn10/Downloads/Bsub_dataset/data.jsonl.gz"
output_path = "/Users/dn10/Downloads/Bsub_dataset/filtered_under_5GB.jsonl"

# Target size limit (in bytes) - 5GB
size_limit = 5 * 1024 * 1024 * 1024  # 5 GB

# Stream the gzipped JSONL in chunks, keep only rows whose 'Command' mentions
# #BSUB (case-insensitive), and append them to the output file until it reaches
# the size limit. The limit is checked once per chunk, so the final file can
# exceed it by up to one filtered chunk.
with open(output_path, "w") as output_file:
    # The chunked reader holds the gzip handle open; the context manager closes
    # it even when the loop exits early through `break`.
    with pd.read_json(input_path,
                      lines=True,
                      compression='gzip',
                      chunksize=100000) as reader:
        for chunk in reader:
            # Filter rows that contain #BSUB in 'Command' column
            filtered_chunk = chunk[chunk['Command'].str.contains('#BSUB', case=False, na=False)]

            # Write to file in JSONL format (nothing to write when no row matched)
            if not filtered_chunk.empty:
                filtered_chunk.to_json(output_file, orient='records', lines=True)

            # Flush Python's write buffer first: otherwise the size on disk lags
            # behind what has been written and the limit check under-reports.
            output_file.flush()
            current_size = os.path.getsize(output_path)
            print(f"Written so far: {round(current_size / (1024 ** 2), 2)} MB")

            if current_size >= size_limit:
                print("File size limit reached. Stopping.")
                break


In [None]:
# Load the filtered JSONL file (one JSON record per line) into a DataFrame
# and show how many rows it contains.
df = pd.read_json(output_path, lines=True)
len(df)

In [None]:
# Preview the first five rows of the filtered data.
df.head(5)

In [None]:
# Column dtypes and non-null counts (info() prints directly).
df.info()
# A notebook cell renders only its last expression, so the statistics table
# has to be displayed explicitly or it is silently discarded.
display(df.describe())
# Missing values per column (rendered as the cell's result).
df.isna().sum()

In [None]:
#%pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of MAX_MEM_USAGE_MB drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.boxplot(x=df['MAX_MEM_USAGE_MB'], ax=ax)
ax.set_xscale('log')  # Optional: helps if values vary a lot
ax.set_title("Boxplot of Memory Usage")
plt.show()

In [None]:
# Quartiles and upper-tail quantiles (90th to 99.9th percentile) of MAX_MEM_USAGE_MB.
df['MAX_MEM_USAGE_MB'].quantile([0.25, 0.5, 0.75, 0.90, 0.95, 0.99, 0.999])


In [None]:
def preprocess_data(df, column="MAX_MEM_USAGE_MB", min_mem_mb=1.0, quantile=0.99, bins=100, samples_per_bin=1000, random_state=42):
    """
    Trim both tails of ``column`` and down-sample it to a flatter distribution.

    Rows below ``min_mem_mb`` are dropped, then rows above the ``quantile``
    of the remaining values. What is left is cut into ``bins`` equal-width
    intervals over ``column`` and at most ``samples_per_bin`` rows are drawn
    from each non-empty interval.

    Args:
        df (pd.DataFrame): DataFrame containing the data. It is not modified.
        column (str): Numeric column to filter and bin on.
        min_mem_mb (float): Inclusive lower bound for ``column``.
        quantile (float): Quantile (computed after the lower-bound filter)
            used as the inclusive upper bound.
        bins (int): Number of equal-width bins passed to ``pd.cut``.
        samples_per_bin (int): Maximum number of rows kept per bin.
        random_state (int): Seed used for the draw inside every bin.

    Returns:
        pd.DataFrame: Sampled rows with the original columns and a fresh
        0..n-1 index, ordered bin by bin. Empty (same columns) when no row
        survives the filters.
    """
    # Lower bound; rows with NaN in `column` fail the comparison and are dropped too.
    filtered = df[df[column] >= min_mem_mb]

    # Upper bound from the quantile of the remaining values.
    upper_bound = filtered[column].quantile(quantile)
    filtered = filtered[filtered[column] <= upper_bound].reset_index(drop=True)

    # pd.cut raises on an empty array, so return the empty frame as-is.
    if filtered.empty:
        return filtered

    # Keep the bin labels in a separate Series instead of a temporary column:
    # nothing has to be dropped afterwards, and the result does not depend on
    # whether groupby.apply passes grouping columns through.
    bin_labels = pd.cut(filtered[column], bins=bins, duplicates='drop')

    # observed=True skips intervals that contain no rows.
    sampled_groups = [
        group.sample(n=min(len(group), samples_per_bin), random_state=random_state)
        for _, group in filtered.groupby(bin_labels, observed=True)
    ]

    return pd.concat(sampled_groups).reset_index(drop=True)

In [None]:
# Build the down-sampled frame used by the rest of the notebook
# (at most 1000 rows from each of 100 memory bins), then report its size and preview it.
df_balanced = preprocess_data(df, column="MAX_MEM_USAGE_MB", min_mem_mb=1.0, quantile=0.99, bins=100, samples_per_bin=1000, random_state=42)
print(f"Number of rows after preprocessing: {len(df_balanced)}")
df_balanced.head(5)

In [None]:
df_balanced['MAX_MEM_USAGE_MB'].hist(bins=50, edgecolor='black')
# Histogram of memory usage after preprocess_data (per-bin groupby, sampling, re-indexing).
plt.show()

In [None]:
# Row count of the down-sampled frame.
len(df_balanced)

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask tensor with one dimension fewer than the token
            embeddings; it is broadcast across the trailing dimension.

    Returns:
        Tensor of mask-weighted sums divided by the mask totals, with the
        totals clamped to at least 1e-9 so a fully masked row gives zeros
        rather than a division by zero.
    """
    hidden_states = model_output[0]  # First element of model_output contains all token embeddings
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return masked_total / weight_total

# Load model from HuggingFace Hub
# `tokenizer` and `model` are module-level globals that get_embedding reads.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-TinyBERT-L6-v2')
# Sanity check: tokenize the 'Command' value at index label 0 of the unsampled frame.
encoded_input = tokenizer(df['Command'][0], padding=True, truncation=True, return_tensors='pt')
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
# NOTE(review): `tokens` is computed but never shown; the print below displays
# `encoded_input[0]` instead — confirm which of the two was meant to be inspected.
print(encoded_input[0])




In [None]:
# Embed a single command string with the module-level tokenizer and model
def get_embedding(command):
    """Return the mean-pooled embedding of ``command``.

    The text is tokenized with truncation, passed through the global ``model``
    without gradient tracking, and the token embeddings are averaged by
    ``mean_pooling`` using the attention mask.

    Returns:
        The tensor produced by ``mean_pooling``. It is returned as-is, not
        flattened; presumably shape (1, hidden_size) for one string, which
        matches the ``emb[0]`` indexing in ``prepare_data`` (TODO confirm).
    """
    batch = tokenizer(command, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only; no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return pooled

In [None]:
# Embed every command, one row at a time, and store the result tensors in a new column
df_balanced["Embeddings"] = df_balanced["Command"].apply(get_embedding)


In [None]:
import pickle
# Cache df_balanced (including the Embeddings column) on disk as a pickle
with open('df_embeddings.pickle', 'wb') as pickle_out:
    pickle.dump(df_balanced, pickle_out)



In [None]:
# Restore df_balanced from the pickle cache.
# Unpickling executes arbitrary code, so only load files this notebook produced.
with open('df_embeddings.pickle', 'rb') as pickle_in:
    df_balanced = pickle.load(pickle_in)

In [None]:
# Plain-text preview of the first five rows of the reloaded frame.
print(df_balanced.head(5))

In [None]:
# Median of MAX_MEM_USAGE_MB in the down-sampled frame.
df_balanced.MAX_MEM_USAGE_MB.median()

In [None]:
# Number of rows whose MAX_MEM_USAGE_MB exceeds 10000.
(df_balanced['MAX_MEM_USAGE_MB'] > 10000).sum()

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def prepare_data(df, test_size=0.2, random_state=42):
    """Build the feature matrix and log-scaled target, then split them.

    Args:
        df: DataFrame with an "Embeddings" column, where the first row
            (``emb[0]``) of each entry is used as the feature vector, and a
            numeric "MAX_MEM_USAGE_MB" column.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.

    Returns:
        The ``train_test_split`` result for ``(X, y)``: X_train, X_test,
        y_train, y_test. ``X`` is float64 with one row per DataFrame row;
        ``y`` is ``log1p`` of the memory column.
    """
    def _first_row(emb):
        # Convert the whole vector at once instead of calling .item() on every scalar.
        row = emb[0]
        if isinstance(row, torch.Tensor):
            # detach/cpu so tensors that track gradients or live on a device convert cleanly.
            row = row.detach().cpu().numpy()
        # float64 matches what the per-element .item() conversion produced.
        return np.asarray(row, dtype=np.float64)

    X = np.stack([_first_row(emb) for emb in df["Embeddings"]])
    # log1p compresses the heavy right tail of memory usage; invert with expm1.
    y = np.log1p(df["MAX_MEM_USAGE_MB"].values)

    return train_test_split(X, y, test_size=test_size, random_state=random_state)
# 80/20 train/test split of the embedding features and log-scaled memory target.
X_train, X_test, y_train, y_test = prepare_data(df_balanced, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

In [None]:
from sklearn.linear_model import LinearRegression

def train_model(X_train, y_train):
    """Fit an ordinary least-squares LinearRegression on the training data and return it."""
    return LinearRegression().fit(X_train, y_train)
# Fit the regression on the training split.
# NOTE(review): this rebinds the global `model`, which until this line was the
# Hugging Face encoder that get_embedding calls. Re-running the embedding cell
# after this one will call the regressor instead — consider a distinct name.
model = train_model(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score, root_mean_squared_error
def evaluate_model(model, X, y, dataset_label=""):
    """Print R² and RMSE of ``model`` on ``(X, y)`` and return its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True target values, on the same scale the model was trained on.
        dataset_label: Prefix for the printed metric lines (e.g. "Train").

    Returns:
        The predictions ``model.predict(X)``.
    """
    predictions = model.predict(X)
    r2, rmse = r2_score(y, predictions), root_mean_squared_error(y, predictions)

    print(f"{dataset_label} R²: {r2:.4f}")
    print(f"{dataset_label} RMSE: {rmse:.4f}")

    return predictions
# Score the fitted model on both splits; metrics are on the log1p scale of the target.
y_train_pred = evaluate_model(model, X_train, y_train, "Train")
y_test_pred = evaluate_model(model, X_test, y_test, "Test")

In [None]:
def plot_results(y_true, y_pred, label=""):
    """Scatter predicted against actual values with a dashed identity line.

    Args:
        y_true: Actual target values (x-axis).
        y_pred: Predicted target values (y-axis).
        label: Name of the dataset, used in the legend and the title.
    """
    plt.figure(figsize=(6, 4))
    plt.scatter(y_true, y_pred, alpha=0.3, label=label)

    # Identity line across the range of actual values: points on it are perfect predictions.
    lowest, highest = min(y_true), max(y_true)
    plt.plot([lowest, highest], [lowest, highest], 'r--', label='Ideal')

    plt.xlabel("Actual (log)")
    plt.ylabel("Predicted (log)")
    plt.title(f"{label} - Predicted vs Actual")
    plt.legend()
    plt.tight_layout()
    plt.show()
# Predicted-vs-actual scatter for each split.
plot_results(y_train, y_train_pred, "Train")
plot_results(y_test, y_test_pred, "Test")

In [None]:
# Show the first few test predictions next to the true values, converted
# back from the log1p scale with expm1.
print("\nSample predictions (in MB):")
for heading, log_values in (("Predicted:", y_test_pred), ("Actual:   ", y_test)):
    print(heading, np.round(np.expm1(log_values[:5]), 2))