In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import torch
import torch.nn as nn
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
sys.path.append('..')
import pickle
from typing import NamedTuple, Literal
import matplotlib.pyplot as plt
import os

In [1]:
class LLMPRegressionDescription(NamedTuple):
    x_context: torch.Tensor
    y_context: torch.Tensor
    x_target: torch.Tensor
    y_target: torch.Tensor
    knowledge: list[str] | tuple[str] | None
    num_total_points: int
    num_context_points: int

class TempData:
    def __init__(self, data: pd.DataFrame, max_num_context: int, device, random_state=42):
        self.data = data
        self.max_num_context = max_num_context
        print(data)
        # x_values = np.linspace(0, 2355, 288)
        # x_values = data.iloc[0][5:].values.astype('float32') # (288,)
        x_values = torch.from_numpy(data.columns.values[3:].astype(int)).unsqueeze(0)
        print(f"x_values= {x_values}")
        # assert x_values.shape[0] == 288
        self.x_values = x_values #torch.linspace(-2, 2, len(x_values), device=device).unsqueeze(0)
        
        y_values = data.iloc[1:, 3:].values
        y_desc = data.iloc[1:, 2].values
        y_values_train, y_values_temp, y_desc_train, y_desc_temp = train_test_split(
            y_values, y_desc, test_size=0.3, random_state=random_state
        )
        y_values_val, y_values_test, y_desc_val, y_desc_test = train_test_split(
            y_values_temp, y_desc_temp, test_size=0.5, random_state=random_state
        )

        self.y_values_train = torch.tensor(y_values_train).float().to(device)
        self.y_values_val = torch.tensor(y_values_val).float().to(device)
        self.y_values_test = torch.tensor(y_values_test).float().to(device)
        self.y_desc_train = y_desc_train
        self.y_desc_val = y_desc_val
        self.y_desc_test = y_desc_test

    def generate_batch(self, 
                       batch_size: int,
                       split: Literal['train', 'val', 'test'],
                       device: torch.device = torch.device('cpu'),
                       return_knowledge: bool = False,
                       num_context: None | int = None
                       ) -> LLMPRegressionDescription:
        # num_total_points = self.x_values.size(-1)
        num_total_points = 288
        if num_context is None:
            num_context = np.random.randint(low=1, high=self.max_num_context)
        else:
            assert isinstance(num_context, int) 
        num_target = num_total_points  # Using all points as target

        if split == 'train':
            selected_indices = np.random.choice(self.y_values_train.size(0), batch_size, replace=False)
            selected_y_values = self.y_values_train[selected_indices]  # Shape: [batch_size, num_points]

            knowledge = self.y_desc_train[selected_indices]
        elif split == 'val':
            selected_indices = np.random.choice(self.y_values_val.size(0), batch_size, replace=False)
            selected_y_values = self.y_values_val[selected_indices]  # Shape: [batch_size, num_points]

            knowledge = self.y_desc_val[selected_indices]

        elif split == 'test':
            selected_indices = np.random.choice(self.y_values_test.size(0), batch_size, replace=False)
            selected_y_values = self.y_values_test[selected_indices]  # Shape: [batch_size, num_points]

            knowledge = self.y_desc_test[selected_indices]
        else:
            raise ValueError("split must be one of ['train', 'val', 'test']")
        # Split into context and target sets
        context_indices = np.random.choice(num_total_points // 2, num_context, replace=False)

        x_context = self.x_values[:, context_indices].repeat(batch_size, 1)  # Shape: [batch_size, num_context]
        y_context = selected_y_values[:, context_indices]  # Shape: [batch_size, num_context]

        x_target = self.x_values.repeat(batch_size, 1)  # Shape: [batch_size, num_target]
        y_target = selected_y_values  # Shape: [batch_size, num_target]
        
        if return_knowledge:
            
            return LLMPRegressionDescription(
                x_context=x_context.unsqueeze(-1).to(device),  # Shape: [batch_size, num_context, x_size]
                y_context=y_context.unsqueeze(-1).to(device),  # Shape: [batch_size, num_context, y_size]
                x_target=x_target.unsqueeze(-1).to(device),    # Shape: [batch_size, num_target, x_size]
                y_target=y_target.unsqueeze(-1).to(device),    # Shape: [batch_size, num_target, y_size]
                knowledge=list(knowledge), # Shape/type: TODO
                num_total_points=num_total_points,
                num_context_points=num_context
            )

        else:
            
            return LLMPRegressionDescription(
                x_context=x_context.unsqueeze(-1).to(device),  # Shape: [batch_size, num_context, x_size]
                y_context=y_context.unsqueeze(-1).to(device),  # Shape: [batch_size, num_context, y_size]
                x_target=x_target.unsqueeze(-1).to(device),    # Shape: [batch_size, num_target, x_size]
                y_target=y_target.unsqueeze(-1).to(device),    # Shape: [batch_size, num_target, y_size]
                knowledge=None,
                num_total_points=num_total_points,
                num_context_points=num_context
            )

NameError: name 'NamedTuple' is not defined

In [None]:
def process_and_save_data(file_path, pkl_output_path, max_num_context=20, device='cpu', random_state=47, return_knowledge=False):

    data = pd.read_csv(file_path)
    
    temp_data = TempData(data=data, max_num_context=max_num_context, device=device, random_state=random_state)

    batch_size = 1
    num_context = 10
    test_num = 288
    
    llmp_description = temp_data.generate_batch(batch_size=batch_size, split='train', device=device, return_knowledge=return_knowledge, num_context=num_context)

    data_to_save = {
        'x_train': llmp_description.x_context.cpu().numpy().flatten(),
        'y_train': llmp_description.y_context.cpu().numpy().flatten(),
        'x_test': llmp_description.x_target.cpu().numpy().flatten()[144::6],
        'y_test': llmp_description.y_target.cpu().numpy().flatten()[144::6],
        'x_true': llmp_description.x_target.cpu().numpy().flatten(),
        'y_true': llmp_description.y_target.cpu().numpy().flatten(),
    }
    
    with open(pkl_output_path, 'wb') as handle:
        pickle.dump(data_to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    knowledge = None
    if return_knowledge:
        knowledge = llmp_description.knowledge
    
    return {
        'data': data_to_save,
        'knowledge': knowledge
    }

# Example usage
result = process_and_save_data('../data/kasia/data_with_desc.csv', 'random_sample_output_data.pkl', return_knowledge=True)

In [None]:
from src.plot import plot_samples, plot_images, plot_heatmap
from src.hf_api import get_model_and_tokenizer
from src.parse_args import parse_command_line
from src.compute_nll import compute_nll
from src.sample import sample
from src.prepare_data import prepare_data


In [None]:
from types import SimpleNamespace

args = SimpleNamespace(
    cfg=None,  # Assuming `ActionConfigFile` is a custom action, replace with the appropriate default if needed.
    mode='sample_logpy',
    experiment_name='test',
    # data_path='../data/functions/linear_25_seed_6.pkl',
    data_path='random_sample_output_data.pkl',
    llm_path=None,
    llm_type= "llama-2-7B", # "llama-2-70B",
    prompt_ordering='distance',
    output_dir='./output',
    plot_dir='./plots',
    seed=1,
    num_decimal_places_x=0,
    num_decimal_places_y=2,
    batch_size=5,
    autoregressive=True,
    prefix= 'sinusoidally cold, then hot',
    x_prefix='',
    y_prefix=', ',
    break_str='\n',
    sort_x_test=False,
    forecast=True,
    print_prompts=False,
    print_logprobs=False,
    num_samples=10,
    temperature=1.0,
    top_p=0.9,
    max_generated_length=7,
    y_min=None,
    y_max=None,
    plot_trajectories=5,
    specify_xy=False,
    xs=None,  # Assuming default is None when no arguments are provided
    ys=None,  # Assuming default is None when no arguments are provided
    xs_start=None,
    xs_end=None,
    num_xs=None,
    ys_start=None,
    ys_end=None,
    num_ys=None,
    mask_unused_tokens=True
)

In [6]:
import os
os.environ['HF_HOME'] = '/workspace/will/LLMP/hf_cache/' 
os.environ['HF_HUB_CACHE'] = '/workspace/will/LLMP/hf_cache/'

In [7]:
llm_map = {
    # "llama-2-7B": "meta-llama/Llama-2-7b",  Llama-2-7b-hf
    "llama-2-7B": "meta-llama/Llama-2-7b-chat-hf",
    
    "llama-2-70B": "meta-llama/Llama-2-70b-hf",
    # "llama-3-8B": "meta-llama/Meta-Llama-3-8B", meta-llama/Meta-Llama-3.1-8B
    "llama-3-8B": "meta-llama/Meta-Llama-3.1-8B", 
    # "llama-3-70B": "meta-llama/Meta-Llama-3-70B", meta-llama/Meta-Llama-3.1-70B
    "llama-3-70B": "meta-llama/Meta-Llama-3.1-70B",
    "mixtral-8x7B": "mistralai/Mixtral-8x7B-v0.1",
    "mixtral-8x7B-instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "phi-3-mini-128k-instruct": "microsoft/Phi-3-mini-128k-instruct",
}

In [None]:
# mistral_token = "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"
# llama_token = "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"
llama_chat_token = "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"
# llama3_token = "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"
# llama2_70b = "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"
# llama3_70b = "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"
# get the llm and asociated tokenizer
model, tokenizer = get_model_and_tokenizer(args.llm_path, args.llm_type, llama_chat_token)
# Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B", token=llama3_token, cache_dir=)
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B", token=llama3_token)

In [9]:
from accelerate import infer_auto_device_map

device_map = infer_auto_device_map(model, max_memory={0: "80GiB", 1: "80GiB",  "cpu": "250GiB"})
print(device_map)

OrderedDict({'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0, 'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0, 'model.layers.24': 0, 'model.layers.25': 0, 'model.layers.26': 0, 'model.layers.27': 0, 'model.layers.28': 0, 'model.layers.29': 0, 'model.layers.30': 0, 'model.layers.31': 0, 'model.layers.32': 0, 'model.layers.33': 0, 'model.layers.34': 0, 'model.layers.35': 0, 'model.layers.36': 0, 'model.layers.37': 0, 'model.layers.38': 0, 'model.layers.39': 0, 'model.layers.40': 0, 'model.layers.41': 0, 'model.layers.42': 0, 'model.layers.43': 0, 'mod

In [10]:
'''from src.run_llm_process import run_llm_process
run_llm_process(args=args, model=model, tokenizer=tokenizer)'''

'from src.run_llm_process import run_llm_process\nrun_llm_process(args=args, model=model, tokenizer=tokenizer)'

In [11]:
import random
from src.run_llm_process import run_llm_process

plt.style.use('./thesis.mplstyle')

def run_training_loop(data_file, output_dir, model, tokenizer, total_number=4):
    input_data = pd.read_csv(data_file)

    # Shuffle the indices and select a subset of them
    indices = list(input_data.index)
    random.shuffle(indices)
    selected_indices = indices[:total_number]

    for index in selected_indices:
        row = input_data.loc[index]
        pkl_output_path = os.path.join(output_dir, f'sample_{index}.pkl')
        
        # Process the data row and save it
        result = process_and_save_data(data_file, pkl_output_path, return_knowledge=True, random_state=random.randint(0, 10000))
        print(f"Processing row {index}:")
        print(result['knowledge'])

        args = SimpleNamespace(
            cfg=None,
            mode='sample_logpy',
            experiment_name='test',
            data_path=pkl_output_path,
            llm_path=None,
            llm_type="llama2_70b",
            prompt_ordering='distance',
            output_dir='./output',
            plot_dir='./plots',
            seed=1,
            num_decimal_places_x=0,
            num_decimal_places_y=1,
            batch_size=1,
            autoregressive=True,
            prefix= result['knowledge'], # change to knowledge # Predict the temperature throughout the day (mins, temp): 
            x_prefix='',
            y_prefix=', ',
            break_str='\n',
            sort_x_test=False,
            forecast=True,
            print_prompts=False,
            print_logprobs=False,
            num_samples=20,
            temperature=1.0,
            top_p=0.9,
            max_generated_length=5,
            y_min=None,
            y_max=None,
            plot_trajectories=0,
            specify_xy=False,
            xs=None,
            ys=None,
            xs_start=None,
            xs_end=None,
            num_xs=None,
            ys_start=None,
            ys_end=None,
            num_ys=None,
            mask_unused_tokens=True
        )

        run_llm_process(args=args, model=model, tokenizer=tokenizer)
        plt.show()

# Example call to the function
run_training_loop('../data/kasia/data_with_desc.csv', './loop_output_dir', model, tokenizer)

     LST_DATE  LST_DATE.1                                        description  \
0    20210101    20210101  The night will start off very chilly at around...   
1    20210102    20210102  The night will start off cold with temperature...   
2    20210103    20210103  The night will start off cold at -16.4°C and g...   
3    20210104    20210104  The night will start bitterly cold with temper...   
4    20210105    20210105  The night will be bitterly cold with temperatu...   
..        ...         ...                                                ...   
720  20221226    20221226  The night will be bitterly cold with temperatu...   
721  20221227    20221227  The night will start off cold with temperature...   
722  20221228    20221228  It's going to be a chilly day with temperature...   
723  20221229    20221229  The night will start off cold with temperature...   
724  20221230    20221230  The night will start off cold with temperature...   

        0     5    10    15    20    25

Sampling: 2it [11:13, 336.84s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 11.59 GiB. GPU 1 has a total capacity of 93.00 GiB of which 11.46 GiB is free. Process 3373751 has 81.54 GiB memory in use. Of the allocated memory 75.61 GiB is allocated by PyTorch, and 5.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import os
cwd = os.getcwd()
cwd

In [None]:
with open('../data/functions/beat_05_seed_9.pkl','rb') as f:
     contents = pickle.load(f)
# '../data/weather/weather_llm_proc_10.pkl'

In [None]:
print(contents)