In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dotenv import load_dotenv
import os
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from nltk import sent_tokenize
import nltk 

from vllm import LLM, SamplingParams
import os

# Download the NLTK 'punkt' resource (presumably the data that the imported
# `sent_tokenize` relies on — TODO confirm it is still needed in this notebook).
nltk.download('punkt')

In [1]:
import pandas as pd
# Read data/train_dataset_v1.parquet (path relative to the working directory) into `df`.
df = pd.read_parquet('data/train_dataset_v1.parquet')

In [2]:
# Display the frame as the cell's output; the notebook renders a truncated
# preview (first and last rows) rather than every row.
df

Unnamed: 0,text,generated
0,\nWhen considering the pros and cons of attend...,1.0
1,\nBefore making any decisions about getting in...,1.0
2,"\nRalph Waldo Emerson once said, ""Go confident...",1.0
3,\nHuman character traits are shaped by a wide ...,1.0
4,\nOnline classes have been increasingly popula...,1.0
...,...,...
11044,“In the old tunnel that runs from this place n...,0.0
11045,"'I have thought of something, dearest. Do as y...",0.0
11046,“‘That’s my brave girl. It’s better worth bein...,0.0
11047,“There’s something ever egotistical in mountai...,0.0


In [None]:
# Rebind `df` to the contents of data/slimpajama.parquet; whatever `df` held
# before this cell is replaced.
df = pd.read_parquet('data/slimpajama.parquet')

# Model identifier passed to the LLM constructor in a later cell. It is also
# embedded in the output path, so the '/' inside it produces a nested directory.
model = 'upstage/SOLAR-10.7B-Instruct-v1.0'
output_folder = f'data/generated/{model}'
# Create the output directory (including parents); no error if it already exists.
os.makedirs(output_folder, exist_ok=True)

In [None]:
import nltk
from nltk import sent_tokenize
nltk.download('punkt')

# ---------------------------------------------------------------------------
# Work out which rows of `df` still need a generated continuation.
#
# Every sub-folder of data/generated/ whose name contains 'pajama_gen' (and no
# '.') is scanned for files named `text_<id>.<ext>`; the `<id>` part is
# recorded as already generated. Rows whose `generated_id` is in that set are
# dropped from `df`, and the remaining ids are returned in random order.
#
# NOTE(review): the folder filter does not match `output_folder` as configured
# for the current model; files written there are only skipped by the
# per-file existence check in the generation loop — confirm this is intended.
# ---------------------------------------------------------------------------
generated_root = 'data/generated/'

folders = [
    f for f in os.listdir(generated_root)
    if 'pajama_gen' in f and '.' not in f
    # Guard against plain files without an extension slipping through.
    and os.path.isdir(os.path.join(generated_root, f))
]

generated_ids = {}
for folder in folders:
    # The names returned by os.listdir are relative to `generated_root`, so the
    # root must be joined back on before listing the folder's contents.
    folder_path = os.path.join(generated_root, folder)
    for fn in os.listdir(folder_path):
        if not fn.startswith('text_'):
            continue
        # 'text_<id>.<ext>' -> '<id>' (always a string).
        generated_ids[fn.split('_')[1].split('.')[0]] = True

# Ids parsed from file names are strings, so compare on the string form of
# `generated_id`; otherwise numeric ids would never match.
already_generated = df['generated_id'].astype(str).isin(list(generated_ids))
# `assign` returns a new frame, so re-running this cell on an already filtered
# `df` does not write into a slice of the previous frame.
df = df.assign(already_generated=already_generated)
df = df[~df['already_generated']]

# `permutation` returns a shuffled *copy*. Shuffling `df.generated_id.values`
# in place can reorder the column inside `df` itself (breaking the id -> text
# pairing) or fail outright when the array is read-only.
remaining_indexes = np.random.permutation(df['generated_id'].to_numpy())

print(len(remaining_indexes))

In [None]:
# Instantiate the vLLM engine for `model`, requesting float16 as the dtype.
llm = LLM(model=model, dtype=torch.float16)

In [None]:

# ---------------------------------------------------------------------------
# Generate a continuation for every id in `remaining_indexes`, one batch at a
# time, and write each result to `{output_folder}/text_{id}.txt`.
#
# For each id the source text is truncated to a random number of leading words
# and wrapped in an instruction prompt. One randomly drawn sampling
# configuration is shared by all prompts of a batch. Ids whose output file
# already exists are skipped, so the cell can be re-run after an interruption.
# ---------------------------------------------------------------------------
batch_size = 128

# The prompt prefix keeps the first k words of the text, with k drawn uniformly
# from [min_text_len, max_text_len).
min_text_len = 60
max_text_len = 150

max_tokens = [600, 700, 800, 900,]

mid_temp_range = (0.8, 1)
high_temp_range = (1.2, 1.5)

p_range = (0.8, 0.95)

mid_mp_range = (0.01, 0.02)  # defined for reference; no configuration below uses it
high_mp_range = (0.05, 0.07)

repetition_range = (1, 1.05)
frequency_range = (0, 0.1)
presence_range = (0, 1)


def _draw_sampling_params():
    """Return one randomly configured ``SamplingParams``.

    Two independent fair coin flips select uniformly among the four variants:
    (mid temperature + top_p) or (high temperature + min_p), combined with
    either a repetition penalty or a frequency penalty. Every variant also
    gets a random presence penalty and a random ``max_tokens`` budget.
    Values are converted to built-in ``int``/``float`` so no numpy scalars are
    handed to the engine.
    """
    kwargs = {
        'max_tokens': int(np.random.choice(max_tokens)),
        'presence_penalty': float(np.random.uniform(*presence_range)),
    }

    if np.random.randint(2):
        kwargs['temperature'] = float(np.random.uniform(*high_temp_range))
        kwargs['min_p'] = float(np.random.uniform(*high_mp_range))
    else:
        kwargs['temperature'] = float(np.random.uniform(*mid_temp_range))
        kwargs['top_p'] = float(np.random.uniform(*p_range))

    if np.random.randint(2):
        kwargs['frequency_penalty'] = float(np.random.uniform(*frequency_range))
    else:
        kwargs['repetition_penalty'] = float(np.random.uniform(*repetition_range))

    return SamplingParams(**kwargs)


# Build the id -> text lookup once instead of scanning the whole frame for
# every id. `drop_duplicates` keeps the first row per id, matching the
# "first match" behaviour of a boolean-mask lookup followed by `.values[0]`.
text_by_id = (
    df.drop_duplicates(subset='generated_id')
    .set_index('generated_id')['text']
    .to_dict()
)

for start in tqdm(range(0, len(remaining_indexes), batch_size)):
    prompts = []
    ids = []
    texts = []
    # Slicing clamps at the end of the array, so the final (partial) batch does
    # not index past the last element.
    for gen_id in remaining_indexes[start:start + batch_size]:
        if os.path.isfile(f'{output_folder}/text_{gen_id}.txt'):
            continue

        n_words = np.random.randint(min_text_len, max_text_len)
        text = ' '.join(text_by_id[gen_id].split(' ')[:n_words])
        prompt = f'Continue following text, generate response that is 500-600 words long: {text}'

        prompts.append(prompt)
        ids.append(gen_id)
        texts.append(text)

    # Every id of this batch already has an output file: nothing to generate.
    if not prompts:
        continue

    sampling_params = _draw_sampling_params()

    outputs = llm.generate(prompts, sampling_params)
    # Outputs are paired with ids/texts by position.
    for gen_id, output, text in zip(ids, outputs, texts):
        generated_text = output.outputs[0].text

        # Saved document = truncated source prefix + model continuation, with
        # any answer markers emitted by the model stripped out.
        new_text = text.strip() + ' ' + generated_text.strip()
        new_text = new_text.replace('### Answer:', '').replace('###Answer:', '')

        # Explicit encoding: the texts can contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(f'{output_folder}/text_{gen_id}.txt', 'w', encoding='utf-8') as file:
            file.write(new_text)
        