In [None]:
import os
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle as pkl

In [None]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import torch.nn.init as init
import json
import pickle
import pandas as pd

In [None]:
# Load the tokenizer files from the relative path ../input/tokenizer/.
tokenizer = AutoTokenizer.from_pretrained('../input/tokenizer/')

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Read the pickled token ids and convert them to nested Python lists.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# files from a trusted source.
# The torch.tensor(...) round-trip requires the unpickled object to be rectangular;
# presumably it is a collection of equal-length id sequences — TODO confirm.
with open('../input/token-ids-pkl-2/ids (1).pkl','rb') as f:
    input_ids = torch.tensor(pkl.load(f)).tolist()

In [None]:
import gc
# Force a garbage-collection pass; the cell output is the value gc.collect() returns.
gc.collect()

In [None]:
# Flatten the per-sequence id lists into one continuous token stream.
# Iterating input_ids directly avoids the needless full-list copy that the
# previous input_ids[:len(input_ids)] slice created.
token_list = []
for sequence in tqdm(input_ids):
    token_list.extend(sequence)

gc.collect()
len(token_list)

In [None]:
# Start from an empty frame that only declares the "input_ids" column; a later cell
# assigns the chunked token windows to it.
df = pd.DataFrame(columns=["input_ids"])
df

In [None]:
context_len = 64      ## Taking less because I have less data

# Cut the token stream into non-overlapping windows of exactly context_len tokens.
# The trailing partial window is dropped: the rest of the notebook pairs every row
# with a fixed-length attention mask and batches rows without padding the labels,
# so a single shorter row would break batching.
token_batch = []
full_span = len(token_list) - len(token_list) % context_len
for start in tqdm(range(0, full_span, context_len)):
    token_batch.append(token_list[start:start + context_len])

# Length of the last window (0 when there are not enough tokens for one window).
len(token_batch[-1]) if token_batch else 0

In [None]:
# Sanity check: number of chunks, then the length of the first chunk.
print(len(token_batch))
print(len(token_batch[0]))

In [None]:
# Store the chunks in the input_ids column (token_batch is a list of token-id
# lists, so one chunk per row is expected), then display the frame.
df["input_ids"] = token_batch
df

In [None]:
# No row contains padding, so every mask is all ones. Build each mask from the
# length of its own row instead of a hard-coded 64: the mask then always matches
# input_ids, and each row gets its own list rather than one shared list object.
attn_mask = [[1] * len(ids) for ids in df["input_ids"]]
df["attention_mask"] = attn_mask
# For causal-LM training the labels are the inputs themselves.
df['labels'] = df['input_ids']
df.head()

In [None]:
# !pip install datasets
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import pandas as pd

In [None]:
# import torch_xla
# import torch_xla.core.xla_model as xm


In [None]:
# Sequential (unshuffled) 90/10 split of the chunk table: the leading 90% of rows
# become the training set and the trailing 10% the evaluation set.
# (An earlier variant used Dataset.train_test_split(test_size=0.1) on the first
# 1000 rows instead.)
max_len = len(df)
df2 = df.iloc[:max_len]
train_size = int(0.9 * len(df2))

train_df = df2.iloc[:train_size]
eval_df = df2.iloc[train_size:]
print('split done')

# Wrap each pandas split in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df)
print('train converted')
eval_dataset = Dataset.from_pandas(eval_df)
print('test converted')

In [None]:
# Display the training Dataset summary.
train_dataset

In [None]:
# Write both splits to Parquet files, using paths relative to the working directory.
train_dataset.to_parquet("hi_dataset_token_train.parquet")
eval_dataset.to_parquet("hi_dataset_token_test.parquet")

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Earlier, smaller configuration kept for reference:
# config = LlamaConfig(hidden_size=256,
#                      vocab_size=len(tokenizer.vocab),
#                      num_attention_heads=4,
#                      num_key_value_heads=2,
#                      num_hidden_layers=12,
#                      intermediate_size=688,
#                      max_position_embeddings=64)

# Active configuration: 8 layers, 768-wide hidden state, 8 attention heads sharing
# 2 key/value heads, and a 64-position context.
# NOTE(review): vocab_size is hard-coded to 32000 — confirm it matches the tokenizer.
config = LlamaConfig(
    vocab_size=32000,
    hidden_size=768,
    intermediate_size=1024,
    num_hidden_layers=8,
    num_attention_heads=8,
    num_key_value_heads=2,
    max_position_embeddings=64,
)

print(config)
model_mis = LlamaForCausalLM(config)

# Place the model on the selected device (CUDA when available, otherwise CPU).
model_mis.to(device)

# Re-initialise every trainable tensor with more than one dimension using
# Xavier-uniform; 1-D parameters keep their default initialisation.
for _, weight in model_mis.named_parameters():
    if weight.requires_grad and weight.dim() > 1:
        init.xavier_uniform_(weight.data)

# Report the parameter count in millions.
total_param = sum(weight.numel() for _, weight in model_mis.named_parameters())
print(total_param/(10**6))

In [None]:
# Training hyper-parameters for the Trainer.
# fp16 mixed precision is only enabled when a CUDA device is present: the notebook
# falls back to CPU when no GPU is available, and requesting fp16 there makes
# TrainingArguments raise instead of training.
training_args = TrainingArguments(
    output_dir="./hi_model",
    overwrite_output_dir=True,
    num_train_epochs=10,
    logging_steps=500,
    learning_rate=2e-3,
    fp16=torch.cuda.is_available(),
    do_train=True,
    per_device_train_batch_size=64,
    save_steps=20000,       # checkpoint every 20k optimisation steps
    save_total_limit=2,     # keep only the two most recent checkpoints
    report_to="none",       # no external experiment tracker
)

In [None]:
# Wire the model, the training arguments and both dataset splits into a Trainer.
# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model_mis,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Reset the CUDA peak-memory counters and release cached blocks before training.
# Guarded so the cell is a no-op on CPU-only machines (the notebook otherwise
# supports a CPU fallback); reset_peak_memory_stats replaces the deprecated
# reset_max_memory_allocated.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

In [None]:
# torch.cuda.empty_cache()
# Keep the result of train(): the metrics cell below reads train_output.metrics,
# which was previously undefined because the return value was discarded.
train_output = trainer.train()
train_output

In [None]:
# Persist the training metrics as pretty-printed JSON.
metrics = train_output.metrics
with open("training_metrics.json", "w") as train_metrics_file:
    train_metrics_file.write(json.dumps(metrics, indent=4))

# Run evaluation on the held-out split and persist those metrics as well.
eval_metrics = trainer.evaluate()
with open("eval_metrics.json", "w") as eval_metrics_file:
    eval_metrics_file.write(json.dumps(eval_metrics, indent=4))

In [None]:
# Encode one hand-written Hindi sentence into the column layout the Trainer expects.
# (The former {'text': ...} assignment was dead code: it was overwritten on the
# very next line and never read.)
custom_input = "जब मैंने उसे देखा तो वह मंदिर जा रहा था"
input_dict = {'input_ids': [tokenizer.encode(custom_input)]}
input_dict

In [None]:
# Wrap the encoded sentence in a Dataset and run it through the Trainer.
custom_dataset = Dataset.from_dict(input_dict)
predictions = trainer.predict(custom_dataset)

# The prediction payload is expected to hold the logits; take the highest-scoring
# entry along axis 2 for every position.
generated_outputs = predictions.predictions
logits_tensor = torch.tensor(generated_outputs)
output_ids = logits_tensor.argmax(dim=2)

In [None]:
# Decode the predicted ids of the first sequence back into text.
tokenizer.decode(output_ids[0])

In [None]:
# Show the raw predicted token ids.
output_ids

In [None]:
import math

# Write the trained weights and tokenizer to disk before reloading them: no earlier
# cell creates 'trained_model', so on a fresh top-to-bottom run the load below
# would fail (or silently pick up a directory left over from an earlier session).
trainer.save_model('trained_model')
tokenizer.save_pretrained('trained_model')

model = AutoModelForCausalLM.from_pretrained('trained_model')
model.eval()  # inference only: make sure dropout and similar layers are disabled


def calculate_perplexity(text):
    """Return the perplexity the reloaded model assigns to ``text``.

    The text is tokenised with the notebook's ``tokenizer``, scored by ``model``
    with the token ids used as their own labels, and the resulting loss is
    exponentiated.

    Args:
        text: The string to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Keep the ids on the same device as the model so the forward pass also works
    # if the model is moved off the CPU.
    input_ids = inputs["input_ids"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    return math.exp(outputs.loss.item())


text = "जब मैंने उसे देखा तो वह मंदिर जा रहा था"
perplexity = calculate_perplexity(text)
print(f"Perplexity: {perplexity}")

In [None]:
# Save the trained model and the tokenizer side by side in ./trained_model so the
# directory can be passed to from_pretrained.
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")