In [1]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
import datasets
train_df = datasets.load_from_disk("data/train")
test_df = datasets.load_from_disk("data/test")
validate_df = datasets.load_from_disk("data/validate")
validate_decomposed = datasets.load_from_disk("data/decomposed_validate")

In [3]:
import pandas as pd

def to_pandas(item):
  return pd.DataFrame(item['table']["rows"],columns=item['table']["header"])


item = train_df[919]
to_pandas(item)

Unnamed: 0,Class,Type,Fleet Numbers,Quantity Made,Date Made,1958 Cié,1958 Uta,Date Withdrawn
0,Pp,4 - 4 - 0,"12 , 25 , 42 - 46 , 50 , 70 - 71 , 74 - 77 , 1...",17,1896 - 1911,5,7,1957 - 1963
1,Pg,0 - 6 - 0,"10 - 11 , 78 , 100 - 103",7,1899 - 1904,0,7,1960 - 1964
2,Q,4 - 4 - 0,"120 - 125 , 130 - 136",13,1899 - 1904,5,4,1951 - 1963
3,Qg,0 - 6 - 0,152 - 155,4,1903 - 1904,4,0,1962 - 1963
4,P,4 - 4 - 0,"88 - 89 , 104 - 105",4,1904 - 1906,1,0,1956 - 1960
5,Ql,4 - 4 - 0,"24 , 113 - 114 , 126 - 128 , 156 - 157",8,1904 - 1910,0,3,1932 - 1960
6,Qgt,0 - 6 - 2T,98 - 99,2,1905,1,0,1957 - 1960
7,Lqg,0 - 6 - 0,"78 , 108 , 110 - 111 , 158 - 164",11,1906 - 1908,6,5,1958 - 1963
8,Rt,0 - 6 - 4T,"22 - 23 , 166 - 167",4,1908 - 1911,0,4,1958 - 1963
9,Nqg,0 - 6 - 0,"9 , 38 - 39 , 109 , 112",5,1911,2,3,1958 - 1963


In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_path = "omnitab"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
from typing import Dict
import pandas as pd

def tokenization_with_answer(examples):
    tables_query_answers = [[],[],[]]

    for table, query, summary in zip(examples['table'], examples['query'], examples['summary']):
        try:
            tables_query_answers[0].append(pd.DataFrame(table['rows'], columns=table['header']))
            tables_query_answers[1].append(query)
            tables_query_answers[2].append(summary)
        except:
            continue
    
    model_inputs = tokenizer(tables_query_answers[0], tables_query_answers[1], padding=True, truncation=True, max_length=1024)
    model_inputs['labels'] = tokenizer(answer=tables_query_answers[2], padding=True, truncation=True, max_length=128)["input_ids"] 
    
    return model_inputs

tokenized_dataset_train = train_df.map(tokenization_with_answer, batched=True)
tokenized_dataset_test = test_df.map(tokenization_with_answer, batched=True)

processed_data_train = tokenized_dataset_train.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])
processed_data_test = tokenized_dataset_test.remove_columns(['table','summary', 'row_ids', 'example_id', 'query'])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
processed_data_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

    rouge = evaluate.load('rouge')

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    return rouge_results

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model)

train_args = Seq2SeqTrainingArguments(
    output_dir="./train_weight_omnitab",
    learning_rate=3e-5,
    per_device_train_batch_size=22,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    weight_decay=0.01,
    save_total_limit=5,
    warmup_ratio=0.03,
    load_best_model_at_end=True,
    predict_with_generate=True,
    overwrite_output_dir= True,
    gradient_accumulation_steps = 2
)


trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
from datasets import concatenate_datasets
import nltk
import evaluate

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,No log,1.089619,0.439336,0.221206,0.334017,0.395413
2,No log,0.949296,0.491683,0.256243,0.377304,0.443681
4,No log,0.97132,0.504575,0.265964,0.39049,0.453104
6,No log,0.999437,0.499697,0.258538,0.385561,0.447707
8,No log,1.029502,0.498271,0.259015,0.385963,0.445935
9,No log,1.046886,0.49794,0.257966,0.384935,0.446054


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=450, training_loss=0.8654727511935764, metrics={'train_runtime': 3778.0564, 'train_samples_per_second': 5.294, 'train_steps_per_second': 0.119, 'total_flos': 3.717603609005261e+16, 'train_loss': 0.8654727511935764, 'epoch': 9.89})

In [10]:
validate_df = datasets.load_from_disk("data/validate")
validate_df_size = len(validate_df)
step_size = 3
num_batches = validate_df_size // step_size

valid = []

for i in range(num_batches):
    start_index = i * step_size
    end_index = (i + 1) * step_size
    valid.append(validate_df.select(range(start_index, end_index)))

# If there are remaining data points that don't fit into full batches of size 3
if validate_df_size % step_size != 0:
    remaining_data = validate_df_size % step_size
    valid.append(validate_df.select(range(validate_df_size - remaining_data, validate_df_size)))

In [11]:
import numpy as np
rougeL = []
bert = []
# bertscore = evaluate.load("bertscore")

for i in range(len(valid)):
    validate_df = valid[i].map(tokenization_with_answer, batched=True)
    predict_results = trainer.predict(validate_df, max_length = 1024)
    metrics = predict_results.metrics
    predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions = [pred.strip() for pred in predictions]
    
    # bert_score = bertscore.compute(predictions=predictions, references=valid[i]['summary'], lang = "en")
    rougeL.append(metrics['test_rougeLsum'])
    # bert.append(np.mean(bert_score['f1']))

sum(rougeL)/len(rougeL)

Map: 100%|██████████| 3/3 [00:00<00:00,  3.12 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 21.17 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 48.33 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 53.45 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  9.43 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 52.52 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  3.01 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  9.26 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 49.56 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 55.14 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 40.31 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 50.82 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 64.15 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 34.63 examples/s]


Map: 100%|██████████| 3/3 [00:01<00:00,  2.25 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  4.30 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 38.80 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 34.94 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 40.67 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.43 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.33 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 23.74 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 76.30 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.54 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 101.45 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 35.08 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 53.94 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 55.33 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 30.13 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 108.97 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 95.27 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 54.80 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 24.17 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 71.18 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 64.64 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 45.81 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 62.16 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.46 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.05 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 57.09 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 74.85 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 24.04 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 47.10 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 50.83 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 48.12 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  3.25 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 51.27 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 58.28 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 52.92 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 58.58 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 50.51 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 48.13 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 57.36 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 76.96 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 44.92 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 67.91 examples/s]


Map: 100%|██████████| 3/3 [00:01<00:00,  2.48 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 42.25 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  8.41 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 56.04 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  9.27 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 60.27 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 36.81 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 45.78 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00, 43.94 examples/s]


Map: 100%|██████████| 3/3 [00:00<00:00,  5.94 examples/s]


Map: 100%|██████████| 2/2 [00:00<00:00, 34.79 examples/s]


0.4427862668790004

In [16]:
trainer.save_model("omnitab")

In [13]:
validate_df = datasets.load_from_disk("data/validate")
tokenized_dataset_validate = validate_df.map(tokenization_with_answer, batched=True)
processed_data_validate = tokenized_dataset_validate.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query'])
from torch.utils.data import DataLoader

validation_dataloader = DataLoader(
    processed_data_validate,
    batch_size=8,  # You can adjust this
    collate_fn=data_collator
)

import torch
from tqdm import tqdm
import nltk
import evaluate
from datasets import concatenate_datasets

model.eval()
all_predictions = []

with torch.no_grad():
    for batch in tqdm(validation_dataloader):
        # Move batch to the same device as the model
        batch = {k: v.to(model.device) for k, v in batch.items()}
        
        # Generate predictions
        outputs = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=128  # Adjust as needed
        )
        
        # Decode predictions
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_predictions.extend(decoded_preds)

# Post-process predictions
processed_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in all_predictions]

# Get the reference summaries
reference_summaries = validate_df['summary']
processed_refs = ["\n".join(nltk.sent_tokenize(ref.strip())) for ref in reference_summaries]

# Compute metrics
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(predictions=processed_preds, references=processed_refs)

print(rouge_results)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

100%|██████████| 25/25 [01:06<00:00,  2.65s/it]


{'rouge1': 0.4900054344390593, 'rouge2': 0.25062032256417055, 'rougeL': 0.37390804497694863, 'rougeLsum': 0.4404926774885889}


In [14]:
validate_decomposed = datasets.load_from_disk("data/decomposed_validate")
tokenized_dataset_validate_decomposed = validate_decomposed.map(tokenization_with_answer, batched=True)
processed_data_validate_decomposed = tokenized_dataset_validate_decomposed.remove_columns(['table', 'summary', 'row_ids', 'example_id', 'query'])
from torch.utils.data import DataLoader

validation_dataloader = DataLoader(
    processed_data_validate_decomposed,
    batch_size=8,  # You can adjust this
    collate_fn=data_collator
)

import torch
from tqdm import tqdm

model.eval()
decomposed_predictions = []

with torch.no_grad():
    for batch in tqdm(validation_dataloader):
        # Move batch to the same device as the model
        batch = {k: v.to(model.device) for k, v in batch.items()}
        
        # Generate predictions
        outputs = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=128  # Adjust as needed
        )
        
        # Decode predictions
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decomposed_predictions.extend(decoded_preds)

# Post-process predictions
processed_preds_decomposed = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decomposed_predictions]

# Get the reference summaries
reference_summaries = validate_df['summary']
processed_refs = ["\n".join(nltk.sent_tokenize(ref.strip())) for ref in reference_summaries]

# Compute metrics
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(predictions=processed_preds_decomposed, references=processed_refs)

print(rouge_results)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

100%|██████████| 25/25 [00:31<00:00,  1.28s/it]


{'rouge1': 0.4929236370177368, 'rouge2': 0.2556142155252692, 'rougeL': 0.37678567786803, 'rougeLsum': 0.4437173395736421}


In [16]:
import numpy as np
bertscore = evaluate.load("bertscore")
bert = []
bert_score = bertscore.compute(predictions=processed_preds_decomposed, references=processed_refs, lang = "en")
bert.append(np.mean(bert_score['f1']))
np.average(bert)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.9015538358688354

In [17]:
bertscore = evaluate.load("bertscore")
bert = []
bert_score = bertscore.compute(predictions=processed_preds, references=processed_refs, lang = "en")
bert.append(np.mean(bert_score['f1']))
import numpy as np
np.average(bert)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.900832970738411

In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Initialize a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to calculate the number of tokens in a table's markdown format
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to calculate the number of tokens in a table's markdown format
def count_tokens_in_markdown(df):
    markdown = df.to_markdown(index=False)
    return len(tokenizer.tokenize(markdown))

# List to store differences in token counts for each entry
token_differences = []
token_dfs = []
token_decomposeds = []

# Assume both datasets have the same number of entries
for item_df, item_decomposed in zip(validate_df, validate_decomposed):
    # Convert to DataFrames
    df_1 = to_pandas(item_df)
    df_2 = to_pandas(item_decomposed)
    
    # Calculate token counts
    tokens_df = count_tokens_in_markdown(df_1)
    tokens_decomposed = count_tokens_in_markdown(df_2)
    
    # Calculate the difference and store it
    token_differences.append(abs(tokens_df - tokens_decomposed))
    
    token_dfs.append(tokens_df)
    token_decomposeds.append(tokens_decomposed)

# Calculate the average difference across all entries
average_difference = sum(token_differences) / len(token_differences)

print(f"Average difference in number of tokens across all entries: {average_difference}")
print(np.mean(token_dfs))
print(np.mean(token_decomposeds))


Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


Average difference in number of tokens across all entries: 39.98
450.875
416.095


In [26]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)

rouge_scores_decomposed = []
for pred, ref in zip(processed_preds_decomposed, processed_refs):
    score = scorer.score(ref, pred)
    rouge_scores_decomposed.append(score['rougeLsum'].fmeasure)


rouge_scores = []
for pred, ref in zip(processed_preds, processed_refs):
    score = scorer.score(ref, pred)
    rouge_scores.append(score['rougeLsum'].fmeasure)
    
print(np.average(rouge_scores_decomposed))
print(np.average(rouge_scores))

0.46026598461125856
0.45748969297451986


In [38]:
import random

random_indices = random.sample(range(200), 100)

for i in random_indices:
    example = validate_df[i]
    df = to_pandas(example)
    query = example['query']
    generated_summary = decomposed_predictions[i]
    expected_summary = reference_summaries[i]
    
    # Calculate ROUGE score
    score = scorer.score(expected_summary, generated_summary)


    # Print results
    print(f"Index: {i}")
    print(df.to_markdown())
    print(f"Query: {query}")
    print("\n")
    print(f"Generated Summary: {generated_summary}")
    print("\n")
    print(f"Expected Summary: {expected_summary}")
    print("--" * 50)
    print("\n")

Index: 129
|    | Team                        |   Wins |   Losses |   Percentage | Home   | Away   | Streak   |
|---:|:----------------------------|-------:|---------:|-------------:|:-------|:-------|:---------|
|  0 | Louisiana Swashbucklers     |     13 |        1 |        0.928 | 7 - 0  | 6 - 1  | Lost 1   |
|  1 | Corpus Christi Hammerheads  |     12 |        2 |        0.857 | 6 - 1  | 6 - 1  | Won 4    |
|  2 | Frisco Thunder              |      8 |        6 |        0.572 | 4 - 3  | 4 - 3  | Lost 3   |
|  3 | Odessa Roughnecks           |      8 |        6 |        0.572 | 5 - 2  | 3 - 4  | Won 2    |
|  4 | Katy Ruff Riders            |      7 |        7 |        0.5   | 4 - 3  | 3 - 4  | Won 1    |
|  5 | San Angelo Stampede Express |      4 |       10 |        0.286 | 3 - 4  | 1 - 6  | Lost 1   |
|  6 | Centex Barracudas           |      2 |       12 |        0.143 | 1 - 6  | 1 - 6  | Lost 3   |
|  7 | Alaska Wild                 |      2 |       12 |        0.143 | 2 - 5  |