In [1]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast

In [2]:
# Input and output locations for this run.
#
# Alternative "subset" configuration (swap these values in to use it):
#   datapath   = './data/train.csv'
#   train_save = './data/subset_train.csv'
#   test_save  = './data/subset_test.csv'
#   val_save   = './data/subset_val.csv'
#
# NOTE(review): the active configuration writes the train split to
# './data/train.csv', which is the input path of the alternative configuration.

datapath = "filtered_data.csv"

train_save, val_save, test_save = (
    "./data/train.csv",
    "./data/val.csv",
    "./data/test.csv",
)


In [3]:
# Load the dataset selected in the configuration cell.
df = pd.read_csv(datapath)
if datapath == './data/train.csv':
    # Subset mode: draw a reproducible random sample of 10,000 rows.
    df = df.sample(n=10000, random_state=42)
    print("Sampled 10k random data points")
else:
    # Filtered-data mode: keep rows whose token counts fall inside the
    # inclusive ranges [128, 450] for abstracts and [8, 32] for titles.
    mask = (df['abstract_tokens'].between(128, 450)) & (df['title_tokens'].between(8, 32))
    df = df[mask]
    # A bare `df.shape` inside a branch is never displayed by Jupyter (only the
    # cell's last top-level expression is), so report the shape explicitly.
    print(f"Shape after token-length filtering: {df.shape}")

    # import matplotlib.pyplot as plt

    # # Plot histograms
    # plt.figure(figsize=(12, 6))

    # plt.subplot(1, 2, 1)
    # plt.hist(df['abstract_tokens'], bins=50, color='blue', alpha=0.7)
    # plt.title('Distribution of Abstract Tokens')
    # plt.xlabel('Number of Tokens')
    # plt.ylabel('Frequency')

    # plt.subplot(1, 2, 2)
    # plt.hist(df['title_tokens'], bins=50, color='green', alpha=0.7)
    # plt.title('Distribution of Title Tokens')
    # plt.xlabel('Number of Tokens')
    # plt.ylabel('Frequency')

    # plt.tight_layout()
    # plt.show()

In [4]:
# Remove the helper token-count columns now that filtering is done.
# errors="ignore" makes this a no-op for columns that are absent, without
# hiding unrelated failures the way a blanket `except Exception: pass` would.
df = df.drop(columns=["abstract_tokens", "title_tokens"], errors="ignore")

df.shape

(74488, 3)

# Cosine similarity trim

In [5]:
# Fit a single TF-IDF vocabulary over abstracts and titles together so that both
# kinds of text are embedded in the same feature space.
n_rows = len(df)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['abstract'].tolist() + df['title'].tolist())

# Abstracts were passed first, so they occupy the first n_rows rows of the
# matrix and titles occupy the remaining rows.
abstract_vectors = tfidf_matrix[:n_rows]
title_vectors = tfidf_matrix[n_rows:]

# TfidfVectorizer L2-normalises every row by default (norm='l2'), so the cosine
# similarity of each title/abstract pair is the row-wise dot product. Computing
# it in one sparse operation avoids one cosine_similarity call per row.
df["cosine_similarity"] = np.asarray(
    title_vectors.multiply(abstract_vectors).sum(axis=1)
).ravel()
df["cosine_similarity"].describe()

count    74488.000000
mean         0.410170
std          0.153956
min          0.000000
25%          0.297470
50%          0.408061
75%          0.519604
max          0.903722
Name: cosine_similarity, dtype: float64

### Keep only rows where cosine similarity is at least 0.8

In [6]:
# Keep only the rows whose title/abstract similarity reaches 0.8, then discard
# the helper score column so the frame returns to its original columns.
is_similar_enough = df["cosine_similarity"].ge(0.8)
df = df.loc[is_similar_enough].drop(columns="cosine_similarity")
# Optional: persist the trimmed frame.
# df.to_csv("trimmed_filtered_data.csv", index=False)
df.shape

(223, 3)

# Stratified split

In [7]:
def stratified_split_and_count(tokenized_df, min_freq=3, qcut=5, source_df=None):
    """Split rows 80/10/10 into train/val/test, stratified by vocabulary overlap.

    A row's vocabulary is the set of its title and abstract tokens that occur at
    least ``min_freq`` times across the whole frame. Each row is scored by the
    fraction of the global vocabulary it contains, the scores are binned into
    quantiles, and the bins are used as stratification labels for two
    successive ``train_test_split`` calls (20% held out, then halved).

    Parameters
    ----------
    tokenized_df : pandas.DataFrame
        Frame with ``tokenized_title`` and ``tokenized_abstract`` columns, each
        holding a list of tokens per row. It is not modified.
    min_freq : int, default 3
        Minimum corpus-wide count for a token to be part of any vocabulary.
    qcut : int, default 5
        Number of quantile bins requested for the overlap score.
    source_df : pandas.DataFrame, optional
        Frame whose rows are returned for each split; it must contain the index
        labels of ``tokenized_df``. When omitted, the notebook-level ``df`` is
        used, which is what earlier calls implicitly relied on.

    Returns
    -------
    dict
        ``train_df``, ``val_df``, ``test_df``: rows of ``source_df`` per split.
        ``val-train``, ``test-train``: number of vocabulary tokens present in
        the validation / test split but absent from the training split.
    """
    if source_df is None:
        # Backward compatibility: existing calls pass only tokenized_df and
        # expect slices of the notebook's global `df`.
        source_df = df

    # Work on a copy so the caller's frame does not gain helper columns.
    work = tokenized_df.copy()

    # Corpus-wide token counts over titles and abstracts.
    token_freq = Counter(
        tok
        for column in ("tokenized_title", "tokenized_abstract")
        for tokens in work[column]
        for tok in tokens
    )
    valid_tokens = {tok for tok, freq in token_freq.items() if freq >= min_freq}

    # Per-row vocabulary restricted to sufficiently frequent tokens.
    work["vocab"] = pd.Series(
        [
            {tok for tok in abstract + title if tok in valid_tokens}
            for abstract, title in zip(work["tokenized_abstract"], work["tokenized_title"])
        ],
        index=work.index,
        dtype=object,
    )

    global_vocab = set().union(*work["vocab"])

    # Every row vocabulary is a subset of the global one, so the overlap is
    # simply the row's share of the global vocabulary.
    if global_vocab:
        work["overlap"] = work["vocab"].map(len) / len(global_vocab)
    else:
        work["overlap"] = 0.0

    # Quantile-bin the overlap for stratification. duplicates="drop" merges
    # identical bin edges instead of raising when many rows share a score.
    work["overlap_bin"] = pd.qcut(
        work["overlap"], q=qcut, labels=False, duplicates="drop"
    )

    # 80% train, then split the remaining 20% evenly into validation and test.
    train_idx, temp_idx = train_test_split(
        work.index,
        test_size=0.2,
        stratify=work["overlap_bin"],
        random_state=42,
    )
    val_idx, test_idx = train_test_split(
        temp_idx,
        test_size=0.5,
        stratify=work.loc[temp_idx, "overlap_bin"],
        random_state=42,
    )

    # Vocabulary seen in each split, used to count tokens unseen during training.
    train_vocab = set().union(*work.loc[train_idx, "vocab"])
    val_vocab = set().union(*work.loc[val_idx, "vocab"])
    test_vocab = set().union(*work.loc[test_idx, "vocab"])

    return {
        "train_df": source_df.loc[train_idx],
        "val_df": source_df.loc[val_idx],
        "test_df": source_df.loc[test_idx],
        "val-train": len(val_vocab - train_vocab),
        "test-train": len(test_vocab - train_vocab),
    }

In [8]:
# Load the pretrained fast T5 tokenizer for the flan-t5-small checkpoint.
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-small")

# Tokenize every title and abstract; the result keeps df's index so the token
# lists stay aligned with their source rows.
tokenized_df = pd.DataFrame(
    {
        'tokenized_title': df['title'].map(tokenizer.tokenize),
        'tokenized_abstract': df['abstract'].map(tokenizer.tokenize),
    }
)

In [9]:
# Run the stratified split and report how many vocabulary tokens appear in the
# validation / test splits without appearing in the training split.
cache = stratified_split_and_count(tokenized_df)
print(f"Number of words that are in VALIDATION set but not in TRAIN set: {cache['val-train']}")
print(f"Number of words that are in TEST set but not in TRAIN set: {cache['test-train']}")

Number of words that are in VALIDATION set but no in TRAIN set: 69
Number of words that are in TEST set but not in TRAIN set: 63


# Saving the datasets

In [10]:
# Write each split to its configured CSV path and report the resulting sizes.
outputs = [
    ("Train", cache['train_df'], train_save),
    ("Validation", cache['val_df'], val_save),
    ("Test", cache['test_df'], test_save),
]

for _, split_df, save_path in outputs:
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    split_df.to_csv(save_path, index=False)

print("Size of train, val and test datasets:")
for label, split_df, save_path in outputs:
    print(f"{label}: {split_df.shape} saved to {save_path}")

Size of train, val and test datasets:
Train: (178, 3) saved to ./data/train.csv
Validation: (22, 3) saved to ./data/val.csv
Test: (23, 3) saved to ./data/test.csv
