In [4]:
import json

def read_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Every non-blank line is parsed with ``json.loads``. A line that is not
    valid JSON is reported on stdout together with its 1-based line number and
    then skipped, so a single corrupt record does not abort the whole load.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        list: Decoded objects in file order, without the skipped lines.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (typically a trailing newline at the end of the
            # file) hold no record; skip them instead of reporting an error.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
    return data


In [53]:
import pandas as pd
from ast import literal_eval

def format_page_dataset(json_data):
    """Flatten documents into a sentence-level frame with a binary evidence label.

    For every document, the sentence ids listed under
    ``document['meta']['evidences']`` are collected; each sentence of the
    document then becomes one row whose ``label`` is 1 when its
    ``sentence_id`` is among the collected ids and 0 otherwise.

    Args:
        json_data: Iterable of document dicts exposing ``document_id``,
            ``meta.evidences`` and ``sentences`` (each sentence with ``text``,
            ``sentence_id`` and ``page_idx``).

    Returns:
        pandas.DataFrame: Columns ``sentences``, ``label``, ``sentence_id``,
        ``document_id`` and ``page_id``, one row per sentence in input order.
    """
    sentences = []
    label = []
    sentences_id = []
    document_id = []
    page_id = []

    for document in json_data:
        # A set keeps the per-sentence membership test O(1); a list made the
        # labelling cost grow with (#sentences x #evidence ids) per document.
        evidence_sentence_ids = set()
        for evidence in document['meta']['evidences']:
            # NOTE(review): only the first element of each evidence entry
            # contributes sentence ids - confirm this is intended.
            evidence_sentence_ids.update(evidence[0]['sentence_ids'])

        for sentence in document["sentences"]:
            sentences.append(sentence['text'])
            label.append(int(sentence['sentence_id'] in evidence_sentence_ids))
            sentences_id.append(sentence['sentence_id'])
            document_id.append(document['document_id'])
            page_id.append(sentence['page_idx'])

    return pd.DataFrame({
        'sentences': sentences,
        'label': label,
        'sentence_id': sentences_id,
        'document_id': document_id,
        'page_id': page_id
    })
        

In [57]:
import os

# Build the paths with os.path.join so the cell runs on any operating system;
# on Windows this yields exactly the same backslash-separated strings as before.
folder_path = os.path.join("data", "lobbymap", "lobbymap_dataset")

# train
file_path = os.path.join(folder_path, "train.jsonl")
jsonl_train = read_jsonl(file_path)
df_train = format_page_dataset(jsonl_train)

# test
file_path = os.path.join(folder_path, "test.jsonl")
jsonl_test = read_jsonl(file_path)
df_test = format_page_dataset(jsonl_test)

# validation (kept under the `dev` names used by the rest of the notebook)
file_path = os.path.join(folder_path, "valid.jsonl")
jsonl_dev = read_jsonl(file_path)
df_dev = format_page_dataset(jsonl_dev)

In [66]:
# Baseline training: TF-IDF features with logistic regression, plus dummy classifiers

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.pipeline import make_pipeline

def train_baselines(X_train, y_train, seed, model="tfidf + LogReg"):
    """Fit one of the single-label baseline classifiers.

    Args:
        X_train: Training inputs; must expose ``.shape`` (e.g. a pandas Series
            of texts).
        y_train: Training labels, one per sample.
        seed: Random state passed to every estimator that accepts one.
        model: Which baseline to fit: ``"tfidf + LogReg"``, ``"random"`` or
            ``"majority"``.

    Returns:
        The fitted scikit-learn pipeline.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` differ in length.
        KeyError: If ``model`` is not one of the supported names.
    """
    if X_train.shape[0] != len(y_train):
        raise ValueError("X_train and y_train should have the same number of samples.")

    pipelines = {
        "tfidf + LogReg": make_pipeline(TfidfVectorizer(), LogisticRegression(class_weight='balanced', max_iter=1000, random_state=seed)),
        # The dummy baselines receive the seed as well: the "uniform" strategy
        # draws random predictions, so without it results are not reproducible.
        "random": make_pipeline(DummyClassifier(strategy="uniform", random_state=seed)),
        "majority": make_pipeline(DummyClassifier(strategy="most_frequent", random_state=seed))
    }

    pipe = pipelines[model]
    pipe.fit(X=X_train, y=y_train)

    return pipe


In [67]:
# Sentence-level task: inputs are the sentence texts, targets are the 0/1
# evidence labels stored in the `label` column.
X_train, y_train = df_train['sentences'], df_train['label']

pipe_page = train_baselines(X_train, y_train, seed=42)

In [68]:
# Conventional evaluation of the sentence-level classifier on the test split.
from sklearn.metrics import classification_report

X_test, y_test = df_test['sentences'], df_test['label']
y_pred = pipe_page.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      0.80      0.88    429503
           1       0.14      0.74      0.23     18760

    accuracy                           0.80    448263
   macro avg       0.56      0.77      0.56    448263
weighted avg       0.95      0.80      0.86    448263



In [158]:
def format_query_dataset(json_data):
    """Flatten document-level evidences into one row per (evidence, page).

    Each entry of ``document['evidences']`` contributes its ``query`` and
    ``stance``; its ``page_indices`` list is exploded so that every page index
    gets its own row.

    Args:
        json_data: Iterable of document dicts exposing ``document_id`` and
            ``evidences`` (each with ``page_indices``, ``query``, ``stance``).

    Returns:
        pandas.DataFrame: Columns ``page_id``, ``query``, ``stance`` and
        ``document_id``. The index restarts for every document and repeats for
        rows exploded from the same evidence. An empty DataFrame is returned
        when ``json_data`` holds no documents.
    """
    frames = []

    for document in json_data:
        evidences = list(document['evidences'])

        doc_frame = pd.DataFrame({
            'page_id': [evidence['page_indices'] for evidence in evidences],
            "query": [evidence['query'] for evidence in evidences],
            "stance": [evidence['stance'] for evidence in evidences]
        })
        doc_frame['document_id'] = document['document_id']
        frames.append(doc_frame.explode('page_id'))

    # Concatenate once at the end: growing a frame with pd.concat inside the
    # loop re-copies all accumulated rows on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [159]:
# Page-level (query, stance) annotations for the train / test / dev splits.
df_query_train, df_query_test, df_query_dev = (
    format_query_dataset(split) for split in (jsonl_train, jsonl_test, jsonl_dev)
)

In [160]:
# Order every split by sentence_id so that the per-page join performed later
# concatenates sentences in id order.
# NOTE(review): the sort is in place and reorders the rows of the global
# frames (index labels are kept). Arrays computed from these frames before
# this cell, such as model predictions, no longer line up positionally with
# the rows afterwards.
df_train.sort_values(by=['sentence_id'], inplace=True)
df_test.sort_values(by=['sentence_id'], inplace=True)
df_dev.sort_values(by=['sentence_id'], inplace=True)

In [161]:
# Rebuild page-level texts: within each (document_id, page_id) group the
# sentences are joined with single spaces, in their current row order.
page_train, page_test, page_dev = (
    split.groupby(['document_id', 'page_id'])['sentences'].apply(' '.join)
    for split in (df_train, df_test, df_dev)
)

In [162]:
# Move the (document_id, page_id) group keys out of the index into columns.
page_train, page_test, page_dev = (
    pages.reset_index() for pages in (page_train, page_test, page_dev)
)

In [163]:
# Attach the (query, stance) annotations to each page. The left join keeps
# pages without any annotation (their query/stance become NaN) and repeats a
# page once per annotation when it has several.
page_train, page_test, page_dev = (
    pages.merge(queries, how="left", on=["document_id", "page_id"])
    for pages, queries in (
        (page_train, df_query_train),
        (page_test, df_query_test),
        (page_dev, df_query_dev),
    )
)

In [125]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

def train_baselines_multilabel(X_train, y_train, seed, model="tfidf + LogReg"):
    """Fit a multi-label baseline and return it with its label binarizer.

    ``y_train`` holds one collection of labels per sample; it is converted to
    a binary indicator matrix with a ``MultiLabelBinarizer`` before fitting.

    Args:
        X_train: Training inputs; must expose ``.shape``.
        y_train: One iterable of labels per training sample.
        seed: Random state handed to the estimators.
        model: ``"tfidf + LogReg"``, ``"random"`` or ``"majority"``.

    Returns:
        tuple: ``(fitted_pipeline, fitted_binarizer)``.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` differ in length.
    """
    n_samples = X_train.shape[0]
    if n_samples != len(y_train):
        raise ValueError("X_train and y_train should have the same number of samples.")

    # One independent logistic regression is trained per label column.
    per_label_logreg = MultiOutputClassifier(
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=seed)
    )
    candidates = {
        "tfidf + LogReg": make_pipeline(TfidfVectorizer(), per_label_logreg),
        "random": make_pipeline(DummyClassifier(strategy="uniform", random_state=seed)),
        "majority": make_pipeline(DummyClassifier(strategy="most_frequent", random_state=seed)),
    }

    binarizer = MultiLabelBinarizer()
    indicator_targets = binarizer.fit_transform(y_train)

    chosen = candidates[model]
    chosen.fit(X=X_train, y=indicator_targets)

    return chosen, binarizer

In [164]:
# Multi-label training set: one row per annotated page, with all of the
# page's queries gathered into a list.
query_ds = (
    page_train[page_train['query'].notna()]
    .groupby(['document_id', 'page_id', 'sentences'])['query']
    .apply(list)
    .reset_index()
)

In [127]:
# Query classification: page text in, list of queries out.
X_train, y_train = query_ds['sentences'], query_ds['query']

pipe_query, mlb = train_baselines_multilabel(X_train, y_train, seed=42)

In [165]:
# Test counterpart of `query_ds`: annotated test pages with their query lists.
query_ds_test = (
    page_test[page_test['query'].notna()]
    .groupby(['document_id', 'page_id', 'sentences'])['query']
    .apply(list)
    .reset_index()
)

In [146]:
# Conventional evaluation of the query classifier on annotated test pages.
from sklearn.metrics import classification_report

X_test, y_test = query_ds_test['sentences'], query_ds_test['query']

# Gold labels are binarized with the binarizer fitted on the training split,
# so they are comparable with the indicator matrix the pipeline predicts.
y_test_bin = mlb.transform(y_test)
y_pred_query = pipe_query.predict(X_test)

print(classification_report(y_test_bin, y_pred_query, zero_division=0.0))

              precision    recall  f1-score   support

           0       0.31      0.64      0.42       364
           1       0.34      0.56      0.42        87
           2       0.37      0.71      0.49       149
           3       0.52      0.74      0.61        47
           4       0.54      0.47      0.50       226
           5       0.55      0.52      0.53       197
           6       0.83      0.84      0.84      2260
           7       0.50      0.65      0.57       399
           8       0.51      0.57      0.54        49
           9       0.41      0.66      0.51       290
          10       0.29      0.70      0.41       172
          11       0.33      0.62      0.43       345
          12       0.14      0.36      0.21        44

   micro avg       0.55      0.73      0.63      4629
   macro avg       0.43      0.62      0.50      4629
weighted avg       0.61      0.73      0.65      4629
 samples avg       0.62      0.75      0.65      4629



In [187]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def train_baselines_query_onehot(X_train, y_train, seed, model="tfidf + LogReg"):
    """Fit a baseline that sees both the page text and the query.

    The ``"tfidf + LogReg"`` pipeline vectorizes the ``sentences`` column with
    TF-IDF, one-hot encodes the ``query`` column, and feeds both to a logistic
    regression. The dummy pipelines are fitted on the raw inputs.

    Args:
        X_train: DataFrame with ``sentences`` and ``query`` columns.
        y_train: Target labels, one per row of ``X_train``.
        seed: Random state handed to the estimators.
        model: ``"tfidf + LogReg"``, ``"random"`` or ``"majority"``.

    Returns:
        The fitted scikit-learn pipeline.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` differ in length.
        KeyError: If ``model`` is not one of the supported names.
    """
    # Same up-front check as the other baseline trainers in this notebook.
    if X_train.shape[0] != len(y_train):
        raise ValueError("X_train and y_train should have the same number of samples.")

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(), 'sentences'),
            # handle_unknown='ignore' encodes a query never seen during
            # fitting as all zeros instead of raising at prediction time.
            ('query', OneHotEncoder(handle_unknown='ignore'), ['query'])
        ]
    )

    pipelines = {
        "tfidf + LogReg": make_pipeline(preprocessor, LogisticRegression(class_weight='balanced', max_iter=1000, random_state=seed)),
        "random": make_pipeline(DummyClassifier(strategy="uniform", random_state=seed)),
        "majority": make_pipeline(DummyClassifier(strategy="most_frequent", random_state=seed))
    }

    pipe = pipelines[model]
    pipe.fit(X=X_train, y=y_train)

    return pipe

In [169]:
# Stance training set: every (page, query) pair that carries an annotation.
stance_ds = page_train[page_train['query'].notna()]

In [188]:
# Stance classification: (page text, query) in, stance label out.
X_train, y_train = stance_ds[['sentences', 'query']], stance_ds['stance']

pipe_stance = train_baselines_query_onehot(X_train, y_train, seed=42)

In [204]:
# Evaluate the stance classifier on annotated test (page, query) pairs, using
# the gold query as input. The copy lets a prediction column be added to this
# frame later without touching `page_test`.
stance_ds_test = page_test[page_test['query'].notna()].copy()

X_test, y_test_stance = stance_ds_test[['sentences', 'query']], stance_ds_test['stance']
y_pred_stance = pipe_stance.predict(X_test)

print(classification_report(y_test_stance, y_pred_stance, zero_division=0))

                               precision    recall  f1-score   support

no_position_or_mixed_position       0.43      0.43      0.43      1031
               not_supporting       0.45      0.47      0.46       930
                     opposing       0.30      0.37      0.33       468
          strongly_supporting       0.45      0.53      0.49       729
                   supporting       0.58      0.48      0.52      1560

                     accuracy                           0.46      4718
                    macro avg       0.44      0.46      0.45      4718
                 weighted avg       0.47      0.46      0.47      4718



In [206]:
# `y_pred` was computed before `df_test` was re-sorted in place by
# sentence_id, so assigning that array positionally would pair predictions
# with the wrong sentences. Predict from the frame's current row order instead.
df_test['y_pred'] = pipe_page.predict(df_test['sentences'])
# These two frames have not been reordered since their predictions were made,
# so positional assignment is aligned.
query_ds_test['y_pred_query'] = mlb.inverse_transform(y_pred_query)
stance_ds_test['y_pred_stance'] = y_pred_stance

In [319]:
# Page-level relevance label: True when the page has at least one annotated
# query. `notna().any()` states this directly; the previous `~x.isna().all()`
# only works while `.all()` returns a NumPy boolean, because `~` applied to a
# plain Python bool gives -1 / -2, which are both truthy.
train_page_idx_classifier = page_train.groupby(['document_id', 'page_id', 'sentences'])['query'].agg(lambda x: x.notna().any()).reset_index()
test_page_idx_classifier = page_test.groupby(['document_id', 'page_id', 'sentences'])['query'].agg(lambda x: x.notna().any()).reset_index()

In [322]:
# Full-page learning: classify whole page texts as annotated / not annotated.
# The boolean target lives in the `query` column of the aggregated frame.
X_train, y_train = train_page_idx_classifier['sentences'], train_page_idx_classifier['query']

pipe_page_entire = train_baselines(X_train, y_train, seed=42)

In [325]:
# Evaluate the full-page relevance classifier on every test page.
X_test_entire, y_test_entire = (
    test_page_idx_classifier['sentences'],
    test_page_idx_classifier['query'],
)
y_pred_entire = pipe_page_entire.predict(X_test_entire)

print(classification_report(y_test_entire, y_pred_entire, zero_division=0))

              precision    recall  f1-score   support

       False       0.92      0.78      0.85     11938
        True       0.54      0.78      0.64      3817

    accuracy                           0.78     15755
   macro avg       0.73      0.78      0.74     15755
weighted avg       0.83      0.78      0.80     15755



In [327]:
# Store the page-level predictions next to the pages they were computed from
# (same frame and row order as the inputs given to `pipe_page_entire`).
test_page_idx_classifier['y_pred_entire'] = y_pred_entire

# Eval

In [219]:
# Sentence-based page detection: a page is flagged when more than one of its
# sentences was predicted positive (`sum(x) > 1`).
# NOTE(review): `detected_pages` is reassigned from `test_page_idx_classifier`
# in a later cell, so this result is discarded when the notebook runs top to
# bottom. Also confirm that the threshold `> 1` (rather than `> 0`) is intended.
detected_pages = df_test.groupby(['document_id', 'page_id'])['y_pred'].apply(lambda x: sum(x)>1).reset_index()

In [331]:
# Page detection taken from the full-page classifier: keep only the page keys
# and the predicted flag. The copy detaches it from `test_page_idx_classifier`.
detected_pages = test_page_idx_classifier[['document_id', 'page_id', 'y_pred_entire']].copy()

In [332]:
# Attach the page-level detection flag to every row of `page_test`, joined on
# the page keys. `suffixes` only takes effect if the two frames share a
# non-key column name.
detected_page_test = page_test.merge(detected_pages, how='left', on=['document_id', 'page_id'], suffixes=("","_page"))

In [333]:
# Predict queries for every row of the merged test frame; the detection flag
# is not applied here (filtering on it happens when `pred_triplet` is built).
X_detected = detected_page_test['sentences']

# Binary indicator matrix, one column per query class known to `mlb`.
y_q = pipe_query.predict(X_detected)

In [334]:
# Turn the indicator matrix back into per-row tuples of query names.
detected_page_test['y_pred_query'] = mlb.inverse_transform(y_q)

In [335]:
# One row per predicted query. Rows whose prediction tuple is empty are kept
# with a missing `y_pred_query`, which later cells filter out. Index labels
# repeat for rows that came from the same source row.
exploded_query = detected_page_test.explode('y_pred_query')

In [336]:
# Stance model input: rows that received a predicted query, with that
# prediction exposed under the `query` column name the stance pipeline expects.
X_detected = (
    exploded_query[exploded_query['y_pred_query'].notna()][['sentences', 'y_pred_query']]
    .rename(columns={"y_pred_query": "query"})
)

y_s = pipe_stance.predict(X_detected)

In [337]:
# Write the stance predictions back onto exactly the rows they were computed
# for (same mask, same order); rows without a predicted query are not assigned.
exploded_query.loc[~exploded_query['y_pred_query'].isna(), 'y_pred_stance'] = y_s

In [338]:
# Gold (document, query, stance) triplets with the list of pages supporting
# each: deduplicate the gold columns, drop pages without annotation, then
# gather page ids per triplet.
gold_triplet = (
    exploded_query[['document_id', 'page_id', 'query', 'stance']]
    .drop_duplicates(keep="first")
    .explode('query')
    .dropna(subset=['query'])
    .groupby(by=['document_id', 'query', 'stance'])['page_id']
    .agg(list)
    .reset_index()
)

In [341]:
# Expose the page-level detection flag under the generic name `y_pred`, which
# the `pred_triplet` cell reads. This renames in place on the global frame.
exploded_query.rename(columns={"y_pred_entire":"y_pred"},inplace=True)

In [342]:
# Predicted (document, query, stance) triplets with their pages: keep only
# pages flagged by the page detector, deduplicate, drop rows without a
# predicted query, then gather page ids per triplet.
pred_triplet = (
    exploded_query[exploded_query['y_pred']]
    [['document_id', 'page_id', 'y_pred_query', 'y_pred_stance']]
    .drop_duplicates(keep="first")
    .dropna(subset=['y_pred_query'])
    .groupby(by=['document_id', 'y_pred_query', 'y_pred_stance'])['page_id']
    .agg(list)
    .reset_index()
)

In [343]:
def _triplets_to_jds(triplets, query_col, stance_col):
    """Convert a triplet frame into the per-document structure used for scoring.

    Args:
        triplets: DataFrame with ``document_id``, ``page_id`` (list of pages)
            and the two columns named by ``query_col`` / ``stance_col``.
        query_col: Name of the column holding the query.
        stance_col: Name of the column holding the stance.

    Returns:
        list: One dict per document, in order of first appearance, of the form
        ``{'document_id': ..., 'evidences': [{'query', 'stance',
        'page_indices'}, ...]}``.
    """
    jds = []
    for doc_id in triplets['document_id'].unique():
        doc_rows = triplets[triplets['document_id'] == doc_id]
        evidences = [
            {"query": query, "stance": stance, "page_indices": pages}
            for query, stance, pages in zip(
                doc_rows[query_col], doc_rows[stance_col], doc_rows['page_id']
            )
        ]
        jds.append({'document_id': doc_id, 'evidences': evidences})
    return jds


# Predictions and gold annotations share the same layout, so both are built
# by the same helper instead of two copy-pasted loops.
pred_jds = _triplets_to_jds(pred_triplet, 'y_pred_query', 'y_pred_stance')
gold_jds = _triplets_to_jds(gold_triplet, 'query', 'stance')

In [344]:
from src.lobbymap.evaluate_f1 import evaluate_strict_f1, evaluate_overlap_f1, evaluate_document_f1

# Score the pipeline predictions with the three evaluation functions.
result_strict = evaluate_strict_f1(gold_jds=gold_jds, pred_jds=pred_jds)
result_document = evaluate_document_f1(gold_jds=gold_jds, pred_jds=pred_jds)
result_overlap = evaluate_overlap_f1(gold_jds=gold_jds, pred_jds=pred_jds)

# Print one LaTeX table row: document-, overlap- then strict-level F scores,
# each for page / query / stance, separated by "&" and closed with "\\".
print(
    "TF-IDF",
    *(
        cell
        for result in (result_document, result_overlap, result_strict)
        for level in ('page', 'query', 'stance')
        for cell in ("&", result[level]['f'])
    ),
    "\\\\",
)

TF-IDF & 63.5 & 57.4 & 50.2 & 65.9 & 42.6 & 34.7 & 39.3 & 25.5 & 20.9 \\


In [308]:
# Gold annotations in the per-document layout expected by the evaluation
# functions: one entry per document, one evidence per (query, stance) triplet.
gold_jds = [
    {
        'document_id': doc_id,
        'evidences': [
            {
                "query": row['query'],
                "stance": row['stance'],
                "page_indices": row['page_id'],
            }
            for _, row in gold_triplet[gold_triplet['document_id'] == doc_id].iterrows()
        ],
    }
    for doc_id in gold_triplet['document_id'].unique()
]

# Constant baseline: every gold document gets the same single evidence
# (fixed query, fixed stance, first page).
pred_jds = [
    {
        'document_id': doc_id,
        'evidences': [{
            "query": "energy_transition_&_zero_carbon_technologies",
            "stance": "supporting",
            "page_indices": [0],
        }],
    }
    for doc_id in gold_triplet['document_id'].unique()
]

In [312]:
from src.lobbymap.evaluate_f1 import evaluate_strict_f1, evaluate_overlap_f1, evaluate_document_f1

# Score the constant baseline with the three evaluation functions.
result_strict = evaluate_strict_f1(gold_jds=gold_jds, pred_jds=pred_jds)
result_document = evaluate_document_f1(gold_jds=gold_jds, pred_jds=pred_jds)
result_overlap = evaluate_overlap_f1(gold_jds=gold_jds, pred_jds=pred_jds)

# Print one LaTeX table row: document-, overlap- then strict-level F scores,
# each for page / query / stance, separated by "&" and closed with "\\".
print(
    "Most Frequent",
    *(
        cell
        for result in (result_document, result_overlap, result_strict)
        for level in ('page', 'query', 'stance')
        for cell in ("&", result[level]['f'])
    ),
    "\\\\",
)

Most Frequent & 46.7 & 52.6 & 36.8 & 52.0 & 25.7 & 19.8 & 41.2 & 19.6 & 17.5 \\
