In [1]:
import os
import platform
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Pathway is an optional dependency. The import is not even attempted on
# Windows; everywhere else any failure while importing is treated as
# "not available" so the rest of the notebook can still run.
PATHWAY_AVAILABLE = False

if platform.system() != "Windows":
    try:
        import pathway as pw
    except Exception:
        PATHWAY_AVAILABLE = False
    else:
        PATHWAY_AVAILABLE = True

if not PATHWAY_AVAILABLE:
    # Minimal stand-ins so that later references to `pw.Schema` and
    # `pw.debug.table_from_pandas` resolve without the real package.
    class DummySchema:
        pass

    class pw:
        Schema = DummySchema

        class debug:
            # Accepts any arguments and produces no table.
            @staticmethod
            def table_from_pandas(*args, **kwargs):
                return None

print("Pathway available:", PATHWAY_AVAILABLE)

Pathway available: False


In [3]:
# Load the spaCy pipeline once; extract_features() reuses this module-level
# `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Folder that holds the training/testing CSVs. The default is the folder the
# notebook was originally run from; set the IITKGP_DATA_DIR environment
# variable to point somewhere else without editing this cell.
DATA_DIR = Path(
    os.environ.get("IITKGP_DATA_DIR", r"C:\Users\bhavy\OneDrive\Desktop\IITKGP")
)

TRAIN_PATH = DATA_DIR / "training data.csv"
TEST_PATH  = DATA_DIR / "testing data.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Preview both frames (displayed as a tuple of two DataFrames).
train_df.head(), test_df.head()

(    id                   book_name        char  \
 0   46  In Search of the Castaways    Thalcave   
 1  137   The Count of Monte Cristo       Faria   
 2   74  In Search of the Castaways  Kai-Koumou   
 3  109   The Count of Monte Cristo    Noirtier   
 4  104   The Count of Monte Cristo    Noirtier   
 
                                              caption  \
 0                                                NaN   
 1  The Origin of His Connection with the Count of...   
 2                                                NaN   
 3         The Complexity of Family and Personal Life   
 4  Involvement and Turning Point in the French Re...   
 
                                              content       label  
 0  Thalcave‚Äôs people faded as colonists advanced;...  consistent  
 1  Suspected again in 1815, he was re-arrested an...  contradict  
 2  Before each fight he studied the crack-pattern...  consistent  
 3  Villefort‚Äôs drift toward the royalists disappo...  contradict  
 4  

In [5]:
def safe_text(x):
    """Return ``x`` as a string, mapping missing values to ``""``.

    Args:
        x: A scalar cell value (e.g. a string, number, ``None`` or NaN).

    Returns:
        ``""`` when ``pd.isna(x)`` is true, otherwise ``str(x)``.
    """
    return "" if pd.isna(x) else str(x)

In [6]:
# Single-token negation cues. extract_features() compares each entry against
# one lower-cased token at a time, so a multi-word phrase can never match.
# The former "no longer" entry was therefore dead and has been dropped; the
# phrase is still picked up through its "no" token.
NEGATION_WORDS = {
    "not", "never", "no", "none", "nothing", "nowhere",
    "refused", "denied", "unable", "failed",
}

# Single-token markers of absolute claims. Note that "never" is in both sets,
# so it contributes to the negation count and to the absolute-claim count.
ABSOLUTE_WORDS = {
    "always", "never", "completely", "entirely", "impossible"
}

def extract_features(context, backstory):
    """Build the numeric feature vector for one (context, backstory) pair.

    Both inputs go through ``safe_text`` (missing values become ``""``) and
    are then parsed with the module-level spaCy pipeline ``nlp``.

    Args:
        context: First text of the pair; may be missing (NaN/None).
        backstory: Second text of the pair; may be missing (NaN/None).

    Returns:
        A 1-D ``np.ndarray`` with six entries, in this order:
        ``caption_present``, ``similarity``, ``vector_distance``,
        ``entity_overlap``, ``neg_diff``, ``abs_diff``.
    """

    def count_cues(doc, cue_words):
        # Number of tokens whose lower-cased text is one of `cue_words`.
        return sum(1 for tok in doc if tok.text.lower() in cue_words)

    context_text = safe_text(context)
    backstory_text = safe_text(backstory)

    doc_c = nlp(context_text)
    doc_b = nlp(backstory_text)

    # Flag: 1 when the backstory has any non-whitespace text, else 0.
    caption_present = int(bool(backstory_text.strip()))

    # Semantic similarity and vector distance are only computed when both
    # documents have a non-zero vector norm; otherwise both default to 0.0.
    if doc_c.vector_norm and doc_b.vector_norm:
        similarity = doc_c.similarity(doc_b)
        vector_distance = np.linalg.norm(doc_c.vector - doc_b.vector)
    else:
        similarity = 0.0
        vector_distance = 0.0

    # Named-entity overlap: share of the backstory's (lower-cased) entities
    # that also occur in the context. The epsilon avoids division by zero
    # when the backstory has no entities.
    context_entities = {ent.text.lower() for ent in doc_c.ents}
    backstory_entities = {ent.text.lower() for ent in doc_b.ents}
    shared_entities = context_entities.intersection(backstory_entities)
    entity_overlap = len(shared_entities) / (len(backstory_entities) + 1e-5)

    # Negation mismatch: absolute difference in negation-cue counts.
    neg_diff = abs(
        count_cues(doc_c, NEGATION_WORDS) - count_cues(doc_b, NEGATION_WORDS)
    )

    # Absolute-claim mismatch: absolute difference in absolute-word counts.
    abs_diff = abs(
        count_cues(doc_c, ABSOLUTE_WORDS) - count_cues(doc_b, ABSOLUTE_WORDS)
    )

    return np.array([
        caption_present,
        similarity,
        vector_distance,
        entity_overlap,
        neg_diff,
        abs_diff,
    ])


In [7]:
# Walk the training rows once (with a progress bar) and keep each row's
# feature vector together with its label.
labelled_features = [
    (extract_features(row["content"], row["caption"]), row["label"])
    for _, row in tqdm(train_df.iterrows(), total=len(train_df))
]

# Stack the per-row vectors into a 2-D matrix and the labels into an array.
X = np.vstack([features for features, _ in labelled_features])
y = np.array([label for _, label in labelled_features])

print("Training samples:", len(y))


100%|██████████| 80/80 [00:01<00:00, 70.18it/s]

Training samples: 80





In [8]:
# Hold out 20% of the samples for validation; the split is stratified on the
# labels and uses a fixed seed so it is repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split

In [9]:
# Logistic regression on the hand-built features, with class weighting set to
# "balanced" and a generous iteration cap for the solver.
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1.
val_preds = clf.predict(X_val)
print(classification_report(y_val, val_preds))

              precision    recall  f1-score   support

  consistent       0.78      0.70      0.74        10
  contradict       0.57      0.67      0.62         6

    accuracy                           0.69        16
   macro avg       0.67      0.68      0.68        16
weighted avg       0.70      0.69      0.69        16



In [10]:
if not PATHWAY_AVAILABLE:
    print("Pathway not available ‚Äî running in fallback mode")
else:
    # Column layout of the training data handed to Pathway.
    # NOTE(review): the preview of train_df shows NaN in `caption`; confirm
    # that a plain `str` column type is acceptable for those rows.
    class NarrativeSchema(pw.Schema):
        id: int
        book_name: str
        char: str
        caption: str
        content: str

    # Only the text/metadata columns are passed on; `label` is left out.
    pw_table = pw.debug.table_from_pandas(
        train_df[["id", "book_name", "char", "caption", "content"]],
        schema=NarrativeSchema,
    )

    print("Pathway table created")

Pathway not available ‚Äî running in fallback mode


In [11]:
# Predict one test row at a time: build the row's feature vector, reshape it
# into a single-sample 2-D array for the classifier, and keep the one label
# that comes back.
test_preds = [
    clf.predict(
        extract_features(row["content"], row["caption"]).reshape(1, -1)
    )[0]
    for _, row in tqdm(test_df.iterrows(), total=len(test_df))
]

100%|██████████| 60/60 [00:00<00:00, 64.15it/s]


In [12]:
# One row per test example: its id and the predicted label.
results = pd.DataFrame({
    "id": test_df["id"],
    "prediction": test_preds   # already 'consistent' / 'contradict'
})

# Write the results next to the test CSV instead of repeating a hard-coded
# absolute path; with the notebook's TEST_PATH this is the same folder the
# file was written to before.
RESULTS_PATH = Path(TEST_PATH).with_name("results.csv")
results.to_csv(RESULTS_PATH, index=False)

results.head()

Unnamed: 0,id,prediction
0,95,contradict
1,136,contradict
2,59,consistent
3,60,consistent
4,124,contradict
