In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# --------------------------
# 1️⃣ Load Sentences
# --------------------------
# Directory that holds the three sentence CSVs. Keeping it in one constant
# means the notebook can be pointed at another location by editing one line.
DATA_DIR = "C:/Users/ashis/OneDrive/Desktop/NLP/Lab5"

# Each CSV is expected to provide a "sentence" column (used by later cells).
train_df = pd.read_csv(f"{DATA_DIR}/train_sentences.csv")
val_df   = pd.read_csv(f"{DATA_DIR}/val_sentences.csv")
test_df  = pd.read_csv(f"{DATA_DIR}/test_sentences.csv")

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 998000, Val size: 1000, Test size: 1000


In [None]:
# Report how many sentences are missing in each split
# (printed in the order: train, validation, test).
for split_frame in (train_df, val_df, test_df):
    missing_count = split_frame["sentence"].isnull().sum()
    print(missing_count)

1
0
0


In [6]:
# The null check reported one missing training sentence. Replace it with an
# empty string so later `str(sentence)` calls do not turn NaN into the
# literal token "nan".
train_df["sentence"] = train_df["sentence"].fillna(value="")

In [None]:
import numpy as np
from collections import Counter
import math
from tqdm import tqdm

# --------------------------
# 3️⃣ Build TF-IDF from scratch
# --------------------------

# Step 1: Build vocabulary from training data
print("üîÑ Building vocabulary...")
# Gather every distinct lower-cased, whitespace-delimited token of the
# training sentences, then sort so each word gets a stable column index.
unique_tokens = {
    token
    for text in train_df["sentence"]
    for token in str(text).lower().split()
}

vocab = sorted(unique_tokens)
word_to_idx = dict(zip(vocab, range(len(vocab))))
print(f"Vocabulary size: {len(vocab)}")

# Step 2: Calculate IDF (Inverse Document Frequency)
print("üîÑ Calculating IDF...")
N = len(train_df)  # Total documents

# Document frequency: for each token, the number of training sentences that
# contain it at least once. One pass over the corpus replaces the previous
# "scan every sentence for every vocabulary word" loop, which re-tokenised
# the whole training set once per word.
doc_freq = Counter()
for sentence in train_df["sentence"]:
    # set(...) so a word repeated inside one sentence is counted once.
    doc_freq.update(set(str(sentence).lower().split()))

# Smoothed IDF: log((N + 1) / (df + 1)) + 1. The +1 inside the log avoids a
# division by zero; the trailing +1 keeps the weight of a word that occurs in
# every document above zero. Counter returns 0 for words it has never seen.
idf = {word: math.log((N + 1) / (doc_freq[word] + 1)) + 1 for word in vocab}

# Step 3: Calculate TF-IDF for each document
def calculate_tfidf(sentences, idf, vocab, word_to_idx, as_sparse=False):
    """Calculate TF-IDF vectors for sentences.

    Each sentence is lower-cased and split on whitespace. The weight of a word
    in a sentence is ``(count / number_of_tokens) * idf[word]``. Words that
    are not in ``word_to_idx`` are ignored, and sentences without tokens
    produce an all-zero row.

    Parameters
    ----------
    sentences : sized iterable
        The documents; every item is converted with ``str()``.
    idf : mapping
        Word -> inverse document frequency. Must contain every key of
        ``word_to_idx`` that occurs in ``sentences``.
    vocab : sized collection
        Only its length is used (number of columns of the result).
    word_to_idx : mapping
        Word -> column index in the result.
    as_sparse : bool, default False
        If True, return a ``scipy.sparse.csr_matrix`` instead of a dense
        array. The dense form needs ``len(sentences) * len(vocab)`` floats,
        which is impractical for a large corpus; the sparse form stores only
        the non-zero weights.

    Returns
    -------
    numpy.ndarray or scipy.sparse.csr_matrix
        Matrix of shape ``(len(sentences), len(vocab))`` with float64 weights.
    """
    n_docs = len(sentences)

    # Coordinate-format triplets: one (row, column, weight) per distinct
    # known word of each sentence.
    row_ids, col_ids, weights = [], [], []

    for doc_idx, sentence in tqdm(enumerate(sentences), total=n_docs, desc="Computing TF-IDF"):
        tokens = str(sentence).lower().split()
        if not tokens:
            continue  # nothing to weigh; the row stays all-zero
        n_tokens = len(tokens)

        # Counter yields each distinct word once, so no (row, column) pair is
        # emitted twice for the same sentence.
        for word, count in Counter(tokens).items():
            word_idx = word_to_idx.get(word)
            if word_idx is None:
                continue  # out-of-vocabulary word
            row_ids.append(doc_idx)
            col_ids.append(word_idx)
            weights.append(count / n_tokens * idf[word])

    tfidf_sparse = scipy.sparse.csr_matrix(
        (
            np.asarray(weights, dtype=np.float64),
            (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64)),
        ),
        shape=(n_docs, len(vocab)),
    )

    return tfidf_sparse if as_sparse else tfidf_sparse.toarray()

# Calculate TF-IDF for train, val, test
# Each job pairs the progress banner with the frame whose "sentence" column
# is vectorised; results are collected in the same order.
tfidf_jobs = (
    ("üìä Calculating TF-IDF for train set...", train_df),
    ("üìä Calculating TF-IDF for validation set...", val_df),
    ("üìä Calculating TF-IDF for test set...", test_df),
)
tfidf_matrices = []
for banner, split_frame in tfidf_jobs:
    print(banner)
    tfidf_matrices.append(
        calculate_tfidf(split_frame["sentence"], idf, vocab, word_to_idx)
    )
X_train, X_val, X_test = tfidf_matrices

print(f"\nTrain TF-IDF shape: {X_train.shape}")
print(f"Val TF-IDF shape: {X_val.shape}")
print(f"Test TF-IDF shape: {X_test.shape}")

# --------------------------
# 4️⃣ Save TF-IDF vectors to CSV
# --------------------------
print("\nüíæ Saving TF-IDF to CSV files...")

def save_tfidf_to_csv(X, sentences, vocab, filename):
    """Write a TF-IDF matrix to CSV with the source sentence as first column.

    Parameters
    ----------
    X : 2-D array-like
        TF-IDF weights, one row per sentence and one column per vocabulary
        word.
    sentences : iterable
        The sentences, in the same order as the rows of ``X`` (a pandas
        Series or any other sequence).
    vocab : list of str
        Column names for ``X``.
    filename : str or path-like
        Destination CSV file.
    """
    df_tfidf = pd.DataFrame(X, columns=vocab)
    # The vocabulary may itself contain the token "sentence". Without
    # allow_duplicates=True, DataFrame.insert raises ValueError in that case.
    # The text column always stays first; the word column keeps its place.
    df_tfidf.insert(0, "sentence", list(sentences), allow_duplicates=True)
    df_tfidf.to_csv(filename, index=False)
    print(f"  ‚úÖ Saved {filename}")

# Write one CSV per split: (matrix, source frame, output file).
for dense_matrix, split_frame, csv_name in (
    (X_train, train_df, "train_tfidf.csv"),
    (X_val, val_df, "val_tfidf.csv"),
    (X_test, test_df, "test_tfidf.csv"),
):
    save_tfidf_to_csv(dense_matrix, split_frame["sentence"], vocab, csv_name)

# --------------------------
# 5️⃣ Save TF-IDF matrices as NPZ (sparse format)
# --------------------------
print("\nüíæ Saving TF-IDF matrices as NPZ...")
from scipy.sparse import csr_matrix

# Convert all three matrices to CSR first, then persist them; the
# *_sparse names are reused by the nearest-neighbour cell.
X_train_sparse, X_val_sparse, X_test_sparse = (
    csr_matrix(dense_matrix) for dense_matrix in (X_train, X_val, X_test)
)

for npz_name, sparse_matrix in (
    ("X_train_tfidf.npz", X_train_sparse),
    ("X_val_tfidf.npz", X_val_sparse),
    ("X_test_tfidf.npz", X_test_sparse),
):
    scipy.sparse.save_npz(npz_name, sparse_matrix)

print("‚úÖ TF-IDF from scratch computation complete!")

In [None]:
# --------------------------
# Nearest Neighbor search (within validation and test sets)
# Uses cosine similarity on sparse TF-IDF matrices (closest other sentence in same set)
# Outputs CSVs with: sentence_index, sentence, neighbor_index, neighbor_sentence, similarity
# --------------------------
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

# Ensure sparse matrices exist (created earlier)
# If either matrix is missing from the notebook namespace, reload both from
# the NPZ files written by the TF-IDF cell.
if "X_val_sparse" not in globals() or "X_test_sparse" not in globals():
    import scipy.sparse
    X_val_sparse = scipy.sparse.load_npz("X_val_tfidf.npz")
    X_test_sparse = scipy.sparse.load_npz("X_test_tfidf.npz")

# Helper to compute nearest neighbor within the same set (exclude self)
def compute_self_nearest(X_sparse, sentences, out_csv):
    """Find, for every row, the most cosine-similar *other* row of the same set.

    Parameters
    ----------
    X_sparse : matrix of shape (n_samples, n_features)
        Feature vectors (e.g. sparse TF-IDF), one row per sentence.
    sentences : iterable
        The sentences in the same order as the rows of ``X_sparse`` (a pandas
        Series or any other sequence).
    out_csv : str or path-like
        Where the result table is written.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``sentence_index``, ``sentence``, ``neighbor_index``,
        ``neighbor_sentence`` and ``similarity`` (1 - cosine distance).
        ``None`` when there are fewer than two rows, in which case nothing is
        written.

    Raises
    ------
    ValueError
        If the number of sentences differs from the number of rows.
    """
    n_samples = X_sparse.shape[0]
    if n_samples <= 1:
        print(f"Not enough samples ({n_samples}) to compute neighbors for {out_csv}")
        return None

    # Materialise once; iterating a Series yields its values in positional order.
    sentence_list = list(sentences)
    if len(sentence_list) != n_samples:
        raise ValueError(
            f"Got {len(sentence_list)} sentences for {n_samples} feature rows"
        )

    # NearestNeighbors with cosine returns distance = 1 - cosine_similarity
    nn = NearestNeighbors(n_neighbors=1, metric='cosine', n_jobs=-1)
    nn.fit(X_sparse)

    # Calling kneighbors() without a query matrix searches the fitted data and
    # does not count a point as its own neighbour. Asking for two neighbours
    # and dropping column 0 is not reliable: with duplicate sentences or
    # all-zero rows the query itself is not guaranteed to come first, so a row
    # could be reported as its own neighbour.
    distances, indices = nn.kneighbors(return_distance=True)
    neigh_idx = indices[:, 0].astype(np.int64)
    neigh_sim = 1.0 - distances[:, 0].astype(np.float64)  # convert to similarity

    # Build the table column by column rather than row by row.
    df_out = pd.DataFrame({
        'sentence_index': np.arange(n_samples, dtype=np.int64),
        'sentence': sentence_list,
        'neighbor_index': neigh_idx,
        'neighbor_sentence': [sentence_list[j] for j in neigh_idx],
        'similarity': neigh_sim,
    })

    df_out.to_csv(out_csv, index=False)
    print(f"Saved nearest-neighbor results ‚Üí {out_csv} ({len(df_out)} rows)")
    return df_out

# Run the search for the validation set, then the test set. Each job holds
# (banner, feature matrix, source frame, output CSV).
nn_jobs = (
    ("Computing nearest neighbors for validation set...", X_val_sparse, val_df, "val_nearest_neighbors.csv"),
    ("Computing nearest neighbors for test set...", X_test_sparse, test_df, "test_nearest_neighbors.csv"),
)
nn_tables = []
for banner, feature_matrix, split_frame, csv_path in nn_jobs:
    print(banner)
    nn_tables.append(compute_self_nearest(feature_matrix, split_frame['sentence'], csv_path))
df_val_nn, df_test_nn = nn_tables

# Show top sample rows (skipped for a split whose search returned None)
for heading, nn_table in (
    ("\nValidation sample matches:", df_val_nn),
    ("\nTest sample matches:", df_test_nn),
):
    if nn_table is None:
        continue
    print(heading)
    display(nn_table.head())