In [3]:
import re
import time
import random
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
# Register tqdm's pandas integration so that Series/DataFrame objects expose
# `progress_apply`, which the preprocessing cell uses to show progress bars.
tqdm.pandas()

# Fetch the NLTK resources this notebook relies on: 'punkt' backs `word_tokenize`
# and 'stopwords' backs `stopwords.words('english')`.
# NOTE: the second call is the cell's last expression, so its return value is
# what the notebook displays as this cell's output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Configuration parameters and preprocessing utilities for Task A**

In [6]:
# --- Reproducibility and split size ---
SEED = 42             # random_state passed to train_test_split
VALID_SET_SIZE = 500  # number of articles held out for validation

# --- Input locations ---
TRAIN_FILE = '/kaggle/input/dataset1/train.csv'
TEST_FILE = '/kaggle/input/dataset1/test.csv'

# --- Shared NLP helpers used by preprocess_text ---
# Built once at module level so they are not re-created for every document.
stop_words = set(stopwords.words('english'))  # set gives fast membership tests
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize an article body into a space-joined string of stemmed tokens.

    Pipeline: lowercase -> drop non-ASCII characters -> replace punctuation with
    spaces -> tokenize -> drop stopwords and one-character tokens -> stem.

    Args:
        text: Raw article text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned text as a single string. ``""`` is returned when the input
        is not a string; the result can also be empty when no token survives
        the filtering.
    """
    if isinstance(text, str):
        # Encoding to ASCII with errors='ignore' silently discards every
        # character outside the ASCII range.
        ascii_only = text.lower().encode('ascii', errors='ignore').decode()
        # Anything that is neither a word character nor whitespace becomes a space.
        no_punct = re.sub(r'[^\w\s]', ' ', ascii_only)
        stems = [
            stemmer.stem(token)
            for token in word_tokenize(no_punct)
            if len(token) > 1 and token not in stop_words
        ]
        return ' '.join(stems).strip()
    return ""

def preprocess_title(title):
    """Lightly normalize a title: lowercase, ASCII-only, punctuation to spaces.

    Unlike ``preprocess_text``, no tokenization, stopword removal or stemming
    is applied. Runs of internal whitespace are left as they are; only leading
    and trailing whitespace is stripped.

    Args:
        title: Raw title. Any non-``str`` value is treated as missing.

    Returns:
        The normalized title, or ``""`` when the input is not a string.
    """
    if isinstance(title, str):
        # errors='ignore' drops every character outside the ASCII range.
        lowered_ascii = title.lower().encode('ascii', errors='ignore').decode()
        # Anything that is neither a word character nor whitespace becomes a space.
        return re.sub(r'[^\w\s]', ' ', lowered_ascii).strip()
    return ""


**Preprocessing the datasets: load the CSVs, drop unusable rows, split off a validation set, and clean text and titles**

In [7]:
def _load_articles(csv_path):
    """Read an article CSV and keep only rows with usable 'text' and 'title'.

    Columns whose name starts with 'Unnamed' are discarded, then rows where
    either field is missing or whitespace-only are dropped.

    Args:
        csv_path: Path of the CSV file to read (decoded as latin-1).

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    articles = pd.read_csv(csv_path, encoding='latin-1')
    articles = articles.loc[:, ~articles.columns.str.contains('^Unnamed')]
    articles = articles.dropna(subset=['text', 'title'])
    has_text = articles['text'].str.strip().astype(bool)
    has_title = articles['title'].str.strip().astype(bool)
    return articles[has_text & has_title].reset_index(drop=True)


def _add_clean_columns(articles):
    """Return a new frame with 'clean_text' and 'clean_title' columns added.

    The input frame is not modified. Rows whose 'clean_text' is empty after
    cleaning are dropped and the index is reset.

    Args:
        articles: DataFrame with 'text' and 'title' columns.

    Returns:
        A new DataFrame containing the original columns plus the two cleaned ones.
    """
    # Keyword arguments are evaluated left to right, so the text progress bar
    # is shown before the title progress bar.
    cleaned = articles.assign(
        clean_text=articles['text'].progress_apply(preprocess_text),
        clean_title=articles['title'].progress_apply(preprocess_title),
    )
    # Remove rows that might be empty after cleaning
    return cleaned[cleaned['clean_text'].str.len() > 0].reset_index(drop=True)


print("Loading training data...")
df = _load_articles(TRAIN_FILE)

# Split dataset into training and validation sets (Validation = 500 articles)
train_df, valid_df = train_test_split(df, test_size=VALID_SET_SIZE, random_state=SEED, shuffle=True)
# Rebind instead of mutating in place: the split results are derived from `df`,
# and later column additions should happen on independent frames.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print("Loading test data...")
test_df = _load_articles(TEST_FILE)

# Each timing below covers both progress_apply passes plus the empty-row filter.
print("Preprocessing training data...")
start_preproc_train = time.time()
train_df = _add_clean_columns(train_df)
preproc_train_time = time.time() - start_preproc_train
print(f"Training data preprocessing completed in {preproc_train_time:.2f} seconds")

print("Preprocessing validation data...")
start_preproc_valid = time.time()
valid_df = _add_clean_columns(valid_df)
preproc_valid_time = time.time() - start_preproc_valid
print(f"Validation data preprocessing completed in {preproc_valid_time:.2f} seconds")

print("Preprocessing test data...")
start_preproc_test = time.time()
test_df = _add_clean_columns(test_df)
preproc_test_time = time.time() - start_preproc_test
print(f"Test data preprocessing completed in {preproc_test_time:.2f} seconds")

# Reporting final dataset sizes for reference
print("\nFinal data sizes:")
print("Train set:", len(train_df))
print("Validation set:", len(valid_df))
print("Test set:", len(test_df))

Loading training data...
Loading test data...
Preprocessing training data...


100%|██████████| 13379/13379 [06:00<00:00, 37.16it/s]
100%|██████████| 13379/13379 [00:00<00:00, 535668.81it/s]


Training data preprocessing completed in 360.06 seconds
Preprocessing validation data...


100%|██████████| 500/500 [00:13<00:00, 36.78it/s]
100%|██████████| 500/500 [00:00<00:00, 410241.00it/s]


Validation data preprocessing completed in 13.60 seconds
Preprocessing test data...


100%|██████████| 100/100 [00:02<00:00, 35.21it/s]
100%|██████████| 100/100 [00:00<00:00, 226841.75it/s]

Test data preprocessing completed in 2.85 seconds

Final data sizes:
Train set: 13379
Validation set: 500
Test set: 100



