In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import spacy

In [3]:
# Exploratory load of the test split from a machine-specific absolute path.
# NOTE: no `header`/`names` arguments are passed, so pandas takes the first
# record of the file as the column labels (visible in the output below).
file_path=r'C:\Users\Trisha\PycharmProjects\NewsArticleClassification\data\test.csv'
df = pd.read_csv(file_path)
df.head()  # Check column names

Unnamed: 0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
0,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
1,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
2,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
3,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...
4,4,Open Letter Against British Copyright Indoctri...,The British Department for Education and Skill...


In [4]:
# Summarize column labels, non-null counts and dtypes of the exploratory frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7599 entries, 0 to 7598
Data columns (total 3 columns):
 #   Column                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                           --------------  ----- 
 0   3                                                                                                                                7599 non-null   int64 
 1   Fears for T N pension after talks                                                                                                7599 non-null   object
 2   Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.  7599 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.2+ KB


In [5]:
# Display the column labels currently attached to the frame.
df.columns

Index(['3', 'Fears for T N pension after talks',
       'Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.'],
      dtype='object')

In [6]:
# Re-read the same CSV with header=None so every row is treated as data, and
# give the three columns explicit names.
df = pd.read_csv(file_path, names=["class_index", "title", "description"], header=None)

In [9]:
# Load English NLP model (for tokenization & stopword removal)
# NOTE: in the recorded run this call raised OSError [E050] because the
# 'en_core_web_sm' package could not be found (see the error output below).
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [12]:
# Install the small English model through spaCy's own downloader instead of a
# `!python` shell escape: the shell resolves `python` from PATH, which is not
# guaranteed to be the interpreter (and environment) this kernel runs in.
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
# Load the English pipeline used by clean_text(). Only lemmas and stop-word
# flags are read from the processed tokens, so the dependency parser and the
# named-entity recognizer are disabled: they are not needed for those
# attributes and skipping them makes every nlp(...) call cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [16]:
def clean_text(text):
    """Normalize raw article text into a space-separated string of lemmas.

    Steps: lowercase, drop apostrophes, replace every other non-letter with a
    space, run the spaCy pipeline bound to the module-level ``nlp``, then keep
    the lemma of each token that is neither a stop word nor whitespace.

    Args:
        text: Raw text. Any value that is not a ``str`` (for example the NaN
            pandas uses for a missing cell) is treated as empty text.

    Returns:
        str: The surviving lemmas joined by single spaces; ``""`` if none.
    """
    if not isinstance(text, str):
        # Missing values are floats and have no .lower(); treat them as empty
        # instead of raising AttributeError in the middle of a long .apply().
        return ""
    text = text.lower()
    # Drop apostrophes so contractions remain a single word ("it's" -> "its").
    text = re.sub(r"['’]", "", text)
    # Replace the remaining punctuation/digits with a space rather than
    # nothing, so "smog-fighting" or "second\team" are not fused into one
    # word that no vocabulary would contain.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    doc = nlp(text)
    # Lemmatize; skip stop words and the whitespace tokens spaCy emits for
    # runs of consecutive spaces.
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    ]
    return " ".join(tokens)

In [17]:
# Combine title and description into a single text field. Missing cells are
# replaced by empty strings first: concatenating a string with NaN yields NaN,
# which is a float and would make the text cleaning step fail.
df["text"] = df["title"].fillna("") + " " + df["description"].fillna("")
df["clean_text"] = df["text"].apply(clean_text)

In [18]:
# File paths
# NOTE(review): these are absolute paths on one specific Windows machine; they
# must be adjusted before the notebook can run anywhere else.
# Embedding vectors and the matching word list, stored as separate .npy files.
EMBEDDING_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\embeddings\conceptnet_numberbatch_vectors.npy"
WORDS_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\embeddings\conceptnet_numberbatch_words.npy"
# Raw CSV inputs (TEST_PATH points at the same file as `file_path` above).
TRAIN_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\data\train.csv"
TEST_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\data\test.csv"
# Destinations for the pickled, preprocessed DataFrames.
SAVE_TRAIN_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\data\train.pkl"
SAVE_TEST_PATH = r"C:\Users\Trisha\PycharmProjects\NewsArticleClassification\data\test.pkl"

In [19]:
def load_embeddings(words_path=None, vectors_path=None):
    """Load word embeddings into a ``{word: vector}`` dictionary.

    Args:
        words_path: ``.npy`` file holding the vocabulary. Defaults to the
            module-level ``WORDS_PATH`` when omitted.
        vectors_path: ``.npy`` file holding one vector per vocabulary entry,
            in the same order. Defaults to the module-level
            ``EMBEDDING_PATH`` when omitted.

    Returns:
        dict: Maps each word to the entry of the vector array that sits at the
        same index.

    Raises:
        ValueError: If the two arrays do not contain the same number of
            entries, which would pair words with the wrong vectors.
    """
    if words_path is None:
        words_path = WORDS_PATH
    if vectors_path is None:
        vectors_path = EMBEDDING_PATH

    # SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code. Only load files that come from a trusted source.
    words = np.load(words_path, allow_pickle=True)
    vectors = np.load(vectors_path, allow_pickle=True)

    # Fail loudly on a mismatch instead of silently ignoring surplus vectors
    # or hitting a bare IndexError halfway through building the dictionary.
    if len(words) != len(vectors):
        raise ValueError(
            f"Vocabulary has {len(words)} entries but {len(vectors)} vectors were loaded"
        )

    return dict(zip(words, vectors))

In [20]:
def text_to_vector(text, embeddings, dim=300):
    """Embed a text as the element-wise mean of its known word vectors.

    The text is split on whitespace and words missing from ``embeddings`` are
    skipped. If no word is known, a zero vector of length ``dim`` is returned.
    """
    known = []
    for token in text.split():
        if token in embeddings:
            known.append(embeddings[token])

    if not known:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(dim)
    return np.mean(known, axis=0)

In [23]:
def process_dataset(filepath, embeddings):
    """Load a headerless CSV, clean its text and turn each row into a vector.

    Args:
        filepath: Path of a CSV file without a header row whose three columns
            are read, in order, as ``class_index``, ``title`` and
            ``description``.
        embeddings: Word-to-vector mapping handed to ``text_to_vector``.

    Returns:
        pandas.DataFrame: The three input columns plus ``text`` (title and
        description joined by a space), ``clean_text`` (output of
        ``clean_text``) and ``vector`` (output of ``text_to_vector``).
    """
    df = pd.read_csv(filepath, names=["class_index", "title", "description"], header=None)
    # Merge title and description. Missing cells are replaced by "" first:
    # str + NaN yields NaN, which is a float and would break text cleaning.
    df["text"] = df["title"].fillna("") + " " + df["description"].fillna("")
    df["clean_text"] = df["text"].apply(clean_text)
    df["vector"] = df["clean_text"].apply(lambda x: text_to_vector(x, embeddings))
    return df

In [24]:
# Preprocessing driver: load the embeddings once, then preprocess the training
# and test CSVs and pickle each resulting DataFrame to disk. The recorded
# output below shows the guard was true when this cell was executed.
if __name__ == "__main__":
    print("Loading embeddings...")
    embeddings = load_embeddings()

    print("Processing training data...")
    # Clean + vectorize the training split, then persist the whole DataFrame.
    train_data = process_dataset(TRAIN_PATH, embeddings)
    with open(SAVE_TRAIN_PATH, "wb") as f:
        pickle.dump(train_data, f)

    print("Processing test data...")
    # Same steps for the test split, reusing the already-loaded embeddings.
    test_data = process_dataset(TEST_PATH, embeddings)
    with open(SAVE_TEST_PATH, "wb") as f:
        pickle.dump(test_data, f)

    print("Preprocessing completed! Data saved.")

Loading embeddings...
Processing training data...
Processing test data...
Preprocessing completed! Data saved.
