In [10]:


# 01_data_preprocessing.ipynb

# 1. Imports & Download
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# used as the TF-IDF stop-word list further down; NLTK reports the package as
# already up-to-date when it has been downloaded before.
nltk.download('stopwords')
from nltk.corpus import stopwords




# Column layout of the raw file, in file order (the TSV has no header row,
# so the names are supplied here).
col_names = [
    'newsID', 'category', 'subcategory', 'title',
    'abstract', 'url', 'entities', 'abstract_entities',
]

# Load the tab-separated file; every column is read as a string.
# NOTE(review): read_csv's default quoting treats '"' as a quote character.
# If titles/abstracts can contain raw double quotes, check whether
# quoting=csv.QUOTE_NONE is needed -- confirm against the data.
file_path = '../data/news.tsv/news.tsv'
news = pd.read_csv(file_path, sep='\t', names=col_names, header=None, dtype=str)

# Sanity check: the assigned column names, then the first few rows.
print(news.columns.tolist(), news.head(), sep='\n')




# Basic cleaning
# Keep only articles that have both a title and an abstract, then reset the
# index. dropna() leaves gaps in the index labels, while the TF-IDF matrix
# built below is indexed positionally (0..n-1); resetting keeps row i of
# `news` and row i of `tfidf_matrix` referring to the same article for both
# label-based (.loc) and positional (.iloc) access.
news = (
    news
    .dropna(subset=['title', 'abstract'])
    .reset_index(drop=True)
    # Single text field per article: title and abstract joined by a space.
    .assign(text=lambda frame: frame['title'] + ' ' + frame['abstract'])
)

# TF-IDF on combined text
# max_df=0.8 ignores terms present in more than 80% of the documents,
# min_df=5 ignores terms present in fewer than 5 documents, and the NLTK
# English stop-word list is removed from the vocabulary.
tfidf = TfidfVectorizer(
    max_df=0.8,
    min_df=5,
    stop_words=stopwords.words('english')
)
tfidf_matrix = tfidf.fit_transform(news['text'])

# One matrix row per remaining article, in the same order as `news`.
assert tfidf_matrix.shape[0] == len(news), "TF-IDF rows are out of sync with news"
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

import os, pickle

# Make sure the output folder is there before writing into it.
os.makedirs('../results', exist_ok=True)

# Pickle the fitted vectorizer and the TF-IDF matrix, one file each.
for out_path, artifact in (
    ('../results/tfidf_vectorizer.pkl', tfidf),
    ('../results/tfidf_matrix.pkl', tfidf_matrix),
):
    with open(out_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Saved tfidf_vectorizer.pkl and tfidf_matrix.pkl into ../results/")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['newsID', 'category', 'subcategory', 'title', 'abstract', 'url', 'entities', 'abstract_entities']
   newsID   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a 