In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

categories = [
    'comp.graphics',
    'rec.sport.baseball',
    'sci.space',
    'talk.politics.mideast',
    'rec.sport.hockey'   # added new category
]

newsgroups = fetch_20newsgroups(subset='train', categories=categories)

df = pd.DataFrame({
    'text': newsgroups.data,
    'target': newsgroups.target
})

df['target_name'] = df['target'].apply(lambda x: newsgroups.target_names[x])

print(df.head())

                                                text  target  \
0  From: juvirtan@klaava.Helsinki.FI (Jukka A Vir...       2   
1  From: kozloce@wkuvx1.bitnet\nSubject: Re: Tie ...       2   
2  Organization: University of Notre Dame - Offic...       1   
3  From: henry@zoo.toronto.edu (Henry Spencer)\nS...       3   
4  From: m_klein@pavo.concordia.ca (CorelMARK!)\n...       1   

          target_name  
0    rec.sport.hockey  
1    rec.sport.hockey  
2  rec.sport.baseball  
3           sci.space  
4  rec.sport.baseball  


### Article Distribution by each category

In [3]:
for i in range(5):
  print("Category ",i," : ",len(df[df['target']==i]))

Category  0  :  584
Category  1  :  597
Category  2  :  600
Category  3  :  593
Category  4  :  564


### Sample articles from each category  

In [4]:
for i in range(5):
  print(f"Category {i} {df[df['target']==i]['target_name'].iloc[0]}: {df[df['target']==i]['text'].iloc[0]}")

Category 0 comp.graphics: From: ajackson@cch.coventry.ac.uk (Alan Jackson)
Subject: MPEG Location
Nntp-Posting-Host: cc_sysh
Organization: Coventry University
Lines: 11


Can anyone tell me where to find a MPEG viewer (either DOS or
Windows).

Thanks in advance.

-- 
Alan M. Jackson      Mail : ajackson@cch.cov.ac.uk

     Liverpool Football Club - Simply The Best
              "You'll Never Walk Alone"

Category 1 rec.sport.baseball: Organization: University of Notre Dame - Office of Univ. Computing
From: <RVESTERM@vma.cc.nd.edu>
Subject: Re: NL vs. AL?
 <93102.164224RVESTERM@vma.cc.nd.edu> <1993Apr13.184311.16351@news.yale.edu>
Lines: 23

In article <1993Apr13.184311.16351@news.yale.edu>, (Sean Garrison) says:
>
>In article <93102.164224RVESTERM@vma.cc.nd.edu>, RVESTERM@vma.cc.nd.edu
>wrote:
>
>> pitchers who are doing well are
>> more likely to be taken out of the game in the nl than they are in the al,
>> so it seems to me that the al, not the nl, promotes pitchers' duels.
>>
>> bo

In [5]:
!pip install --user -U nltk



In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import string
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng')
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def clean_tokens(tokens):
    return [
        lemmatizer.lemmatize(w.lower())
        for w in tokens
        if w.lower() not in stop_words            # remove stopwords
        and w not in string.punctuation           # remove punctuation
        and w.isalpha()                           # remove numbers / non-alphabetic
    ]


df['tokenised']=[word_tokenize(x) for x in df['text']]
df['filtered'] = df['tokenised'].apply(
    lambda tokens: [w for w in tokens if w.lower() not in stop_words]
)
df['lemmatized']= df['filtered'].apply(lambda texts :clean_tokens(texts))
# i used lemmatized instead of stemmed because training is one time cost. And it will increase the quality too

df['pos_tags']=df['lemmatized'].apply(lambda token:[pos_tag(token)])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['lemmatized'].apply(lambda x: ' '.join(x)))

print("BoW shape:", X_bow.shape)


BoW shape: (2938, 29693)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['lemmatized'].apply(lambda x: ' '.join(x)))

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (2938, 29693)


In [11]:
from gensim.models import Word2Vec

cbow_model = Word2Vec(sentences=df['lemmatized'], vector_size=100, window=5, min_count=2, sg=0)
skip_model= Word2Vec(sentences=df['lemmatized'], vector_size=100, window=5, min_count=2, sg=1)
print(cbow_model.wv['space'])
print(skip_model.wv['space'])

import numpy as np

def doc_vector(tokens, model):
    tokens = [t for t in tokens if t in model.wv]  # keep only words in vocab
    if len(tokens) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[tokens], axis=0)

df['doc_vec'] = df['lemmatized'].apply(lambda x: doc_vector(x, cbow_model))


[-0.9053326   1.52538     0.62565315 -1.569798    0.4578813  -0.6376697
  2.0609922   1.7663788  -0.12770066 -0.5194172   0.09690235 -1.9792314
  2.022975    0.9424169   2.238214    0.08094811 -0.07205864  1.1550025
 -0.6455151  -1.0754907   0.50554883 -1.2737622   0.45528352 -0.60792685
  1.5694332  -0.70128703 -0.26925805 -0.19175851 -0.6776452  -2.2995374
  0.16506682 -0.03748273 -0.6249189   0.09971326  1.5196351   0.6759365
 -0.30594096  1.4932548   2.1805696  -1.394475    0.03207586 -0.9855001
 -0.44926506 -0.32113865 -0.16104682  0.5731132  -0.2521947   0.56789213
  1.2281009   0.45331702  0.80161065  0.04377218 -0.44880545 -1.9102753
  0.31686133 -0.32898262  0.8351898  -0.4840352  -0.44767937  0.3405744
  0.05599922 -0.20480722 -0.02550899  0.9642617  -0.65214795  0.98851526
 -0.635061   -0.04114639  0.4127789  -0.0261205  -0.81442046 -0.35484567
  1.0991627   0.11369587 -0.17492272  2.3146703   0.41382343 -0.80644107
 -0.08720588  0.65994537 -0.14614217  0.22420658  0.8999574

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Convert tokenized/lemmatized lists back to string for vectorizers
df['clean_text'] = df['lemmatized'].apply(lambda x: " ".join(x))

X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['target'], test_size=0.2, random_state=42)

# ---- Bag of Words ----
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

lr_bow = LogisticRegression(max_iter=200)
lr_bow.fit(X_train_bow, y_train)
print("BoW + Logistic Regression")
print(classification_report(y_test, lr_bow.predict(X_test_bow)))

# ---- TF-IDF ----
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

lr_tfidf = LogisticRegression(max_iter=200)
lr_tfidf.fit(X_train_tfidf, y_train)
print("TF-IDF + Logistic Regression")
print(classification_report(y_test, lr_tfidf.predict(X_test_tfidf)))


BoW + Logistic Regression
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       119
           1       0.96      0.95      0.95       117
           2       0.98      0.96      0.97       106
           3       0.97      0.98      0.97       128
           4       0.99      0.95      0.97       118

    accuracy                           0.96       588
   macro avg       0.96      0.96      0.96       588
weighted avg       0.96      0.96      0.96       588

TF-IDF + Logistic Regression
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       119
           1       0.98      0.97      0.98       117
           2       0.99      0.98      0.99       106
           3       0.98      0.98      0.98       128
           4       1.00      0.97      0.99       118

    accuracy                           0.98       588
   macro avg       0.98      0.98      0.98       588
weighted avg       0.9

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Tokenizer for sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['clean_text'])

X_seq = tokenizer.texts_to_sequences(df['clean_text'])
X_pad = pad_sequences(X_seq, maxlen=200)  # pad to equal length
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# ---- Simple LSTM ----
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(df['target_name']), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

print("LSTM Test Accuracy:", model.evaluate(X_test, y_test)[1])


Epoch 1/5
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 368ms/step - accuracy: 0.2189 - loss: 7.6800 - val_accuracy: 0.1803 - val_loss: 4.2852
Epoch 2/5
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 355ms/step - accuracy: 0.2226 - loss: 3.0475 - val_accuracy: 0.1990 - val_loss: 1.6867
Epoch 3/5
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 356ms/step - accuracy: 0.1837 - loss: 1.6673 - val_accuracy: 0.1803 - val_loss: 1.6435
Epoch 4/5
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 351ms/step - accuracy: 0.1963 - loss: 1.6396 - val_accuracy: 0.1803 - val_loss: 1.6354
Epoch 5/5
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 355ms/step - accuracy: 0.2025 - loss: 1.6300 - val_accuracy: 0.1803 - val_loss: 1.6325
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.1563 - loss: 1.6360
LSTM Test Accuracy: 0.18027210235595703
