In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Notebook configuration: input file and pandas display limits.
csv_in = 'newsgroups.csv'

# Raise the display caps so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [6]:
# Load the labelled posts from the CSV named by `csv_in`. The first row is the
# header, fields are comma-separated and the file is decoded as latin-1.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0,
                 encoding='latin-1')
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()
display(df.head())

(297, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  297 non-null    object
 1   text      297 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB
None


Unnamed: 0,category,text
0,baseball,The Orioles' pitching staff again is having ...
1,baseball,I agree and disagree. John is saying that t...
2,baseball,"Hell, the Orioles' Opening Day game could eas..."
3,baseball,There's a lot of whining about how much playe...
4,baseball,I doubt Henderson would clear waivers. And ...


In [7]:
# Class-balance check: number of rows for each value of the `category` column.
print(df.loc[:, 'category'].value_counts())

space          99
electronics    99
baseball       99
Name: category, dtype: int64


In [8]:
# Model inputs are the `text` column; the target is the `category` column.
X, y = df['text'], df['category']

In [9]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)
# Sanity check: features and labels must have matching lengths in each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(222,) (222,)
(75,) (75,)


In [None]:
# Build the bag-of-words vocabulary from the training texts only, so the
# held-out texts do not contribute any tokens to it.
vectorizer = CountVectorizer().fit(X_train)
vocab = vectorizer.get_feature_names_out()
print('Vocabulary size:', len(vocab))
# Peek at the first few vocabulary entries.
print(vocab[:10])

Vocabulary size: 8061
['00' '000' '000000' '0020' '0028' '005' '0065' '01' '02' '02138']


In [11]:
# Encode both splits with the already-fitted vectorizer (transform only, no
# re-fitting), so train and test share the same vocabulary columns.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)
# Show each matrix's summary repr under its own heading line.
print('X_train_bow:', repr(X_train_bow), sep='\n')
print('X_test_bow:', repr(X_test_bow), sep='\n')

X_train_bow:
<222x8061 sparse matrix of type '<class 'numpy.int64'>'
	with 23599 stored elements in Compressed Sparse Row format>
X_test_bow:
<75x8061 sparse matrix of type '<class 'numpy.int64'>'
	with 6858 stored elements in Compressed Sparse Row format>


In [12]:
# Fit a multinomial naive Bayes classifier on the training token counts;
# alpha is the additive smoothing parameter.
model = MultinomialNB(alpha=1.0).fit(X_train_bow, y_train)
# Class labels the fitted model has learned.
print(model.classes_)

['baseball' 'electronics' 'space']


In [13]:
# Score the model on the same data it was fitted on (an optimistic figure,
# useful mainly for comparison with the held-out score).
train_score = model.score(X=X_train_bow, y=y_train)
print('Train accuracy:', train_score)

Train accuracy: 0.9864864864864865


In [None]:

# Predict labels for the held-out texts and pair them with the true labels.
y_test_pred = model.predict(X_test_bow)
df_pred = (
    pd.DataFrame({'pred': y_test_pred, 'true': y_test})
    # y_test keeps its original row labels from df; replace them with 0..n-1.
    .reset_index(drop=True)
)
display(df_pred.head())

Unnamed: 0,pred,true
0,baseball,baseball
1,baseball,electronics
2,electronics,electronics
3,baseball,baseball
4,space,space


In [None]:
 
# Confusion table: rows are predicted labels, columns are true labels.
ctab = pd.crosstab(index=df_pred['pred'], columns=df_pred['true'])
display(ctab)

true,baseball,electronics,space
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseball,22,2,2
electronics,0,17,1
space,0,5,26


In [None]:

# Score the model on the held-out split, which it did not see during fitting.
test_score = model.score(X=X_test_bow, y=y_test)
print('Test accuracy:', test_score)

Test accuracy: 0.8666666666666667
