In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Load four of the 20-newsgroups categories: X is the list of raw message
# texts, y the matching integer topic labels (return_X_y=True).
# Headers, footers and quoted replies are stripped from every message.
# `remove` is documented as a tuple; recent scikit-learn releases validate
# parameter types, so a list is not a safe substitute here.
X, y = fetch_20newsgroups(
    subset='all',
    categories=[
        'comp.windows.x',
        'rec.autos',
        'rec.sport.baseball',
        'sci.space',
    ],
    return_X_y=True,
    remove=('headers', 'footers', 'quotes'),
)

In [17]:
# Human-readable names for the integer topic labels stored in `y`.
# (The original 'baseball,' label carried a stray comma inside the string.)
topic_dic = {0: 'windows', 1: 'autos', 2: 'baseball', 3: 'space'}

In [5]:
# Peek at the integer-encoded topic labels returned by fetch_20newsgroups.
y

array([3, 0, 2, ..., 0, 3, 1], dtype=int64)

In [11]:
# Gather the documents and their topic labels into one table for inspection.
data = pd.DataFrame({'document': X, 'topic': y})
data

Unnamed: 0,document,topic
0,And one of my profs is the chief engineer for ...,3
1,"Enclosed are the rules, guidelines and related...",0
2,\nI grew up listening to Harry Carey call the ...,2
3,Original to: szabo@techbook.com\nG'day szabo@t...,3
4,\n\n\n\nTry the 'M.Sc. Computing Science' cour...,0
...,...,...
3954,\n\n\n\n\n Anaheim.,2
3955,l\n\n\ndiamond star cars (Talon/Eclipse/Laser)...,1
3956,Update your 385 to HP-UX 9.0. You get an R5 s...,0
3957,\nI disagree. It think the average joe is int...,3


In [16]:
# Print the full text of one sample document from topic 0
# (print, not rich display, so the embedded newlines are rendered).
print(data.loc[data['topic'] == 0, 'document'].iloc[65])

hi, is there anybody has some example programs about using
 the internationalization features in X11R5 ? Such as a small
 X program just to show Chinese texts in wondows, menu bar or
 icons... Thanks in advance.


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
# Hold out the default 25% of the documents for testing. The fixed seed makes
# the split, and every score computed from it, repeatable across kernel
# restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# Bag-of-words counts feeding a logistic-regression classifier.
# The step names are looked up again further down (grid-search keys and
# coefficient extraction), so they are kept exactly as spelled here.
pipe = Pipeline(
    [
        ('vectorizer', CountVectorizer()),
        ('classificer', LogisticRegression(max_iter=5000)),
    ]
)


In [30]:
# Fit both pipeline steps (vectorizer, then classifier) on the training documents.
pipe.fit(X_train,y_train)

In [31]:
# TODO: report the size of the fitted vocabulary, e.g. len(pipe['vectorizer'].vocabulary_)

In [32]:
# Predict topic labels for the held-out documents with the baseline pipeline.
y_test_pred = pipe.predict(X_test)

In [35]:
from sklearn.metrics import confusion_matrix
# Rows are the true topics, columns the predicted topics.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[238,  11,   7,   9],
       [  4, 215,  10,  20],
       [  4,  28, 213,   7],
       [  6,  23,  10, 185]], dtype=int64)

In [36]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Hyper-parameter grid for the search below. Every key targets the pipeline's
# 'vectorizer' step through the `<step>__<parameter>` naming convention.
param_dic = dict(
    vectorizer__stop_words=[None, 'english'],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    vectorizer__max_df=[0.5, 0.7, 0.9, 1.0],
    vectorizer__max_features=[25000, 10000],
)

In [44]:
# Exhaustive 5-fold cross-validated search over every combination in
# `param_dic`; n_jobs=-1 runs the fits in parallel, verbose=2 logs progress.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_dic,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [46]:
# A bare expression is only echoed when it is the last line of a cell, so the
# winning parameter combination is printed explicitly instead of being dropped.
print(grid.best_params_)

# Re-evaluate on the held-out documents with the best pipeline from the search.
best_pipe = grid.best_estimator_
y_test_pred = best_pipe.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[239,  13,   5,   8],
       [  4, 219,  10,  16],
       [  2,  24, 218,   8],
       [  4,  22,   7, 191]], dtype=int64)

In [47]:
# A few hand-written sentences to sanity-check the tuned model.
X_new_data = [
    'I always wanted to be an astronaut',
    'I hate windows 10',
    'that game was terrible',
]

In [49]:
# Predict a topic for each hand-written sentence; the result uses the same integer label codes as `y`.
best_pipe.predict(X_new_data)

array([1, 1, 2], dtype=int64)

# How does the model make predictions?


In [51]:
# Weight matrix of the fitted classifier: one row per topic,
# one column per vocabulary term.
coefficients = best_pipe.named_steps['classificer'].coef_
coefficients.shape

(4, 25000)

In [53]:
# Term-by-topic table of the model weights: rows are vocabulary terms
# (taken from the fitted vectorizer), columns are the four topics.
coef_df = pd.DataFrame(
    data=coefficients.T,
    index=best_pipe.named_steps['vectorizer'].get_feature_names_out(),
    columns=['windows', 'auto', 'baseball', 'space'],
)

In [58]:
# Show the first three vocabulary terms with their per-topic weights.
coef_df.head(3)

Unnamed: 0,windows,auto,baseball,space
00,0.1016793,-0.005083397,0.01038849,-0.1069844
00 00,0.004360059,-0.0001154123,-0.0002108415,-0.004033805
00 02,-2.579259e-07,-2.571702e-07,7.735476e-07,-2.584516e-07


In [56]:
# The 20 terms with the largest positive weight for the "windows" topic.
coef_df['windows'].sort_values(ascending=False).iloc[:20]

subscribe            1.688603
window               1.074162
xterm                1.058839
use                  1.042290
windows              1.040616
server               1.016124
motif                0.980362
hi                   0.951519
instead              0.943957
widget               0.938277
unsubscribe          0.935091
using                0.883162
resource             0.878614
widgets              0.857531
x11r5                0.831268
library              0.812499
group                0.789229
mit                  0.756769
clients              0.726335
xtvaappinitialize    0.707168
Name: windows, dtype: float64

In [57]:
# The 20 terms with the largest positive weight for the "space" topic.
coef_df['space'].sort_values(ascending=False).iloc[:20]

space         1.847914
orbit         1.066264
shuttle       0.925131
solar         0.859181
launch        0.852418
project       0.851266
earth         0.814572
nasa          0.800604
sky           0.778410
spacecraft    0.763814
moon          0.672060
real          0.647953
sci           0.638162
maybe         0.614296
objects       0.608744
hst           0.586904
moments       0.584297
sounds        0.583404
news          0.579154
plane         0.564815
Name: space, dtype: float64

In [59]:
# The 20 terms with the largest positive weight for the "baseball" topic.
coef_df['baseball'].sort_values(ascending=False).iloc[:20]

baseball    1.309586
team        1.124591
game        1.104616
cubs        1.102457
games       1.050337
jewish      0.979841
stadium     0.896410
season      0.769919
stats       0.761580
play        0.739626
players     0.716939
career      0.706012
pitchers    0.697967
al          0.664885
mike        0.633539
hit         0.633391
status      0.627003
dl          0.626786
braves      0.614646
yankee      0.612130
Name: baseball, dtype: float64

## Naive Bayes Model

In [None]:
# Logistic regression is slow to train; naive Bayes is faster