In [7]:
import os
import ssl

# Swap the ssl module's default HTTPS context factory for the unverified one,
# unless the PYTHONHTTPSVERIFY environment variable is set to a non-empty value.
# Presumably this is here so the dataset download works on machines whose
# certificate store is broken -- TODO confirm.
# NOTE(review): this disables certificate verification for everything in this
# kernel that goes through ssl._create_default_https_context, not just one
# download. Confirm that is acceptable before sharing the notebook.
if not os.environ.get('PYTHONHTTPSVERIFY', ''):
    # Only patch when this Python build exposes the private factory.
    if getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re

In [9]:
# Load every document of the 20 Newsgroups corpus (subset='all').
news = fetch_20newsgroups(subset='all')

# Sanity check: class names, number of documents, number of labels
# (one item per line, exactly as three separate print calls would emit).
print(news.target_names, len(news.data), len(news.target), sep='\n')

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
18846
18846


In [14]:
# Experiment 1 -- baseline: token counts with CountVectorizer's default settings.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so held-out documents contribute to it -- confirm intended.
y_1 = news.target
vectorizer_1 = CountVectorizer()
X_1 = vectorizer_1.fit_transform(news.data)

# Hold out 20% of the documents; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    y_1,
    random_state=42,
    test_size=0.2,
)

# Train a multinomial naive Bayes classifier and predict the held-out labels.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("準確率:", accuracy)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print("分類報告:", report, sep="\n")

準確率: 0.8503978779840848
分類報告:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       151
           1       0.63      0.91      0.74       202
           2       0.93      0.22      0.36       195
           3       0.60      0.86      0.71       183
           4       0.91      0.87      0.89       205
           5       0.84      0.83      0.84       215
           6       0.94      0.62      0.75       193
           7       0.88      0.93      0.91       196
           8       0.96      0.92      0.94       168
           9       0.98      0.96      0.97       211
          10       0.96      0.96      0.96       198
          11       0.87      0.96      0.91       201
          12       0.90      0.83      0.86       202
          13       0.94      0.92      0.93       194
          14       0.90      0.98      0.94       189
          15       0.78      0.99      0.87       202
          16       0.88      0.93      0.90       1

In [15]:
# Experiment 2 -- token counts with the built-in English stop-word list removed.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so held-out documents contribute to it -- confirm intended.
y_2 = news.target
vectorizer_2 = CountVectorizer(stop_words='english')
X_2 = vectorizer_2.fit_transform(news.data)

# Hold out 20% of the documents; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_2,
    y_2,
    random_state=42,
    test_size=0.2,
)

# Train a multinomial naive Bayes classifier and predict the held-out labels.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("準確率:", accuracy)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print("分類報告:", report, sep="\n")

準確率: 0.8742705570291777
分類報告:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       151
           1       0.67      0.93      0.78       202
           2       0.96      0.37      0.54       195
           3       0.61      0.86      0.72       183
           4       0.89      0.90      0.89       205
           5       0.85      0.85      0.85       215
           6       0.93      0.69      0.80       193
           7       0.91      0.95      0.93       196
           8       0.95      0.94      0.95       168
           9       0.98      0.96      0.97       211
          10       0.95      0.97      0.96       198
          11       0.92      0.96      0.94       201
          12       0.92      0.83      0.87       202
          13       0.95      0.95      0.95       194
          14       0.92      0.97      0.94       189
          15       0.88      0.99      0.93       202
          16       0.89      0.93      0.91       1

In [18]:
# Experiment 3 -- English stop words removed, counting unigrams through trigrams.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so held-out documents contribute to it -- confirm intended.
y_3 = news.target
vectorizer_3 = CountVectorizer(stop_words='english', ngram_range=(1, 3))
X_3 = vectorizer_3.fit_transform(news.data)

# Hold out 20% of the documents; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_3,
    y_3,
    random_state=42,
    test_size=0.2,
)

# Train a multinomial naive Bayes classifier and predict the held-out labels.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("準確率:", accuracy)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print("分類報告:", report, sep="\n")

準確率: 0.8984084880636605
分類報告:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       151
           1       0.71      0.88      0.79       202
           2       0.97      0.74      0.84       195
           3       0.70      0.83      0.76       183
           4       0.93      0.85      0.89       205
           5       0.89      0.86      0.87       215
           6       0.90      0.71      0.79       193
           7       0.90      0.95      0.92       196
           8       0.94      0.95      0.94       168
           9       0.96      0.95      0.95       211
          10       0.91      0.97      0.94       198
          11       0.93      0.97      0.95       201
          12       0.94      0.86      0.90       202
          13       0.95      0.92      0.93       194
          14       0.92      0.97      0.95       189
          15       0.93      0.99      0.96       202
          16       0.90      0.95      0.93       1

In [22]:
# Experiment 4 -- as experiment 3 (stop words removed, 1- to 3-grams), plus
# document-frequency pruning: min_df=2 and max_df=0.8.
# NOTE(review): the vocabulary and the document-frequency cut-offs are fitted on
# the whole corpus before the train/test split, so held-out documents
# influence which features survive -- confirm intended.
y_4 = news.target
vectorizer_4 = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.8,
)
X_4 = vectorizer_4.fit_transform(news.data)

# Hold out 20% of the documents; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_4,
    y_4,
    random_state=42,
    test_size=0.2,
)

# Train a multinomial naive Bayes classifier and predict the held-out labels.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("準確率:", accuracy)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print("分類報告:", report, sep="\n")

準確率: 0.8986737400530505
分類報告:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       151
           1       0.75      0.85      0.79       202
           2       0.96      0.74      0.84       195
           3       0.69      0.83      0.75       183
           4       0.91      0.86      0.88       205
           5       0.89      0.86      0.87       215
           6       0.88      0.75      0.81       193
           7       0.90      0.96      0.93       196
           8       0.93      0.94      0.94       168
           9       0.97      0.95      0.96       211
          10       0.91      0.97      0.94       198
          11       0.94      0.97      0.95       201
          12       0.93      0.86      0.89       202
          13       0.95      0.92      0.93       194
          14       0.92      0.97      0.95       189
          15       0.94      0.99      0.96       202
          16       0.91      0.95      0.93       1

In [30]:
# Experiment 5 -- stop words removed, 1- to 4-grams, document-frequency pruning
# (min_df=2, max_df=0.8), and a smaller naive Bayes smoothing term (alpha=0.5).
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so held-out documents contribute to it -- confirm intended.
vectorizer_5 = CountVectorizer(stop_words='english', ngram_range=(1, 4), min_df=2, max_df=0.8)
X_5 = vectorizer_5.fit_transform(news.data)
y_5 = news.target

# Split THIS experiment's features. The cell previously split X_4 / y_4, so the
# 1- to 4-gram matrix built above was never evaluated and the cell silently
# depended on variables left behind by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.2, random_state=42)

# Train a multinomial naive Bayes classifier and predict the held-out labels.
classifier = MultinomialNB(alpha=0.5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("準確率:", accuracy)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print("分類報告:")
print(report)