# Phân loại tập văn bản 20newsgroup với Multinomial Naive Bayes và LinearSVC

- Những công cụ được sử dụng:
  + Các lớp MultinomialNB và LinearSVC của thư viện sklearn.
  + Sử dụng TfidfVectorizer (Term Frequency – Inverse Document Frequency) để sinh ra các feature vector
- Sử dụng lại tập văn bản đã được xử lý khi làm việc với Neural Network

In [23]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Empty containers for the raw documents (X_*) and their class labels (y_*);
# they are populated by the loading cells that follow.
X_train, y_train = [], []
X_test, y_test = [], []

In [3]:
# Read every training document into X_train (one string per file).
# The training directory is walked one level deep: each entry is treated as a
# folder whose files are the documents.
TRAIN_DIR = 'E:\\Project 1\\20_newsgroups_exercise\\20_newsgroup_train_test\\20newsgroup_train'

# Rebuild the list from scratch so that re-running this cell cannot append the
# whole corpus a second time.
X_train = []

# os.listdir() returns entries in arbitrary order; sorting makes the document
# order reproducible on every platform.
# NOTE(review): train_label.txt is presumably written in this same sorted
# folder order -- confirm against the script that produced it.
for group_name in sorted(os.listdir(TRAIN_DIR)):
    group_dir = os.path.join(TRAIN_DIR, group_name)
    for file_name in sorted(os.listdir(group_dir)):
        # NOTE(review): no encoding is passed, so the platform default is used.
        with open(os.path.join(group_dir, file_name)) as f:
            X_train.append(f.read())

In [4]:
# Read every test document into X_test (one string per file).
# The test directory is walked one level deep: each entry is treated as a
# folder whose files are the documents.
TEST_DIR = 'E:\\Project 1\\20_newsgroups_exercise\\20_newsgroup_train_test\\20newsgroup_test'

# Rebuild the list from scratch so that re-running this cell cannot append the
# whole corpus a second time.
X_test = []

# os.listdir() returns entries in arbitrary order; sorting makes the document
# order reproducible on every platform.
# NOTE(review): test_label.txt is presumably written in this same sorted
# folder order -- confirm against the script that produced it.
for group_name in sorted(os.listdir(TEST_DIR)):
    group_dir = os.path.join(TEST_DIR, group_name)
    for file_name in sorted(os.listdir(group_dir)):
        # NOTE(review): no encoding is passed, so the platform default is used.
        with open(os.path.join(group_dir, file_name)) as f:
            X_test.append(f.read())

In [5]:
# Read the class label (topic id) of every training document into y_train.
# Each non-blank line of the label file is parsed as one integer.
#
# The list is rebuilt from scratch so that re-running this cell cannot
# duplicate the labels. Blank lines (for example a trailing empty line at the
# end of the file) are skipped because int() raises ValueError on them.
with open('E:\\Project 1\\20_newsgroups_exercise\\20_newsgroup_train_test\\train_label.txt') as f:
    y_train = [int(line) for line in f if line.strip()]

In [6]:
# Read the class label (topic id) of every test document into y_test.
# Each non-blank line of the label file is parsed as one integer.
#
# The list is rebuilt from scratch so that re-running this cell cannot
# duplicate the labels. Blank lines (for example a trailing empty line at the
# end of the file) are skipped because int() raises ValueError on them.
with open('E:\\Project 1\\20_newsgroups_exercise\\20_newsgroup_train_test\\test_label.txt') as f:
    y_test = [int(line) for line in f if line.strip()]

In [7]:
# Shuffle the training set while keeping every document paired with its label.
SHUFFLE_SEED = 42

# zip() stops at the shorter input, so a document/label count mismatch would
# be truncated silently and leave a misaligned data set; fail loudly instead.
if len(X_train) != len(y_train):
    raise ValueError(
        "X_train has %d documents but y_train has %d labels"
        % (len(X_train), len(y_train))
    )

mapIndexPosition = list(zip(X_train, y_train))
# A dedicated seeded generator makes the shuffled order reproducible without
# touching the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(mapIndexPosition)
X_train, y_train = zip(*mapIndexPosition)

In [8]:
# Shuffle the test set while keeping every document paired with its label.
SHUFFLE_SEED = 42

# zip() stops at the shorter input, so a document/label count mismatch would
# be truncated silently and leave a misaligned data set; fail loudly instead.
if len(X_test) != len(y_test):
    raise ValueError(
        "X_test has %d documents but y_test has %d labels"
        % (len(X_test), len(y_test))
    )

mapIndexPosition = list(zip(X_test, y_test))
# A dedicated seeded generator makes the shuffled order reproducible without
# touching the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(mapIndexPosition)
X_test, y_test = zip(*mapIndexPosition)

In [9]:
# zip(*...) leaves tuples behind: turn the documents back into plain lists
# and the labels into NumPy arrays.
X_train, X_test = list(X_train), list(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

# Multinomial Naive Bayes

In [10]:
# Baseline: TF-IDF features with stop words kept and MultinomialNB left at its
# default alpha.
nb_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
train_NB = Pipeline(nb_steps)
train_NB.fit(X_train, y_train)

# Score the fitted pipeline on the test documents.
nb_accuracy = train_NB.score(X_test, y_test)
print("Accuracy: " + str(nb_accuracy))

Accuracy: 0.7926181625066383


In [11]:
# Same pipeline as the baseline, but NLTK's English stop words are removed by
# the vectorizer.
english_stop_words = stopwords.words('english')
train_NB = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=english_stop_words)),
    ('classifier', MultinomialNB()),
])
train_NB.fit(X_train, y_train)

nb_accuracy = train_NB.score(X_test, y_test)
print("Accuracy: " + str(nb_accuracy))

Accuracy: 0.8301911842804036


In [14]:
# Stop-word removal combined with a sweep over MultinomialNB's alpha parameter.
# NOTE(review): the candidates are compared on the test set, so the best
# accuracy printed here is an optimistic estimate; a validation split or
# cross-validation on the training data would avoid that.
english_stop_words = stopwords.words('english')
for alpha in (5, 0.5, 0.05, 0.005, 0.0005):
    train_NB = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words=english_stop_words)),
        ('classifier', MultinomialNB(alpha=alpha)),
    ])
    train_NB.fit(X_train, y_train)

    nb_accuracy = train_NB.score(X_test, y_test)
    print("Alpha: " + str(alpha) + ", Accuracy: " + str(nb_accuracy))

Alpha: 5, Accuracy: 0.798194370685077
Alpha: 0.5, Accuracy: 0.841343600637281
Alpha: 0.05, Accuracy: 0.8583377588953797
Alpha: 0.005, Accuracy: 0.8570100902814658
Alpha: 0.0005, Accuracy: 0.8490440785979819


In [15]:
# alpha = 0.05 combined with min_df=5: the vectorizer ignores terms that occur
# in fewer than 5 documents (min_df is a document-frequency cutoff, not a raw
# occurrence count).
tfidf_min_df = TfidfVectorizer(stop_words=stopwords.words('english'), min_df=5)
train_NB = Pipeline([
    ('vectorizer', tfidf_min_df),
    ('classifier', MultinomialNB(alpha=0.05)),
])
train_NB.fit(X_train, y_train)

nb_accuracy = train_NB.score(X_test, y_test)
print("Accuracy: " + str(nb_accuracy))

Accuracy: 0.8568773234200744


In [16]:
# Refit the Naive Bayes pipeline with stop-word removal and alpha = 0.05, then
# predict the test set for the per-class report.
train_NB = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=0.05)),
])
train_NB.fit(X_train, y_train)

# The class names shown in the report are the entries of the training directory.
# os.listdir() returns them in arbitrary order, while classification_report
# matches target_names to the label ids by position, so the names are sorted
# to get a deterministic order.
# NOTE(review): assumes label id k belongs to the k-th folder in sorted
# order -- confirm against the script that wrote the label files.
labels = sorted(os.listdir('E:\\Project 1\\20_newsgroups_exercise\\20_newsgroup_train_test\\20newsgroup_train'))

y_pred_NB = train_NB.predict(X_test)

In [17]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
# `metrics` is imported with the other dependencies in the notebook's import
# cell, so later cells that use it do not depend on this cell having run.
print(metrics.classification_report(y_test, y_pred_NB, target_names=labels))

                          precision    recall  f1-score   support

             alt.atheism       0.81      0.82      0.82       320
           comp.graphics       0.77      0.74      0.75       389
 comp.os.ms-windows.misc       0.75      0.77      0.76       394
comp.sys.ibm.pc.hardware       0.73      0.80      0.77       393
   comp.sys.mac.hardware       0.86      0.83      0.85       384
          comp.windows.x       0.86      0.78      0.82       392
            misc.forsale       0.85      0.84      0.84       389
               rec.autos       0.92      0.93      0.92       396
         rec.motorcycles       0.93      0.97      0.95       398
      rec.sport.baseball       0.95      0.94      0.95       398
        rec.sport.hockey       0.95      0.97      0.96       400
               sci.crypt       0.87      0.96      0.91       396
         sci.electronics       0.83      0.76      0.79       392
                 sci.med       0.93      0.86      0.90       396
         

# LinearSVC

In [20]:
# LinearSVC on TF-IDF features with stop words kept.
svc_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC()),
]
train_SVC = Pipeline(svc_steps)
train_SVC.fit(X_train, y_train)

svc_accuracy = train_SVC.score(X_test, y_test)
print("Accuracy: " + str(svc_accuracy))

Accuracy: 0.8749336165693044


In [19]:
# LinearSVC on TF-IDF features with NLTK's English stop words removed.
# The corpus id is spelled 'english' in lowercase, as in the Naive Bayes
# cells: 'English' only resolves on case-insensitive filesystems.
train_SVC = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', LinearSVC()),
])

train_SVC.fit(X_train, y_train)
print("Accuracy: " + str(train_SVC.score(X_test, y_test)))

Accuracy: 0.8737387148167818


In [21]:
# Refit the LinearSVC pipeline explicitly before predicting, so the report does
# not depend on which of the two cells above happened to run last.
# NOTE(review): the recorded execution counts (In [20] ran after In [19])
# suggest the saved report came from the variant WITHOUT stop-word removal,
# which is the one rebuilt here -- confirm this is the intended model.
train_SVC = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC()),
])
train_SVC.fit(X_train, y_train)

y_pred_SVC = train_SVC.predict(X_test)

In [22]:
# Per-class precision, recall and F1 for the LinearSVC predictions.
svc_report = metrics.classification_report(
    y_test,
    y_pred_SVC,
    target_names=labels,
)
print(svc_report)

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.84      0.84       320
           comp.graphics       0.76      0.81      0.78       389
 comp.os.ms-windows.misc       0.79      0.82      0.80       394
comp.sys.ibm.pc.hardware       0.76      0.77      0.76       393
   comp.sys.mac.hardware       0.87      0.84      0.86       384
          comp.windows.x       0.87      0.76      0.81       392
            misc.forsale       0.83      0.92      0.87       389
               rec.autos       0.93      0.92      0.93       396
         rec.motorcycles       0.97      0.97      0.97       398
      rec.sport.baseball       0.95      0.96      0.95       398
        rec.sport.hockey       0.97      0.97      0.97       400
               sci.crypt       0.94      0.94      0.94       396
         sci.electronics       0.81      0.81      0.81       392
                 sci.med       0.92      0.90      0.91       396
         