In [1]:
# import the dataset loader, model-selection helpers, classifier and text vectorizers
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn import model_selection 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
# Fetch the 20 Newsgroups corpus (subset='all'), then show the category
# names followed by the number of documents and the number of labels.
news = fetch_20newsgroups(subset='all')
print(news.target_names)
print(len(news.data), len(news.target), sep='\n')

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
18846
18846


In [3]:
# Hold out part of the corpus for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    random_state=1234,
)
# Alternative (not used here): take the dataset's own split instead, via
# fetch_20newsgroups(subset='train') and fetch_20newsgroups(subset='test'),
# reading .data / .target from each result.

# Applying the Bag of words model

In [4]:
# Bag-of-words features: fit the vectorizer on the training documents and
# convert them into a document-term count matrix.
cv = CountVectorizer()
cv_data = cv.fit_transform(x_train)

# Multinomial Naive Bayes with default settings, evaluated with 5-fold
# cross-validation on the training split (mean accuracy is reported).
mul_nb = MultinomialNB()

scores = model_selection.cross_val_score(
    mul_nb,
    cv_data,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Accuracy: %0.3f" % (scores.mean()))

Accuracy: 0.838


# Applying the TF-IDF model

In [5]:
# TF-IDF features: fit the vectorizer on the training documents and convert
# them into a TF-IDF weighted document-term matrix.
tdf_vectorizer = TfidfVectorizer()
tdf_data = tdf_vectorizer.fit_transform(x_train)

# Same classifier instance and 5-fold protocol as for the count features,
# so the two mean accuracies are directly comparable.
scores = model_selection.cross_val_score(
    mul_nb,
    tdf_data,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Accuracy: %0.3f" % (scores.mean()))

Accuracy: 0.841


# Excluding the influence of stop words

In [6]:
# get the stop words set from file
def get_stop_words(filename, encoding='utf-8'):
    """Read a stop-word list from a text file holding one word per line.

    Parameters
    ----------
    filename : str or path-like
        Path of the stop-word file.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to 'utf-8' so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    set of str
        The distinct stop words with surrounding whitespace removed.
        Blank or whitespace-only lines are skipped, so the empty string is
        never part of the result.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content cannot be decoded with ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as fp:
        # Stream the file line by line instead of loading every line first.
        stripped = (line.strip() for line in fp)
        # Drop empty entries produced by blank lines.
        return {word for word in stripped if word}

# Load the custom stop-word list from a local text file.
filename = './stopwords_en.txt'
stop_words = get_stop_words(filename)

# TF-IDF features with the custom stop words filtered out. The vectorizer is
# given a sorted list rather than the raw set: recent scikit-learn releases
# validate `stop_words` and accept only 'english', a list or None, so a set
# is rejected there, while a list works on older releases too.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words))
tfidf_train = vectorizer.fit_transform(x_train)

# Multinomial Naive Bayes with a small smoothing parameter (alpha=0.01),
# evaluated with 5-fold cross-validation on the training split.
mul_nb = MultinomialNB(alpha=0.01)
scores = model_selection.cross_val_score(
    mul_nb,
    tfidf_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Accuracy: %0.3f" % (scores.mean()))

# Refit on the full training split, then score on the held-out documents.
mul_nb.fit(tfidf_train, y_train)

# Only transform (never fit) the test documents so they are encoded with the
# vectorizer that was fitted on the training data.
tfidf_test = vectorizer.transform(x_test)
print('Testing result: %0.3f' % (mul_nb.score(tfidf_test, y_test)))

  'stop_words.' % sorted(inconsistent))


Accuracy: 0.910
Testing result: 0.912
