In [1]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



In [3]:
# clear string
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [5]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['adidas', 'apple', 'hungry', 'kerajaan', 'nike', 'pembangkang', 'thirsty', 'tn50', 'wawasan2020']
28525
28525


In [6]:
# bag-of-word
bow = CountVectorizer().fit_transform(trainset.data)

#tf-idf, must get from BOW first
tfidf = TfidfTransformer().fit_transform(bow)

#hashing, default n_features, probability cannot divide by negative
hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)

In [7]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.82541630149
             precision    recall  f1-score   support

     adidas       0.92      0.74      0.82       309
      apple       0.81      0.87      0.84       501
     hungry       0.84      0.92      0.88      1048
   kerajaan       0.83      0.78      0.81      1412
       nike       0.86      0.81      0.83       292
pembangkang       0.81      0.83      0.82      1500
    thirsty       0.97      0.46      0.63        71
       tn50       0.93      0.52      0.67       211
wawasan2020       0.75      0.98      0.85       361

avg / total       0.83      0.83      0.82      5705



In [8]:
train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.761787905346
             precision    recall  f1-score   support

     adidas       0.95      0.51      0.67       329
      apple       0.98      0.59      0.74       431
     hungry       0.79      0.93      0.86      1054
   kerajaan       0.73      0.79      0.76      1396
       nike       0.94      0.57      0.71       326
pembangkang       0.67      0.87      0.76      1528
    thirsty       1.00      0.04      0.07        54
       tn50       1.00      0.07      0.14       230
wawasan2020       0.93      0.87      0.89       357

avg / total       0.80      0.76      0.74      5705



In [9]:
train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.755477651183
             precision    recall  f1-score   support

     adidas       0.97      0.58      0.73       329
      apple       1.00      0.45      0.62       458
     hungry       0.86      0.91      0.88      1032
   kerajaan       0.73      0.80      0.76      1420
       nike       0.97      0.57      0.72       331
pembangkang       0.62      0.89      0.73      1510
    thirsty       1.00      0.02      0.03        62
       tn50       1.00      0.02      0.04       199
wawasan2020       0.97      0.85      0.90       364

avg / total       0.80      0.76      0.74      5705

