In [1]:
import pandas as pd
train_data = pd.read_excel('./train_data_cleaned.xlsx')
test_data = pd.read_excel('./test_data_cleaned.xlsx')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Split Features Into Input and Target Features
X_train = train_data['summary']
y_train = train_data['label']

### Vectorization

In [3]:
count_vect = CountVectorizer()
count_vect.fit(X_train)
count_vect_transform = count_vect.transform(X_train)
count_vect_transform

<120000x68801 sparse matrix of type '<class 'numpy.int64'>'
	with 2514631 stored elements in Compressed Sparse Row format>

### TF-IDF

In [4]:
tfidf_vect = TfidfTransformer()
X_train_tfidf = tfidf_vect.fit(count_vect_transform)
X_train_tfid_transform = X_train_tfidf.transform(count_vect_transform)

### Build Model Classifier

In [5]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfid_transform, y_train)

In [6]:
from sklearn.metrics import accuracy_score
print(clf.predict(count_vect.transform(['software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80% of medium and large companies have been victims of some form of cyber-crime. bridgehead software has come up with techniques to prove  to a legal standard  that any stored file on a pc has not been tampered with. ironically the impetus for developing the system came as a result of the freedom of information act  which requires companies to store all data for a certain amount of time.  the storage system has been incorporated into an application developed by security firm 3ami which allows every action on a computer to be logged. potentially it could help employers to follow the trail of stolen files and pinpoint whether they had been emailed to a third party  copied  printed  deleted or saved to cd  floppy disk  memory stick or flash card. other activities the system can monitor include the downloading of pornography  the use of racist or bullying language or the copying of applications for personal use. increasingly organisations that handle sensitive data  such as governments  are using biometric log-ins such as fingerprinting to provide conclusive proof of who was using a particular machine at any given time. privacy advocates are concerned that monitoring at work is not only damaging to employee s privacy but also to the relationship between employers and their staff.  that is not the case   said tim ellsmore  managing director of 3ami.  it is not about replacing dialogue but there are issues that you can talk through but you still need proof   he said.  people need to recognise that you are using a pc as a representative of a company and that employers have a legal requirement to store data   he added.','18 percent of Olympic tickets sold in Japan to be refunded', 'LeBron James could play with his son in the NBA after extending contract with Los Angeles Lakers'])))
#hitung akurasi data test
# print("Accuracy:", accuracy_score(train_data.label, predict))

['science-technology news' 'business news' 'sports news']


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
X_test_tf = count_vect.transform(test_data.summary)
X_test_tfidf = tfidf_vect.transform(X_test_tf)

predicted = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(test_data.label, predicted))

Accuracy: 0.9007894736842105


In [8]:
from sklearn.metrics import classification_report
print(classification_report(test_data.label, predicted))

                         precision    recall  f1-score   support

          business news       0.87      0.86      0.86      1900
science-technology news       0.88      0.88      0.88      1900
            sports news       0.95      0.98      0.96      1900
             world news       0.91      0.89      0.90      1900

               accuracy                           0.90      7600
              macro avg       0.90      0.90      0.90      7600
           weighted avg       0.90      0.90      0.90      7600



Kita dapat mengevaluasi performa model klasifikasi dari segi:

1. Precision 
a. Macro Average Precision 
b. Micro Average Precision 
c. Weighted Average Precision

In [9]:
from sklearn.metrics import precision_score
y_true = test_data.label
y_pred = predicted

print("Macro Average Precision:",precision_score(y_true, y_pred, average='macro'))
print("Micro Average Precision:",precision_score(y_true, y_pred, average='micro'))
print("Weighted Average Precision:",precision_score(y_true, y_pred, average='weighted'))

Macro Average Precision: 0.9004470571761125
Micro Average Precision: 0.9007894736842105
Weighted Average Precision: 0.9004470571761124


2. Recall 
a. Macro Average Recall b. Micro Average Recall c. Weighted Average Recall

In [11]:
from sklearn.metrics import recall_score
y_true = test_data.label
y_pred = predicted

print("Macro Average Recall:", recall_score(y_true,y_pred, average='macro'))
print("Micro Average Recall:", recall_score(y_true,y_pred, average='micro'))
print("Weighted Average Recall:", recall_score(y_true, y_pred, average='weighted'))

Macro Average Recall: 0.9007894736842106
Micro Average Recall: 0.9007894736842105
Weighted Average Recall: 0.9007894736842105


3. F1-Score a. Macro Average F1-Score b. Micro Average F1-Score c. Weighted Average F1-Score

In [12]:
from sklearn.metrics import f1_score
y_true = test_data.label
y_pred = predicted

print("Macro Average f1_score:", f1_score(y_true,y_pred,average='macro'))
print("Micro Average f1_score:", f1_score(y_true,y_pred,average='micro'))
print("Weighted Average f1_score:",f1_score(y_true,y_pred,average='weighted'))

Macro Average f1_score: 0.9005477138476727
Micro Average f1_score: 0.9007894736842105
Weighted Average f1_score: 0.9005477138476726


#### An Improved Naïve Bayes Model

In [14]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vect', CountVectorizer(stop_words='english')), 
                ('tfidf', TfidfTransformer()), 
                ('model', MultinomialNB()),])
mod = pipe.fit(train_data.summary, train_data.label)
                 
predict = mod.predict(test_data.summary)

print(classification_report(test_data.label, predicted))

print("accuracy:", accuracy_score(test_data.label, predicted))

                         precision    recall  f1-score   support

          business news       0.87      0.86      0.86      1900
science-technology news       0.88      0.88      0.88      1900
            sports news       0.95      0.98      0.96      1900
             world news       0.91      0.89      0.90      1900

               accuracy                           0.90      7600
              macro avg       0.90      0.90      0.90      7600
           weighted avg       0.90      0.90      0.90      7600

accuracy: 0.9007894736842105


##### Hasil pemodelan dengan pipeline memberikan akurasi yang sama dengan model Gaussian Naive Bayes sebelumnya

### Save model to pickle

In [65]:
import pickle
pickle.dump(count_vect,open('count_vect.pkl', 'wb'))
pickle.dump(clf, open('clf.pkl', 'wb'))