In [1]:
import os, string, re
import glob
import numpy as np
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
ENGLISH_STOP_WORDS = ['my', 'between', 'others', 'more', 'thence', 'none', 'these', 'was', 'which', 'most', 'forty', 'do', 'hasnt', 'for', 'found', 'fifteen', 'anything', 'became', 'fifty', 'being', 'someone', 'sincere', 'latterly', 'somewhere', 'describe', 'up', 'whereas', 'become', 'over', 'system', 'there', 'perhaps', 'only', 'go', 'to', 'yet', 'part', 'the', 'amongst', 'nine', 'hereupon', 'of', 'against', 'under', 'every', 'first', 'however', 'why', 'such', 'nowhere', 'give', 'as', 'around', 'will', 'since', 'name', 'thick', 'yours', 'already', 'see', 'it', 'detail', 'so', 'then', 'anywhere', 'seeming', 'thru', 'behind', 'eight', 'am', 'both', 'whereby', 'hereafter', 'further', 'here', 'must', 'five', 'any', 'moreover', 'after', 'etc', 'also', 'cannot', 'some', 'one', 'once', 'across', 'where', 'formerly', 'well', 'show', 'via', 'from', 'themselves', 'yourselves', 'co', 'nevertheless', 'that', 'next', 'many', 'front', 'due', 'own', 'therefore', 'could', 'very', 'same', 'else', 'several', 'beside', 'alone', 'whenever', 'latter', 'in', 'without', 'because', 'anyone', 'wherein', 'anyhow', 'mill', 'can', 'fire', 'thus', 'though', 'everyone', 'least', 'side', 'how', 'into', 'before', 'thin', 'hundred', 'neither', 'has', 'himself', 'never', 'together', 'inc', 'thereupon', 'a', 'top', 'is', 'noone', 'done', 'fill', 'everywhere', 'onto', 'nobody', 'within', 'always', 'please', 'are', 'at', 'other', 'seemed', 'no', 'an', 'had', 'another', 'amount', 'below', 'he', 'mine', 'along', 'those', 'ourselves', 'each', 'should', 'thereafter', 'interest', 'not', 'this', 'all', 'rather', 'down', 'have', 'herself', 'sometimes', 'afterwards', 'cry', 'four', 'their', 'upon', 'amoungst', 'namely', 'serious', 'whole', 'twelve', 'above', 'meanwhile', 'eg', 'her', 'mostly', 'through', 'un', 'whereupon', 'six', 'find', 'elsewhere', 'beforehand', 'them', 'been', 'full', 'his', 'everything', 'among', 'toward', 'put', 'nor', 'besides', 'even', 'back', 'be', 'we', 'whereafter', 'myself', 'per', 'its', 'whether', 'eleven', 'empty', 'herein', 'de', 'couldnt', 'who', 'hers', 'too', 'whatever', 'and', 'con', 'yourself', 'whose', 'ie', 'indeed', 'throughout', 'might', 'ltd', 'your', 'wherever', 'take', 'sixty', 'sometime', 'she', 'until', 'ours', 'him', 'during', 'thereby', 'otherwise', 'again', 'were', 'if', 'whither', 'cant', 'three', 'i', 'hence', 'when', 're', 'on', 'seems', 'us', 'whoever', 'hereby', 'our', 'you', 'often', 'twenty', 'off', 'two', 'but', 'few', 'although', 'bottom', 'out', 'about', 'than', 'still', 'or', 'becoming', 'towards', 'last', 'whence', 'enough', 'much', 'may', 'itself', 'less', 'almost', 'get', 'except', 'therein', 'third', 'bill', 'me', 'something', 'made', 'ever', 'what', 'while', 'former', 'anyway', 'becomes', 'by', 'beyond', 'keep', 'nothing', 'move', 'with', 'now', 'seem', 'would', 'call', 'ten', 'they', 'either', 'somehow', 'whom']


In [3]:
path_dataset ='data_set'

In [4]:
# Preprocessing step
def basic_processing(text):
    text = re.sub('-{2,}','',text)
    patURL = r"(?:http://|www.)[^\"]+"
    text = re.sub(patURL,'website',text)
    text = re.sub('\.+','.',text)
    text = re.sub('\\s+',' ',text)
    return text

def stemSentence(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

# remove stopword
def remove_stopwords(text):
    stop_words = list(ENGLISH_STOP_WORDS)
    tokens = text.split(" ")
    result = [i for i in tokens if not i in stop_words]
    return " ".join(result)

def clean_doc(text):
    text = stemSentence(text)
    # apply basic preprocessing
    text = basic_processing(text)
    # Remove stop word in the text
    text = remove_stopwords(text)
    # Lower case
    text = text.lower()
    # multiple spaces removal
    text = re.sub(r"\?", " \? ", text)
    # Remove number in the text
    text = re.sub(r"[0-9]+", " ", text)
    # Remove punctuation
    for punc in string.punctuation:
        text = text.replace(punc,' ')
    text = re.sub('\\s+',' ',text)
    
    return text

In [5]:
# Function : read dataset from the folder
def read_data(folder_path):
    documents = []
    labels = []
    for category in os.listdir(folder_path):
        print("Label: ", category)
        path_new = folder_path+ "/"+category + "/*.txt"
        for filename in glob.glob(path_new):
            with open(filename,'r',encoding="utf-8") as file:
              try:
                  content = file.read()
                  documents.append(content)
                  labels.append(category)
              except:
                  print(filename)
    return documents, labels

X_data, y_data = read_data(path_dataset)

Label:  business
Label:  entertainment
Label:  politics
Label:  sport
Label:  tech


In [6]:
# Show the size of dataset
print(len(X_data),len(y_data))

1602 1602


In [7]:
# Show an example
print(X_data[0])
print(y_data[0])

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL

In [8]:
# Apply preprocessing in whole dataset
X_data_preprocess = []
for index,data in enumerate(X_data):
    X_data_preprocess.append(clean_doc(data))

# show one example
print(X_data_preprocess[0])

ad sale boost time warner profit quarterli profit media giant timewarn jump bn £ m month decemb m year earli firm biggest investor googl benefit sale high spe internet connect higher advert sale timewarn said fourth quarter sale rose bn bn profit buoy one off gain offset profit dip warner bro user aol time warner said friday search engin googl internet busi aol ha mix fortun lost subscrib fourth quarter profit lower preced quarter howev compani said aol s underli profit befor item rose stronger internet advertis revenu hope increas subscrib offer onlin servic free timewarn internet custom tri sign aol s exist custom high spe broadband timewarn ha restat result follow probe secur exchang commiss sec close conclud time warner s fourth quarter profit slightli better analyst expect film divis saw profit slump m help box offic flop alexand catwoman sharp contrast year earli final film lord ring trilog boost result full year timewarn post profit bn perform revenu grew bn financi perform wa s

In [9]:
# ngram level tf-idf 
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2,2))
X_train_tfidf = vectorizer.fit_transform(X_data_preprocess)

cv = KFold(n_splits=5, random_state=42, shuffle=True)


In [10]:
# naive_bayes model 
from sklearn.naive_bayes import MultinomialNB
model_naivebayes = MultinomialNB()

# evaluate cross validation 5 folds of Naive Bayes model
scores = cross_val_score(model_naivebayes, X_train_tfidf, y_data, scoring='accuracy', cv=cv)
# report performance
print(scores)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

[0.87850467 0.91900312 0.9625     0.884375   0.903125  ]
Accuracy: 0.910 (0.030)


In [11]:
# K-nearest neighbors model 
from sklearn.neighbors import KNeighborsClassifier

model_knn =  KNeighborsClassifier(n_neighbors=3)
# evaluate cross validation 5 folds of KNeighborsClassifier model
scores = cross_val_score(model_knn, X_train_tfidf, y_data, scoring='accuracy', cv=cv)
# report performance
print(scores)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

[0.91900312 0.90965732 0.925      0.9        0.93125   ]
Accuracy: 0.917 (0.011)


In [12]:
# Decision Tree model 
from sklearn.tree import DecisionTreeClassifier

model_dt =  DecisionTreeClassifier(random_state=42)
# evaluate cross validation 5 folds of KNeighborsClassifier model
scores = cross_val_score(model_dt, X_train_tfidf, y_data, scoring='accuracy', cv=cv)
# report performance
print(scores)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

[0.65732087 0.72274143 0.725      0.759375   0.75625   ]
Accuracy: 0.724 (0.037)


In [13]:
# SVM model 
from sklearn.svm import SVC

model_svm =  SVC(kernel="linear",gamma='auto', C=1.0,probability=True)
# evaluate cross validation 5 folds of SVM model
scores = cross_val_score(model_svm, X_train_tfidf, y_data, scoring='accuracy', cv=cv)
# report performance
print(scores)''
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

[0.91588785 0.91900312 0.940625   0.903125   0.946875  ]
Accuracy: 0.925 (0.016)


In [14]:
from sklearn.ensemble import VotingClassifier

soft_ensemble = VotingClassifier(estimators=[('svm', model_svm), ('knn', model_knn), ('nb', model_naivebayes)], voting='soft')

# evaluate cross validation 5 folds of KNeighborsClassifier model
scores = cross_val_score(soft_ensemble, X_train_tfidf, y_data, scoring='accuracy', cv=cv)
# report performance
print(scores)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

[0.95327103 0.95327103 0.95       0.95       0.971875  ]
Accuracy: 0.956 (0.008)


In [15]:
# Because soft_ensemble model has the highest score using cross-validation 5 fold. We will use this model to train data
soft_ensemble.fit(X_train_tfidf, y_data)

VotingClassifier(estimators=[('svm',
                              SVC(gamma='auto', kernel='linear',
                                  probability=True)),
                             ('knn', KNeighborsClassifier(n_neighbors=3)),
                             ('nb', MultinomialNB())],
                 voting='soft')

In [16]:
# Demo with the example in the dataset
input_demo = X_data[0]
input_clean = clean_doc(input_demo)
input_tfidf = vectorizer.transform([input_clean])
predict = soft_ensemble.predict(input_tfidf)
###########################################
print("Input: \n ", input_demo)
print("*"*100)
print("Input Preprocessing: \n ", input_clean)
print("*"*100)
print("Model predict: ", predict[0])
print("True Label: ", y_data[0])

Input: 
  Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to s

In [24]:
# Demo with the input text
input_demo = "A global agreement to end the race to the bottom on corporate taxation is within sight, according to the French and German finance ministers. France's Bruno le Maire told the BBC the G7 club of rich nations were just one millimetre away from a historic agreement on a global minimum rate. He urged low tax states like Ireland to back a deal which would target tech giants such as Amazon and Microsoft. The German finance minister said a 15% rate would help pay back Covid debt Tax on big tech and multi-nationals has been a source of friction between the US and fellow G7 countries such as the UK. German finance minister Olaf Scholz said it was important to stop the world's biggest companies from dodging tax. He said it was absolutely necessary to reach a deal in order to get out of this race to the bottom we see with taxes today…especially after the Covid crisis and all the money we spent to defend the health of the people, and to defend the economy.Mr Le Maire also urged Ireland, which has one of the lowest corporate tax rates in the European Union, at 12.5%, to get on board He added: European countries, that in the past, opposed this new international tax system, must understand that they have to give the agreement to this major breakthrough"
input_clean = clean_doc(input_demo)
input_tfidf = vectorizer.transform([input_clean])
predict = soft_ensemble.predict(input_tfidf)
###########################################
print("Input: \n ", input_demo)
print("*"*100)
print("Input Preprocessing: \n ", input_clean)
print("*"*100)
print("Model predict: ", predict[0])

Input: 
  A global agreement to end the race to the bottom on corporate taxation is within sight, according to the French and German finance ministers. France's Bruno le Maire told the BBC the G7 club of rich nations were just one millimetre away from a historic agreement on a global minimum rate. He urged low tax states like Ireland to back a deal which would target tech giants such as Amazon and Microsoft. The German finance minister said a 15% rate would help pay back Covid debt Tax on big tech and multi-nationals has been a source of friction between the US and fellow G7 countries such as the UK. German finance minister Olaf Scholz said it was important to stop the world's biggest companies from dodging tax. He said it was absolutely necessary to reach a deal in order to get out of this race to the bottom we see with taxes today…especially after the Covid crisis and all the money we spent to defend the health of the people, and to defend the economy.Mr Le Maire also urged Ireland, 

In [18]:
# Demo with the file  text
path_file = "test/test1.txt"
with open(path_file, "r", encoding="utf8") as file:
  input_demo = file.read()
input_clean = clean_doc(input_demo)
input_tfidf = vectorizer.transform([input_clean])
predict = soft_ensemble.predict(input_tfidf)
###########################################
print("Input: \n ", input_demo)
print("*"*100)
print("Input Preprocessing: \n ", input_clean)
print("*"*100)
print("Model predict: ", predict[0])

Input: 
  He added: "The best scientific advice I have at this stage, is that after one jab it's not quite as effective against the new Delta variant, but after both jabs it is.

"So that's why it's so important that we drive through these vaccinations and people come forward for their second jab."

Ministers will "look at the data for another week then make a judgement" on if the lifting of restrictions could proceed as planned, he added.

The government's scientific advisory group for emergencies (Sage) had previously said there was a "realistic possibility" the Delta variant could spread 50% faster.

The Sage documents estimated that if the variant were to be 40-50% more transmissible it "would lead to a substantial resurgence of hospitalisations" and put pressure on the NHS.

Asked whether some measures, such as the wearing of face coverings and working from home, might need to remain in place, Mr Hancock said: "I wouldn't rule that out.

"The way we are looking at this is step fou