# Import libraries

In [1]:
# Library for data manipulation
import pandas as pd

# os library is used here for file path locatain
import os

# numpy is used for numerical operation
import numpy as np

from nltk import word_tokenize, pos_tag
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

from collections import OrderedDict

import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\c2034122\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\c2034122\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\c2034122\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Preparing dataset

In [2]:
# In this piece of code, we extract the data from text file and put it in columns

# "bbc" is the folder of the dataset
directory = 'bbc'

# we created a dataframe with two columns "news" that is text related to business, tech etc and a category for text
data_frame = pd.DataFrame(columns=['news', 'category'])

id_count = 0

# In the below code the for loop will go through the text files and get the text from each file and put it in dataframe column.
# the folders name is tech, business etc. and it is pushed to "news" columns
for subdir, dirs, files in os.walk(directory):
    for file in files:
        if not file.startswith("READ"):
            category = subdir.replace('bbc\\','')
            
            with open(os.path.join(subdir, file)) as f:
                data = " ".join(line.rstrip() for line in f)
                data_frame = data_frame.append(pd.DataFrame([[data, category]], columns=['news', 'category']), ignore_index = True)
                f.close()

In [3]:
#data_frame.to_csv('Prepared_data.csv')

In [4]:
## only to upload csv file to colab
#from google.colab import files
#uploaded = files.upload()

In [5]:
data_frame = pd.read_csv('Prepared_data.csv')

# Data preprocessing

In [6]:
# Associate Category names with numerical index and save it in new column category_id
data_frame['category_id'] = data_frame['category'].factorize()[0]

# map the categories_name to number
category_id_df = data_frame[['category', 'category_id']].drop_duplicates().sort_values('category_id')

In [7]:
# insert ID column
data_frame['ID'] = range(0, len(data_frame))

# set ID as an index
data_frame = data_frame.set_index('ID')

# Dropping the category column. here we can also use the drop function
data_frame = data_frame[['news', 'category_id']]
data_frame.head()

Unnamed: 0_level_0,news,category_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Ad sales boost Time Warner profit Quarterly p...,0
1,Dollar gains on Greenspan speech The dollar h...,0
2,Yukos unit buyer faces loan claim The owners ...,0
3,High fuel prices hit BA's profits British Air...,0
4,Pernod takeover talk lifts Domecq Shares in U...,0


In [8]:
# shuffle the dataframe for machine learning model
from sklearn.utils import shuffle
data_frame = shuffle(data_frame)

In [9]:
data_frame.head()

Unnamed: 0_level_0,news,category_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
640,Franz man seeks government help Franz Ferdina...,1
718,US show sued for rat-eating stunt A US TV net...,1
1060,Police urge pub closure power New powers are ...,2
495,Europe asks Asia for euro help European leade...,0
190,US regulator to rule on pain drug US food and...,0


In [10]:
# here we will remove the garbage data that will not be helpful to our model
def data_preprocessing(news):
    # Replace new line with null if any exist also remove the carriage return
    news = news.lower().replace('\n', ' ').replace('\r', '').strip()
    # to remove multiple spaces we will be using a regex expression. it will replace
    # multiple spaces with single space
    news = re.sub(' +', ' ', news)
    # to get alphabets
    news = re.sub(r'[^\w\s]', '', news)
    
    # we will be removing stopwords like and, or etc.
    stop_words = set(stopwords.words('english'))
    # list of words include in news article
    word_tokens = word_tokenize(news)
    # this will remove all unnecessary words specified in nltk stopwords file
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    
    news = ' '.join(filtered_sentence)
    return news

data_frame['news'] = data_frame['news'].apply(data_preprocessing)

# Feature engineering

In [11]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

data_frame['news_noun_adj'] = data_frame.news.apply(nouns_adj)

In [12]:
data_frame['news'] = data_frame['news'] + data_frame['news_noun_adj']

In [13]:
# Removing duplicate word from a string.
data_frame['news'] = data_frame.news.str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')

In [14]:
# dictionary for category and their ID.
# we need it later
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(data_frame.news).toarray() 

labels = data_frame.category_id

In [16]:
features.shape

(2225, 11719)

In [17]:
data_frame.head()

Unnamed: 0_level_0,news,category_id,news_noun_adj
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
640,franz man seeks government help ferdinand fron...,1,franz man government help ferdinand frontman a...
718,us show sued rateating stunt tv network 25m â1...,1,sued stunt network â13m viewer contestants dea...
1060,police urge pub closure power new powers neede...,2,police pub closure power new powers disorderly...
495,europe asks asia euro help european leaders sa...,0,europe asia euro help european leaders asian s...
190,us regulator rule pain drug food regulators de...,0,rule drug food drug regulators recommend sale ...


# Feature selection

In [18]:
# Importing chi-square for feature selection
from sklearn.feature_selection import chi2

# number of words in a perticular set
Num_of_words = 10

# Find correlation between words and categories
for Category, category_id in sorted(category_to_id.items()):
    # apply chi-square to all features in categories.
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    
    # list of top unigrams with chi-square.
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    
    # list of top bigrams with chi-square.
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    
    # display the category
    print("'{}':".format(Category))
    
    # display top 10 unigrams
    print("  . unigrams: {}".format(', '.join(unigrams[-Num_of_words:])))
    
    # display top 10 bigrams
    print("  . bigrams: {}".format(', '.join(bigrams[-Num_of_words:])))

'business':
  . unigrams: economic, prices, investors, stock, economy, analysts, oil, bank, growth, shares
  . bigrams: analysts said, stock exchange, securities exchange, exchange commission, told reuters, economic growth, news agency, chief executive, oil prices, stock market
'entertainment':
  . unigrams: stars, starring, album, award, comedy, awards, actress, singer, actor, film
  . bigrams: golden globe, leonardo dicaprio, film festival, named best, imelda staunton, vera drake, dollar baby, million dollar, box office, los angeles
'politics':
  . unigrams: leader, tony, liberal, secretary, party, blair, tories, tory, election, labour
  . bigrams: tory leader, radio 4s, liberal democrat, leader michael, lib dems, prime minister, liberal democrats, michael howard, general election, tony blair
'sport':
  . unigrams: rugby, victory, champion, win, game, season, injury, match, cup, coach
  . bigrams: coach andy, sir alex, manchester united, rbs nations, australian open, world cup, world

# Machine learning modeling

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# we are using Decision tree classifier
model_dec_tree = DecisionTreeClassifier(random_state=1)
model_log_reg = LogisticRegression(random_state=1)
model_naive = GaussianNB()
model_svc = SVC(random_state=1)

#Split Data 
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, data_frame.index, test_size=0.20, random_state=2)

#Train model
model_dec_tree.fit(X_train, y_train)
model_log_reg.fit(X_train, y_train)
model_naive.fit(X_train, y_train)
model_svc.fit(X_train, y_train)

# Make Predictions
y_pred_dec_tree = model_dec_tree.predict(X_test)
y_pred_log_reg = model_log_reg.predict(X_test)
y_pred_naive = model_naive.predict(X_test)
y_pred_svc = model_svc.predict(X_test)


In [20]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy of Decision tree: ", accuracy_score(y_test, y_pred_dec_tree))
print("Accuracy of Logistic regression: ", accuracy_score(y_test, y_pred_log_reg))
print("Accuracy of Naive bayes: ", accuracy_score(y_test, y_pred_naive))
print("Accuracy of SVC: ", accuracy_score(y_test, y_pred_svc))

Accuracy of Decision tree:  0.7955056179775281
Accuracy of Logistic regression:  0.9707865168539326
Accuracy of Naive bayes:  0.9235955056179775
Accuracy of SVC:  0.9752808988764045


In [21]:
print('Classification report for Decision Tree')
print(classification_report(y_test, y_pred_dec_tree))
print('Classification report for Logistic regression')
print(classification_report(y_test, y_pred_log_reg))
print('Classification report for Naive bayes')
print(classification_report(y_test, y_pred_naive))
print('Classification report for SVC')
print(classification_report(y_test, y_pred_svc))

Classification report for Decision Tree
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        97
           1       0.84      0.85      0.84        80
           2       0.70      0.69      0.69        80
           3       0.85      0.85      0.85       100
           4       0.81      0.78      0.80        88

    accuracy                           0.80       445
   macro avg       0.79      0.79      0.79       445
weighted avg       0.80      0.80      0.80       445

Classification report for Logistic regression
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        97
           1       0.97      0.95      0.96        80
           2       0.95      0.95      0.95        80
           3       1.00      0.99      0.99       100
           4       0.99      0.97      0.98        88

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97  

In [24]:
# Decision tree: testing the model on user input data
text_features = tfidf.transform([input('Input news:')])
prediction = model_dec_tree.predict(text_features)

print("Decision tree Predicted as: ", id_to_category[prediction[0]])

Input news:FA Cup: Watch all Leicester's goals from their journey to the semi-finals
Decision tree Predicted as:  sport


In [23]:
# Logistic regression: testing the model on user input data
text_features = tfidf.transform([input('Input news:')])
prediction = model_log_reg.predict(text_features)

print("Logistic regression Predicted as: ", id_to_category[prediction[0]])

Input news:Prince Philip funeral: William and Harry seen chatting after ceremony
Logistic regression Predicted as:  entertainment
