# Required packages

In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

## nltk
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.datasets import load_files
# root folder of the dataset; load_files() uses each sub-folder name as a category
file_path = './bbc/'
# load_files() reads every file below file_path as one document; bytes that are
# not valid UTF-8 are substituted instead of raising (decode_error='replace').
# NOTE(review): the per-category counts printed by this notebook include a stray
# '.ipynb_checkpoints' class with a single document - consider removing that
# folder or passing `categories=[...]` to load_files; confirm before training.
bbc_data = load_files(file_path, encoding='utf-8', decode_error='replace')
type(bbc_data)

sklearn.utils.Bunch

In [3]:
# sanity check of data: how many documents does each category hold?
# np.unique returns the distinct integer labels together with their frequencies,
# so a single call is enough (no need to loop over the Bunch items).
labels, count = np.unique(bbc_data.target, return_counts=True)
# map the integer labels back to their folder (category) names
label_names = np.array(bbc_data.target_names)[labels]

# display count (frequency) per category
print(dict(zip(label_names, count)))

{'.ipynb_checkpoints': 1, 'business': 510, 'entertainment': 386, 'politics': 417, 'sport': 511, 'tech': 401}


In [4]:
# build a two-column DataFrame: the raw document text and its integer label
bbc_df = pd.DataFrame(
    dict(sentence=bbc_data.data, labels=bbc_data.target)
)
bbc_df.head()

Unnamed: 0,sentence,labels
0,Howard dismisses Tory tax fears\n\nMichael How...,3
1,Gritty return for Prince of Persia\n\nStill ba...,5
2,Philippoussis doubt over Open bid\n\nMark Phil...,4
3,Horror film heads US box office\n\nA low-budge...,2
4,Coach Ranieri sacked by Valencia\n\nClaudio Ra...,4


In [5]:
# data shape as (number of rows, number of columns)
print(bbc_df.shape)

(2226, 2)


# Data preprocessing
    - remove special (non-word) characters
    - remove single characters
    - remove a single character at the start of the text
    - collapse multiple spaces into a single space
    - remove a leading 'b' prefix
    - convert the text to lower case
    - remove English stopwords and lemmatize the remaining words

In [6]:
# define function for text processing
def texts_processing(data, stop_words=None, lemmatizer=None):
    """
    Clean the raw texts stored in ``data['sentence']`` for text classification.

    Every text is processed in this order: non-word characters are replaced
    by spaces, single letters are removed, runs of whitespace are collapsed,
    a leading ``b`` prefix is dropped, the text is lower-cased, stopwords
    are removed and the remaining words are lemmatized.

    Parameters:
        data: DataFrame (or mapping) with a 'sentence' column of strings;
            not a plain string or list.
        stop_words: optional iterable of lower-case words to drop.
            Defaults to NLTK's English stopword list.
        lemmatizer: optional object exposing ``lemmatize(word)``.
            Defaults to a ``WordNetLemmatizer``.
    Return: list with one cleaned text per row, in row order.
    Raises: TypeError if an entry of the 'sentence' column is not a string.
    """
    try:
        # nothing to do; also avoids a KeyError on an empty, column-less frame
        if len(data) == 0:
            return []
        sentences = data['sentence']
    except TypeError:
        # data is not column-indexable (e.g. a str or a list): report it and
        # return an empty corpus, as before
        print("TypeError: list indices must be integers or slices, not string ")
        return []

    # build the stopword set and the lemmatizer once instead of once per word/row
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # list to store the processed texts
    corpus_list = []
    # iterate over the values (not over positions) so any DataFrame index works
    for sentence in sentences:
        # remove special characters
        text = re.sub(r'\W', ' ', sentence)
        # remove single characters surrounded by whitespace
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
        # remove a single character at the start
        text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
        # multiple spaces into a single space
        text = re.sub(r'\s+', ' ', text)
        # remove a leading 'b' prefix
        text = re.sub(r'^b\s+', '', text)
        # lower-case and split into words
        words = text.lower().split()
        # drop stopwords, lemmatize what is left
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        corpus_list.append(' '.join(words))

    return corpus_list

# sanity check - run the function on a tiny frame of well-known quotes
sample_quotes = ["Love For All, Hatred For None.",
                 "Change the world by being yourself.",
                 "Every element is the fresh begining."]
raw_data = pd.DataFrame({'sentence': sample_quotes,
                         'labels': [0, 1, 2]})

# calling the function; stopwords such as 'for', 'all', 'the' should disappear
corpus = texts_processing(raw_data)
print(corpus)

['love hatred none', 'change world', 'every element fresh begining']


In [7]:
# calling the function on bbc_df to clean every document
bbc_corpus = texts_processing(bbc_df)
# show the first two cleaned documents as a quick check
print(bbc_corpus[:2])

['howard dismisses tory tax fear michael howard dismissed fear conservative plan 4bn tax cut modest defended package saying plan tory first budget hoped able go tory monday highlighted 35bn wasteful spending would stop allow tax cut reduced borrowing spending key service labour liberal democrat say party sum add claim would cut frontline service tory tax plan follows complaint party mp mr howard shadow chancellor oliver letwin taken long unveil proposal promised figure yet reveal tax would targeted tory backbencher edward leigh said proposal step right direction told financial time would come sooner much greater tax cut interviewed bbc radio 2 jeremy vine show mr howard said perfectly true attacked one side people think ought promising much much bigger tax cut spending cut side people say able achieve tax cut think got right mr howard said voter faced clear choice next election waste tax labour tory value money lower tax added would like able time sure able start got recognise limit on

# Bag of Words model