# Importing required package

In [1]:
import os
import re
import time 
import random
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading Data

In [2]:
# list of categories for training model
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.misc', 'comp.graphics', 'sci.space']
news_group_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, remove=['headers', 'footers', 'quotes'], random_state=42)
# display
print("Data target_name:\n", news_group_data.target_names)
print("Data: ", len(news_group_data['data']))

Data target_name:
 ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.politics.misc', 'talk.religion.misc']
Data:  4162


In [3]:
# frequency
target, count_target = np.unique(news_group_data.target, return_counts=True)
target_names = np.array(news_group_data.target_names)
print(list(zip(target_names, count_target)))

[('alt.atheism', 799), ('comp.graphics', 973), ('sci.space', 987), ('talk.politics.misc', 775), ('talk.religion.misc', 628)]


In [4]:
# input and target feature
X, y = news_group_data.data, news_group_data.target
print("Input Data: ", len(X))
print("Target:\n", y[:5])

Input Data:  4162
Target:
 [3 4 1 2 1]


# Data Processing
    - Lemmatizer - removing all the special characters, spaces, character from start, and also converting into lower case

In [5]:
# define lemmatization function
def lemmatize_words(document):
    """
    Function to use nltk lemmatization using regex module.
    Args:
    Sentence: document containing Sentence (str)
    Returns:
    Document: containing sentence after using lemmatization. 
    """
    # create an empty list
    input_document = list()
    # initialise WordNetLemmatizer class
    stemmer = WordNetLemmatizer()
    # iterating over data
    for texts in range(0, len(document)):
        # remove special character
        input_doc = re.sub(r'\W', ' ', str(document[texts]))
        # remove single character
        input_doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', input_doc)
        # removing single character from start
        input_doc = re.sub(r'\^[a-zA-Z]\s+', ' ', input_doc)
        # multiple spaces into single spaces
        input_doc = re.sub(r'\s+', ' ', input_doc, flags=re.I)
        # removing prefixes
        input_doc = re.sub(r'^b\s+', '', input_doc)
        # into lower case
        input_doc = input_doc.lower()
        input_doc = input_doc.split()
        input_doc = [stemmer.lemmatize(text) for text in input_doc]
        input_doc = ' '.join(input_doc)
        # append list
        input_document.append(input_doc)
        
    return input_document
        
# calling function
X = lemmatize_words(X)
# sanity check
print(X[:1])

['peter nelson posted very eloquent response to this point in talk politics misc so need not consume more bandwidth here what meant if it wa not clear wa the intersection set of liberal and libertarian philosophy of natural right and how the government and constitutional interpretation in particular fit into that philosophy this philosophy engages in the very serious practical error of endowing the nine lawyer on the supreme court with an almost totalitarian authority completely outside of the consent or consensus of the people this is why supreme court nomination are such amazing political fist fight these day because he who control the court rule the country the people on the court may well be trying to do the best job they can but they are at best benevolent oligarch even if you approve of every supreme court decision ever eventually an oligarch will arise that will decimate that which you hold dear try supreme court case by jury and the problem would be mitigated great deal those w

# Stopwrods and Tokenizer

In [6]:
# define tokenizer function
def tokenize_after_removing_stopwords(document):
    """
    Function to split the sentence into individual words.
    Args:
    Sentence: document containing Sentence (str)
    Returns:
    List of split individual words (Token) after removing stopwords
    """
    # create an empty list
    filtered_document = list()
    # initialize nltk.corpus.stopwrods
    stop_words = set(stopwords.words('english'))
    # initialize tokenizer
    tokenizer  = word_tokenize(document)
    document = [word for word in tokenizer if not word in stop_words]
    # append list
    filtered_document.append(document)
    
    return filtered_document

# calling function
X = tokenize_after_removing_stopwords(X)
print(X[:1])

TypeError: expected string or bytes-like object