In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import numpy as np
import re
from pprint import pprint

In [None]:
# Fetch the 20 Newsgroups posts, shuffled with a fixed seed; the `remove` argument
# asks scikit-learn to drop headers, footers and quoted replies from each post.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
X = dataset.data  # the documents (later passed to bow/use_bow as the text corpus)
y = dataset.target  # the class labels (later passed to the classifier and to accuracy_score)
# Hold out 33% of the documents for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=13)

In [None]:
def bow(documents, stop=[]):
    '''
    bow: creates a dictionary that assigns each word a bag-of-words (BOW) index
    pars:
    documents -> a text corpus
    stop -> a list (or set) of stopwords
    returns:
    d_bow -> a dictionary mapping a word to its array position
    '''
    # Exercise stub: the body is intentionally left unimplemented.
    pass


<details>
<summary><b>A solution</b></summary>
<code>
def bow(documents, stop=()):
    '''
    bow: creates a dictionary that assigns each distinct word a bag-of-words (BOW) index
    pars:
    documents -> a text corpus (an iterable of strings)
    stop -> a list (or set) of stopwords; they are matched case-insensitively and
            with the same punctuation stripping that is applied to the documents
    returns:
    d_bow -> a dictionary mapping a word to its array position; positions are
             consecutive integers starting at 0, in order of first appearance
    '''
    strip_chars = ".?,!;:\"'()"
    rgx = re.compile('[%s]' % strip_chars)
    # Tokens are lowercased and stripped of punctuation before the stopword check,
    # so a raw stopword such as "don't" or "The" could never match its token
    # ("dont", "the"). Keep the raw entries and add their normalized forms.
    stop = set(stop)
    stop |= {rgx.sub('', s.lower()) for s in stop}
    d_bow = {}
    for doc in documents:
        for word in doc.lower().split():
            word = rgx.sub('', word)
            # A token made only of punctuation (e.g. "...") strips down to '';
            # indexing it would add a meaningless feature column.
            if word and word not in d_bow and word not in stop:
                d_bow[word] = len(d_bow)
    return d_bow
</code>
</details>

In [None]:
def use_bow(documents, d_bow):
    '''
    create a document-term matrix based on a text corpus
    pars:
    documents -> a text corpus
    d_bow -> a dictionary for a BOW representation of text, giving the index of each word
    returns:
    an n x p matrix, where n is the number of documents, and p is the number of features in the corpus
    '''
    # Exercise stub: the body is intentionally left unimplemented.
    pass


<details>
<summary><b>A solution</b></summary>
<code>
def use_bow(documents, d_bow):
    '''
    build the document-term count matrix for a text corpus
    pars:
    documents -> a text corpus (must support len() and iteration)
    d_bow -> a dictionary for a BOW representation of text, giving the column index of each word
    returns:
    an n x p matrix of word counts, where n is the number of documents and p is
    the number of entries in d_bow; words missing from d_bow are ignored
    '''
    punctuation = re.compile('[%s]' % ".?,!;:\"'()")
    n_docs, n_terms = len(documents), len(d_bow)
    counts = np.zeros((n_docs, n_terms))
    for row, text in enumerate(documents):
        # Same normalization as the vocabulary builder: lowercase, split on
        # whitespace, then strip punctuation from each token.
        for raw_token in text.lower().split():
            token = punctuation.sub('', raw_token)
            if token in d_bow:
                counts[row, d_bow[token]] += 1
    return counts

</code>
</details>

In [None]:
# Build the vocabulary from the training split only, then encode both splits
# with that same vocabulary so their columns line up.
d_naive_bow = bow(X_train)
X_train_bow = use_bow(X_train, d_naive_bow)
X_test_bow = use_bow(X_test, d_naive_bow)
print(X_train_bow.shape)
#pprint(list(d_naive_bow.items())[:300])

In [None]:
# k-nearest-neighbours classifier configured with n_neighbors=50.
knn = KNeighborsClassifier(n_neighbors=50)
# Fit on the bag-of-words training matrix and its labels.
knn.fit(X_train_bow, y_train)

In [None]:
# Accuracy of the fitted classifier's predictions on the held-out test split.
accuracy_score(y_test, knn.predict(X_test_bow))