## DATA 620 Assignment 5.2 - Document Classification
Team: Andy, Walt, and Nathan
    
This is an attempt to classify the Reuters Corpus of 10,000 news articles in 90 categories, with potential multiple categories per article. The model evaluation was too much for a personal computer to handle. Given the time constraints in the class we opted to use a Corpus with fewer categories (the Brown Corpus), then try to implement the code using a Big Data Solution (such as Spark using Databricks, as example), even though that is the more interesting and practicle solution.

We included this draft of the Jupyter notebook for the sake of completeness. 

In [1]:
import nltk

nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
english_stops = stopwords.words('english')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Exploration of the corpus
print(reuters.fileids()[3010:3030])
print(type(reuters.fileids()))
print(len(reuters.fileids()))

['test/21565', 'test/21567', 'test/21568', 'test/21570', 'test/21571', 'test/21573', 'test/21574', 'test/21575', 'test/21576', 'training/1', 'training/10', 'training/100', 'training/1000', 'training/10000', 'training/10002', 'training/10005', 'training/10008', 'training/10011', 'training/10014', 'training/10015']
<class 'list'>
10788


In [3]:
#Find the training test split
test_fileIds = reuters.fileids()[0:3019]
training_fileIds = reuters.fileids()[3019:]
print(test_fileIds[3013:3019])
print(training_fileIds[0:5])
print(len(test_fileIds)/len(training_fileIds))

['test/21570', 'test/21571', 'test/21573', 'test/21574', 'test/21575', 'test/21576']
['training/1', 'training/10', 'training/100', 'training/1000', 'training/10000']
0.38859570086240186


In [4]:
test_categories = [{fileId:reuters.categories(fileId)} for fileId in test_fileIds]
print(test_categories[100])

train_categories = [{fileId:reuters.categories(fileId)} for fileId in training_fileIds]
print(train_categories[100])

{'test/15023': ['earn']}
{'training/1016': ['earn']}


In [5]:
test_corpus = [{fileId:reuters.words(fileId)} for fileId in test_fileIds]
print(test_corpus[100])

train_corpus = [{fileId:reuters.words(fileId)} for fileId in training_fileIds]
print(train_corpus[100])

{'test/15023': ['CITYTRUST', 'BANCORP', 'INC', '&', 'lt', ';', 'CITR', ...]}
{'training/1016': ['AMERICAN', 'STORES', '&', 'lt', ';', 'ASC', '>', ...]}


In [6]:
#Adapted from https://campus.datacamp.com/courses/natural-language-processing-fundamentals-in-python/

wordnet_lemmatizer = WordNetLemmatizer()

def clean_corpus(corpus):
    cleaned_corpus = []
    for article in corpus:
        dct = {}
        for k,v in article.items():
        
            # Convert the tokens into lowercase: lower_tokens
            lower_tokens = [t.lower() for t in v]
        
            # Retain alphabetic words: alpha_only - not sure if I want to keep.
            alpha_only = [t for t in lower_tokens if t.isalpha()]

            # Remove all stop words: no_stops
            no_stops = [t for t in alpha_only if t not in english_stops]

            # Instantiate the WordNetLemmatizer
            wordnet_lemmatizer = WordNetLemmatizer()

            # Lemmatize all tokens into a new list: lemmatized
            lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]
        
            dct[k] = lemmatized
        cleaned_corpus.append(dct)
   
    return cleaned_corpus

clean_train = clean_corpus(train_corpus)
print(train_corpus[100])
print(clean_train[100])
clean_test = clean_corpus(test_corpus)
print(test_corpus[100])
print(clean_test[100])

{'training/1016': ['AMERICAN', 'STORES', '&', 'lt', ';', 'ASC', '>', ...]}
{'training/1016': ['american', 'store', 'lt', 'asc', 'see', 'lower', 'year', 'net', 'american', 'store', 'co', 'said', 'expects', 'report', 'earnings', 'per', 'share', 'dlrs', 'per', 'share', 'sale', 'slightly', 'billion', 'dlrs', 'year', 'ended', 'january', 'supermarket', 'chain', 'earned', 'dlrs', 'per', 'share', 'sale', 'billion', 'dlrs', 'last', 'year', 'company', 'elaborate']}
{'test/15023': ['CITYTRUST', 'BANCORP', 'INC', '&', 'lt', ';', 'CITR', ...]}
{'test/15023': ['citytrust', 'bancorp', 'inc', 'lt', 'citr', 'qtr', 'net', 'shr', 'dlrs', 'v', 'dlrs', 'net', 'v', 'avg', 'shrs', 'v']}


In [7]:
def catagory_corpus(catagories,corpus):
    cat_corp_list = []
    
    for index,dct in enumerate(catagories):
        for k,v in dct.items():
            corp_dct = corpus[index]
            cat_corp_list.append((v,corp_dct[k]))
    return cat_corp_list
            
cat_corp_test = catagory_corpus(test_categories,clean_test)
print(cat_corp_test[100])

cat_corp_train = catagory_corpus(train_categories,clean_train)
print(cat_corp_train[100])


(['earn'], ['citytrust', 'bancorp', 'inc', 'lt', 'citr', 'qtr', 'net', 'shr', 'dlrs', 'v', 'dlrs', 'net', 'v', 'avg', 'shrs', 'v'])
(['earn'], ['american', 'store', 'lt', 'asc', 'see', 'lower', 'year', 'net', 'american', 'store', 'co', 'said', 'expects', 'report', 'earnings', 'per', 'share', 'dlrs', 'per', 'share', 'sale', 'slightly', 'billion', 'dlrs', 'year', 'ended', 'january', 'supermarket', 'chain', 'earned', 'dlrs', 'per', 'share', 'sale', 'billion', 'dlrs', 'last', 'year', 'company', 'elaborate'])


In [8]:
all_words_lower = [w.lower() for w in reuters.words()]
#print(len(all_words_lower))

alpha_only_all_words = [t for t in all_words_lower if t.isalpha()]

            # Remove all stop words: no_stops
no_stops_all_words = [t for t in alpha_only_all_words if t not in english_stops]


            # Lemmatize all tokens into a new list: lemmatized
word_features = [wordnet_lemmatizer.lemmatize(t) for t in no_stops_all_words]

word_features = set(word_features)
#all_words_treated = nltk.FreqDist(lemmatized_all_words)
print(list(word_features)[:10])

['growmark', 'daiwa', 'advisable', 'immunity', 'comtech', 'australia', 'coordinating', 'impression', 'vdo', 'diaz']


In [9]:
category_set = set(reuters.categories())
print(category_set)

{'rubber', 'jobs', 'acq', 'potato', 'heat', 'propane', 'coconut-oil', 'lin-oil', 'wheat', 'veg-oil', 'soy-oil', 'interest', 'naphtha', 'cotton-oil', 'earn', 'coffee', 'meal-feed', 'soy-meal', 'sunseed', 'castor-oil', 'rand', 'hog', 'wpi', 'palm-oil', 'copper', 'ship', 'dfl', 'orange', 'lei', 'silver', 'strategic-metal', 'tin', 'coconut', 'bop', 'carcass', 'instal-debt', 'palmkernel', 'rice', 'lumber', 'cpu', 'copra-cake', 'crude', 'rapeseed', 'tea', 'sun-oil', 'money-supply', 'groundnut', 'corn', 'ipi', 'jet', 'cocoa', 'platinum', 'dlr', 'yen', 'lead', 'groundnut-oil', 'nat-gas', 'nickel', 'dmk', 'barley', 'sorghum', 'gold', 'cotton', 'pet-chem', 'zinc', 'housing', 'money-fx', 'oilseed', 'nkr', 'trade', 'livestock', 'reserves', 'alum', 'gnp', 'gas', 'income', 'palladium', 'oat', 'l-cattle', 'sugar', 'retail', 'soybean', 'cpi', 'rape-oil', 'rye', 'sun-meal', 'grain', 'iron-steel', 'nzdlr', 'fuel'}


In [10]:
def document_features(document):
    doc_set = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document)
    return features

def category_vector(cat_list):
    cat_set = set(cat_list)
    cat_vec = []
    for cat in category_set:
        if cat in cat_set:
            cat_vec.append(1)
        else:
            cat_vec.append(0)
    return str(cat_vec)

#There are a lot of falses, so I am going to print trues only
cat,doc = cat_corp_train[100]
for k,v in document_features(doc).items():
    if v == True:
        print(k,v)
    else:
        continue
print(cat)        
print(category_vector(cat))

cat,doc = cat_corp_test[100]
print(cat) 
print(category_vector(cat))        
        
train_set = [(document_features(d),category_vector(c)) for (c,d) in cat_corp_train]
print(len(train_set))
test_set = [(document_features(d),category_vector(c)) for (c,d) in cat_corp_test]
print(len(test_set))


contains(elaborate) True
contains(store) True
contains(asc) True
contains(lt) True
contains(ended) True
contains(earnings) True
contains(company) True
contains(last) True
contains(year) True
contains(lower) True
contains(share) True
contains(net) True
contains(said) True
contains(chain) True
contains(january) True
contains(per) True
contains(dlrs) True
contains(expects) True
contains(american) True
contains(report) True
contains(slightly) True
contains(billion) True
contains(supermarket) True
contains(sale) True
contains(see) True
contains(earned) True
contains(co) True
['earn']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['earn']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [11]:
print(train_set[100][1])
print(test_set[100][1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [12]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
#classifier = nltk.DecisionTreeClassifier.train(train_set)
#classifier = nltk.MaxentClassifier.train(train_set)

In [15]:
print(classifier.classify(test_set[100][0]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Note that the Classifier was able to classify test_set[100] correctly. The 1 is 15th in the vector, the same as the vector above. However, nltk.classify.accuracy() takes more than 8 hours to run. So we have some evidence that the code works, however evaluting the model seems out of practical reach.
