In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from collections import Counter
from sklearn.manifold import TSNE

import operator
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import math
import matplotlib.pyplot as plt

In [2]:
title = "20_newsgroups"
unique_labels = ['comp.graphics', 'rec.sport.hockey', 'sci.med', 'sci.space', 'talk.politics.misc']

In [3]:
paths = []
labels = []
for l in unique_labels:
    for (dirpath, dirnames, filenames) in os.walk(str(os.getcwd())+'/'+title+'/'+str(l)):
        for i in filenames:
            paths.append(str(dirpath)+str("/")+i)
            labels.append(l)

In [4]:
len(paths), len(labels)

(5000, 5000)

In [5]:
def print_doc(id):
    file = open(paths[id], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

In [6]:
def convert_numbers(k):
    for i in range(len(k)):
        try:
            k[i] = num2words(int(k[i]))
        except:
            pass
    return k

In [7]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [8]:
def preprocess(pd):
    pd = pd.str.lower()
    pd = pd.str.replace('[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\t'), ' ')
    pd = pd.apply(lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
    pd = pd.apply(lambda x: convert_numbers(x))
    pd = pd.str.join(' ')
    pd = pd.str.replace('[{}]'.format(string.punctuation), ' ')
    
#     pd = pd.apply(lambda x: [w for w in w_tokenizer.tokenize(x)])    
    pd = pd.apply(lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])    
    pd = pd.apply(lambda x: [item for item in x if item not in stop_words])
#     pd = pd.apply(lambda x: [stemmer.stem(y) for y in x])
    return pd

In [9]:
def calc_accuracy(predicted, true):
    tp_tn = len([1 for i in range(len(predicted)) if predicted[i]==true[i]])
    return tp_tn/len(predicted)

In [10]:
def compute_confusion(predicted, true):
    confusion = np.zeros((len(unique_labels), len(unique_labels))).astype(int)
    for i in range(len(predicted)):
        confusion[unique_labels.index(predicted[i])][unique_labels.index(true[i])] += 1
    return confusion

In [11]:
stop_words = stopwords.words('english')

In [12]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [13]:
# doc = 0
# docs = []

# for path in paths:
#     file = open(path, 'r', encoding='cp1250')
#     text = file.read().strip()
#     file.close()
    
#     docs.append(text)
    
#     if doc%1000 == 0:
#         print(doc)

#     doc += 1
# docs_pd = pd.DataFrame([docs, labels]).T
# docs_pd[0] = preprocess(docs_pd[0])
# docs_pd.to_pickle("docs_pd")

In [14]:
docs_pd = pd.read_pickle("docs_pd")

In [15]:
train = docs_pd.sample(frac=0.5,random_state=41)

In [16]:
test = docs_pd.sample(frac=1,random_state=41).drop(train.index)

In [17]:
train = train.reset_index(drop=True)

In [18]:
test = test.reset_index(drop=True)

In [19]:
train.head()

Unnamed: 0,0,1
0,"[xref, cantaloupesrvcscmuedu, scispace60876, s...",sci.space
1,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",sci.med
2,"[xref, cantaloupesrvcscmuedu, talkpoliticsmisc...",talk.politics.misc
3,"[path, cantaloupesrvcscmueducrabapplesrvcscmue...",rec.sport.hockey
4,"[newsgroups, recsporthockeypath, cantaloupesrv...",rec.sport.hockey


In [20]:
test.head()

Unnamed: 0,0,1
0,"[newsgroups, talkpoliticsmiscpath, cantaloupes...",talk.politics.misc
1,"[newsgroups, talkpoliticsmiscpath, cantaloupes...",talk.politics.misc
2,"[xref, cantaloupesrvcscmuedu, mischeadlines417...",talk.politics.misc
3,"[newsgroups, scimedpath, cantaloupesrvcscmuedu...",sci.med
4,"[xref, cantaloupesrvcscmuedu, compinfosystemsg...",comp.graphics


In [21]:
train_class_split = Counter(train[1])

In [22]:
train_class_split

Counter({'sci.space': 499,
         'sci.med': 505,
         'talk.politics.misc': 499,
         'rec.sport.hockey': 513,
         'comp.graphics': 484})

In [23]:
m = {}
for i in range(train.shape[0]):
    try:
        m[train[1][i]] = m[train[1][i]] + train[0][i]
    except:
        m[train[1][i]] = train[0][i]

In [24]:
unique_words = set()
for i in m:
    unique_words = unique_words | set(m[i])
unique_words_count = len(unique_words)

In [25]:
class_frequency = {}
class_count = {}
counter = 0
for i in unique_labels:
    current_count = len(Counter(m[i]))
    class_count[i] = current_count
    counter += current_count
    ll = Counter(m[i])
    for j in ll:
        class_frequency[i, j] = ll[j]

In [26]:
# class_frequency

In [27]:
def get_word_freq(word, label):
    try:
        return class_frequency[label, word], class_count[label]
    except:
        return 0, class_count[label]

In [28]:
len(class_frequency) == counter

True

In [29]:
train_class_split = Counter(train[1])
true = []
predicted = []
for i in range(test.shape[0]):
    true.append(test[1][i])
    classes_words_probability = []
    for l in unique_labels:
        words_probability = 0
        for word in test[0][i]:
            fr, cn = get_word_freq(word, l)
            pp = (fr + 1)/(cn + unique_words_count)
            words_probability += np.log(pp)
        words_probability += np.log(train_class_split[l] / train.shape[0])
        classes_words_probability.append(words_probability)
    predicted.append(unique_labels[np.argmax(classes_words_probability)])

In [30]:
compute_confusion(predicted, true)

array([[501,   0,   9,   6,   1],
       [  1, 483,   6,   4,   5],
       [  3,   0, 451,   1,   0],
       [  5,   2,   8, 482,   4],
       [  6,   2,  21,   8, 491]])

In [31]:
calc_accuracy(predicted, true)

0.9632

# Question 2

In [32]:
percentage = 10

In [33]:
corpus = []
for i in m:
    print(len(m[i]))
    corpus = corpus + m[i]

110804
101130
129503
111274
102056


In [34]:
len(corpus)

554767

In [35]:
DF = {}

n = 0
for tokens in train[0]:
    for w in tokens:
        try:
            DF[w].add(n)
        except:
            DF[w] = {n}
    n += 1
for i in DF:
    DF[i] = len(DF[i])

In [36]:
# DF

In [37]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [38]:
len(corpus)

554767

In [39]:
tf_idf = {}
N = train.shape[0]

counter = Counter(corpus)
words_count = len(corpus)

for token in set(corpus):
    tf = counter[token]/words_count
    df = doc_freq(token)
    idf = np.log((N+1)/(df+1))

    tf_idf[token] = tf*idf


In [40]:
sorted_x = sorted(tf_idf.items(), key=operator.itemgetter(1), reverse=True)

In [41]:
# sorted_x

In [42]:
# refined_data = {}
# for i in sorted_x:
#     try:
#         if len(refined_data[i[0][0]])<topk:
#             refined_data[i[0][0]].add(i[0][1])
#     except:
#         refined_data[i[0][0]] = {i[0][1]}

In [43]:
len(sorted_x)

85789

In [44]:
refined_data = sorted_x[:int(len(sorted_x)*percentage/100)]

In [45]:
refined_data = [i[0] for i in refined_data]

In [46]:
unique_words_count = len(sorted_x[:int(len(sorted_x)*percentage/100)])

In [47]:
class_frequency = {}
class_count = {}
for i in unique_labels:
    ll = Counter(m[i])
    for j in refined_data:
        class_frequency[i, j] = ll[j]
        try:
            class_count[i] = class_count[i] + ll[j]
        except:
            class_count[i] = ll[j]

In [48]:
# class_frequency

In [49]:
class_count['sci.med']

79110

In [50]:
def get_word_freq(word, label):
    try:
        return class_frequency[label, word], class_count[label]
    except:
        return 0, class_count[label]

In [51]:
train_class_split = Counter(train[1])
true = []
predicted = []
for i in range(test.shape[0]):
    true.append(test[1][i])
    classes_words_probability = []
    for l in unique_labels:
        words_probability = 0
        for word in test[0][i]:
            fr, cn = get_word_freq(word, l)
            pp = (fr + 1) / (cn + unique_words_count)
            words_probability += np.log(pp)
        words_probability += np.log(train_class_split[l] / train.shape[0])
        classes_words_probability.append(words_probability)
    predicted.append(unique_labels[np.argmax(classes_words_probability)])

In [52]:
compute_confusion(predicted, true)

array([[511,   1,  12,  20,   8],
       [  0, 482,   3,   1,   5],
       [  2,   1, 474,   4,  27],
       [  3,   1,   5, 475,  16],
       [  0,   2,   1,   1, 445]])

In [53]:
calc_accuracy(predicted, true)

0.9548