# Project 2: Topic Classification (code borrowed from w207)

In this project, you'll work with text data from newsgroup postings on a variety of topics. You'll train classifiers to distinguish between the topics based on the text of the posts. In digit classification the input is relatively dense (a 28x28 matrix of pixels, many of which are non-zero); here, by contrast, we'll represent each document with a "bag-of-words" model. As you'll see, this makes the feature representation quite sparse -- only a few words of the total vocabulary are active in any given document. The bag-of-words assumption is that the label depends only on which words occur; their order is not important.

The SK-learn documentation on feature extraction will prove useful:
http://scikit-learn.org/stable/modules/feature_extraction.html

Each problem can be addressed succinctly with the included packages -- please don't add any more. Grading will be based on writing clean, commented code, along with a few short answers.

As always, you're welcome to work on the project in groups and discuss ideas on the course wall, but please prepare your own write-up and write your own code.

In [1]:
# Preprocessing of the raw CSV (shell pipeline below):
#   - filter out retweets (rows whose text starts with "RT @")
#   - remove empty lines
# cat notbot.csv.orig | grep -v "\"RT @" | grep -v "\,RT @" | grep -v '^[[:space:]]*$' |

In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *



In [3]:
import csv
import string


# Corpus statistics filled in by this cell and read by the following cells.
num_total_words = num_bot_words = num_notbot_words = 0
num_bot_docs = num_notbot_docs = 0
unique_words = {}   # vocabulary seen in training text: word -> 1
wordcount = {}      # word -> {0: count in notbot training text, 1: count in bot training text}
bot_test = []
bot_train = []
notbot_test = []
notbot_train = []

percent_test = 0.20


def _count_csv_rows(path):
    """Return how many rows csv.reader yields for `path`.

    The header row is included in the count, and the file is closed
    before returning.
    """
    with open(path) as handle:
        return sum(1 for _ in csv.reader(handle))


def _tokenize(text):
    """Split `text` on whitespace and strip punctuation from both ends of each token.

    Tokens that consisted only of punctuation come back as empty strings;
    callers decide whether to drop them.
    """
    return [token.strip(string.punctuation) for token in text.split()]


def _ingest_class(path, label, num_train, train_docs, test_docs,
                  required_columns=None):
    """Read one class's CSV, split it into train/test and tally training words.

    path             -- CSV file whose first row is a header and whose
                        column index 3 holds the text
    label            -- key updated in each `wordcount` entry (0 = notbot, 1 = bot)
    num_train        -- row-counter threshold; rows whose counter exceeds it
                        go to the test list
    train_docs       -- list that receives lower-cased training texts
    test_docs        -- list that receives lower-cased test texts
    required_columns -- if given, rows with a different number of columns are
                        skipped (they still advance the row counter)

    Updates the module-level `unique_words` and `wordcount` dictionaries and
    returns ``(num_tokens, num_words)``: the number of tokens in the training
    texts including empty ones, and the number of non-empty tokens.
    """
    num_tokens = 0
    num_words = 0
    with open(path) as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        # The counter is 1 after the header, so the first data row sees 2.
        for ctr, row in enumerate(reader, 2):
            if required_columns is not None and len(row) != required_columns:
                continue
            text = row[3].lower()
            if ctr > num_train:
                test_docs.append(text)
                continue
            train_docs.append(text)
            tokens = _tokenize(text)
            num_tokens += len(tokens)
            for word in tokens:
                if word == '':
                    continue
                num_words += 1
                unique_words[word] = 1
                if word not in wordcount:
                    # first is notbot, 2nd is bot
                    wordcount[word] = {0: 0, 1: 0}
                wordcount[word][label] += 1
    return num_tokens, num_words


# NOTE(review): these document counts include each file's header row (and, for
# notbot, rows later skipped for having the wrong column count); they also feed
# the class priors written in the next cell -- confirm that is intended.
num_bot_docs = _count_csv_rows('bot.csv')
num_bot_train = int(num_bot_docs * (1-percent_test))
num_notbot_docs = _count_csv_rows('notbot.csv')
num_notbot_train = int(num_notbot_docs * (1-percent_test))

############### process bot text ###############
tokens_seen, num_bot_words = _ingest_class(
    'bot.csv', 1, num_bot_train, bot_train, bot_test)
num_total_words += tokens_seen

print (len(unique_words), num_bot_train, num_bot_words, len(bot_test))

############### process not bot text ###############
# Only rows with exactly 11 columns are used for the notbot class.
tokens_seen, num_notbot_words = _ingest_class(
    'notbot.csv', 0, num_notbot_train, notbot_train, notbot_test,
    required_columns=11)
num_total_words += tokens_seen

print (len(unique_words), num_notbot_train, num_notbot_words, len(notbot_test))


(6400, 1200, 13384, 301)
(8316, 395, 6207, 86)


In [4]:
# Write the trained model to model.txt, one TAB-separated record per line:
#   <word> TAB <notbot count>,<bot count>,<P(word|notbot)>,<P(word|bot)>
# followed by a final "ClassPriors" record holding the document counts and
# the class priors in the same four-field layout.
# The `with` block guarantees the file is closed even if a write or a
# division fails part-way through.
with open('model.txt', 'w') as f:
    # here we calculate the conditional probability P(word|class=notbot) and P(word|class=bot)
    # P(word|class) = #occurrences of that word in the class / #total number of words in the class
    # (no smoothing: a word never seen in a class gets probability 0 for that class)
    for k, v in wordcount.items():
        condprob_termnotbot = float(v[0]) / num_notbot_words
        condprob_termbot = float(v[1]) / num_bot_words
        f.write("%s\t%d,%d,%0.10f,%0.10f\n" % (k, v[0], v[1], condprob_termnotbot, condprob_termbot))

    # Class priors: each class's share of the total document count.
    total_docs = num_notbot_docs + num_bot_docs
    notbot_prior = float(num_notbot_docs) / total_docs
    bot_prior = float(num_bot_docs) / total_docs
    f.write("%s\t%d,%d,%0.10f,%0.10f\n" % ("ClassPriors", num_notbot_docs, num_bot_docs, notbot_prior, bot_prior))


In [5]:
from math import log
from math import exp

class NaiveBayesModel(object):
    """Two-class Naive Bayes scorer loaded from a TAB-separated model file.

    Every non-blank line of the model file has the form
    ``<term> TAB <v0>,<v1>,<v2>,<v3>``; the four comma-separated values are
    parsed as floats and stored in ``self.model[term]``. Indices 2 and 3 are
    used as Pr(term | Class=0) and Pr(term | Class=1). The file must also
    contain a ``ClassPriors`` record, whose four values populate ``c0``,
    ``c1``, ``prClass0`` and ``prClass1``.
    """

    def __init__(self, modelFile):
        """Load the model stored at path `modelFile`.

        Raises ValueError if a non-blank line does not have exactly two
        TAB-separated fields or a value is not a float, and KeyError if the
        ``ClassPriors`` record is missing.
        """
        self.model = {}
        with open(modelFile) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                word, statsStr = line.split('\t')
                nword = word.replace(' ', '')
                # Store a real list: the values are indexed repeatedly later,
                # which a one-shot map() iterator (Python 3) cannot support.
                self.model[nword] = [float(x) for x in statsStr.split(",")]
        # Class priors: counts and probs (Pr(Class=0) and Pr(Class=1))
        self.c0, self.c1, self.prClass0, self.prClass1 = self.model["ClassPriors"]

    @staticmethod
    def _logOrNegInf(p):
        """Return log(p), or -inf when p is exactly zero."""
        if p != 0:
            return log(p)
        return float("-inf")

    def classify(self, doc):
        """Return unnormalised posteriors [score(Class=0), score(Class=1)] for `doc`.

        `doc` is an iterable of terms. Naive Bayes inference:
        Pr(Class=c | Doc) ~ Pr(Class=c) * Pr(word1 | Class=c) * Pr(word2 | Class=c) ...
        Terms that are not in the model are ignored, matching classifyInLogs.
        Long documents can underflow to 0.0; prefer classifyInLogs for those.
        """
        PrClass0GivenDoc = self.prClass0
        PrClass1GivenDoc = self.prClass1
        for word in doc:
            stats = self.model.get(word)
            if stats is None:
                continue
            PrClass0GivenDoc *= stats[2]
            PrClass1GivenDoc *= stats[3]
        return [PrClass0GivenDoc, PrClass1GivenDoc]

    # the natural log based version of this
    # helps avoid underflow issues
    def classifyInLogs(self, doc):
        """Return log-space scores [log score(Class=0), log score(Class=1)] for `doc`.

        log Pr(Class=c | Doc) ~ log Pr(Class=c) + sum over words of log Pr(word | Class=c)
        Terms that are not in the model are ignored. A zero probability (or a
        zero prior) makes that class's score -inf, and it stays -inf.
        """
        PrClass0GivenDoc = self._logOrNegInf(self.prClass0)
        PrClass1GivenDoc = self._logOrNegInf(self.prClass1)
        for word in doc:  # NOTE: Improvement: on loading one should convert probs to log probs!
            stats = self.model.get(word)
            if stats is None:
                continue
            # Adding -inf keeps the running total at -inf, so a single
            # zero-probability term rules the class out for the whole doc.
            PrClass0GivenDoc += self._logOrNegInf(stats[2])
            PrClass1GivenDoc += self._logOrNegInf(stats[3])
        return [PrClass0GivenDoc, PrClass1GivenDoc]

    def printModel(self):
        """Print the priors and every stored record to standard output."""
        # Single-argument print(...) calls produce the same text under both
        # Python 2 (print statement) and Python 3 (print function).
        print("NaiveBayes Model starts here\n----------------")
        print("PRIORS: prClass0=%04.3f, prClass1=%04.3f" % (self.prClass0, self.prClass1))
        for word, stats in self.model.items():
            print("Pr( %s | Class) %s" % (word, stats))  # all stored stats for the term
        print("NaiveBayes Model ENDS here\n----------------")
 
 

In [6]:

NBModel = NaiveBayesModel("model.txt")

# prediction on test set
y_true = []
y_pred = []


def _score_sentences(sentences, true_label, tag, out):
    """Classify each sentence, log the verdict to `out`, and record the labels.

    sentences  -- iterable of lower-cased texts to classify
    true_label -- gold label of every sentence (0 = notbot, 1 = bot)
    tag        -- text written between the CORRECT/INCORRECT verdict and the sentence
    out        -- open, writable file object

    Appends to the module-level `y_true` and `y_pred` lists. A sentence is
    predicted as bot (1) only when the bot score is strictly greater than the
    notbot score; ties are predicted as notbot (0).
    """
    for sent in sentences:
        words = [x.strip(string.punctuation) for x in sent.split()]
        words = [w for w in words if w != '']
        notbot_probability, bot_probability = NBModel.classifyInLogs(words)
        predicted = 1 if bot_probability > notbot_probability else 0
        verdict = "CORRECT" if predicted == true_label else "INCORRECT"
        out.write(verdict + tag + sent + "\n")
        y_true.append(true_label)
        y_pred.append(predicted)


# The `with` block closes output.txt even if scoring raises part-way through.
with open('output.txt', 'w') as f:
    _score_sentences(notbot_test, 0, " (true=notbot): ", f)
    print (len(y_true), len(y_pred))

    _score_sentences(bot_test, 1, " (true=bot):", f)
    print (len(y_true), len(y_pred))

print ("F1 score is : ", metrics.f1_score(y_true, y_pred, average='binary'))
print ("Accuracy score is : ", metrics.accuracy_score(y_true, y_pred))
print ("Recall score is : ", metrics.recall_score(y_true, y_pred))


(86, 86)
(387, 387)
('F1 score is : ', 0.84962406015037606)
('Accuracy score is : ', 0.79328165374677007)
('Recall score is : ', 0.75083056478405319)


In [7]:
#===========================================================================================================