In [11]:
import pandas as pd
import nltk
import os
import re
from bs4 import BeautifulSoup
import Scraper as scraper
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

<h3> Data Processing</h3>

In [2]:
dataset = pd.read_excel('files/Labeling results.xlsx')

In [3]:
dataset.head()

Unnamed: 0,filename,companyname,year,pos
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0


In [4]:
def extract_MDA(filename):
    # Credits for the scraper package go to GROUP 7
    mda_text = ''
    with open('files/' + filename) as file:
        soup = BeautifulSoup(file, "lxml")
        
        # We first try the scrape by regex method
        try:
            mda_text = scraper.scrapeByRegex(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                return mda_text
        except:
            pass

        # We then try the scrapeByAnchorTag method if the previous method didn;t work
        try:
            mda_text = scraper.scrapeByAnchorTag(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                return mda_text
        except:
            pass

In [5]:
dataset['text'] = dataset.filename.apply(extract_MDA)
dataset.dropna(inplace=True)
dataset.head()

Unnamed: 0,filename,companyname,year,pos,text
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1,Item 7 Management s Discussion and Analys...
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1,Item 7 Management s Discussion and Analysis o...
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0,ITEM 7 MANAGEMENT S DISCUSSION AND AN...
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1,ITEM 7 MANAGEMENT S DISCUSSION AND ANALYSIS...
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0,Item 7 Management s Discussion and Analysis ...


<h3>Naive Bayes Classification</h3>

In [6]:
# We split the dataset in train/test ratio: 0.30
train_set, test_set = train_test_split(dataset, test_size = 0.30)

# We initiate the classifier
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.text.values)
classifier = MultinomialNB(fit_prior="False")                 

In [7]:
# We fit the training set
classifier.fit(counts, train_set.pos.values)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior='False')

In [8]:
# Let's do some prediction on the test set
predictions = classifier.predict(vectorizer.transform(test_set.text.values)) 
test_set_pred = pd.Series(predictions, index=test_set.index)

<h3>Confusion Matrix</h3>

In [9]:
tab = pd.crosstab(test_set.pos, test_set_pred, rownames=['Actual'], colnames=['Predicted'], margins=True) # Print confusion matrix
print(tab)


Predicted  0  1  All
Actual              
0          5  1    6
1          2  1    3
All        7  2    9


<h3>Classification Report</h3>

In [10]:
print(classification_report(test_set.pos, test_set_pred)) # Print accuracy, precision, recall, F measure

             precision    recall  f1-score   support

          0       0.71      0.83      0.77         6
          1       0.50      0.33      0.40         3

avg / total       0.64      0.67      0.65         9

