#                                                             HW 6

### Loading of the libraries and Dataset

In [115]:
import pandas as pd
import os
import re
import numpy as np
from bs4 import BeautifulSoup
import Scraper as scraper
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier as NB
from nltk import word_tokenize, FreqDist,classify
import collections

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import itertools
from nltk import word_tokenize, FreqDist, classify, ConditionalFreqDist

In [161]:
dataset = pd.read_excel('files/Labeling results.xlsx')
dataset.head()

Unnamed: 0,filename,companyname,Year,pos
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0


### Extraction of the MDAs

In [4]:
def extract_MDA(filename):
    # Credits for the scraper package go to GROUP 7
    mda_text = ''
    with open('files/' + filename) as file:
        soup = BeautifulSoup(file, "lxml")
        
        # We first try the scrape by regex method
        try:
            mda_text = scraper.scrapeByRegex(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                return mda_text
        except:
            pass

        # We then try the scrapeByAnchorTag method if the previous method didn;t work
        try:
            mda_text = scraper.scrapeByAnchorTag(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                return mda_text
        except:
            pass

In [162]:
dataset['text'] = dataset.filename.apply(extract_MDA)
dataset.dropna(inplace=True)
dataset.head()

Unnamed: 0,filename,companyname,Year,pos,text
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1,Item 7 Management s Discussion and Analys...
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1,Item 7 Management s Discussion and Analysis o...
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0,ITEM 7 MANAGEMENT S DISCUSSION AND AN...
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1,ITEM 7 MANAGEMENT S DISCUSSION AND ANALYSIS...
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0,Item 7 Management s Discussion and Analysis ...


### Methods using Naive Bayes

In [6]:
# We split the dataset in train/test ratio: 0.30
train_set, test_set = train_test_split(dataset, test_size = 0.30)

In [163]:
train_set_words = []

for mda in train_set.text.values:
    words = word_tokenize(mda)
    for word in words:
        stand_word = word.lower() 
        if ((not re.search(r'[0-9]', stand_word)) and (stand_word not in train_set_words)):
                      train_set_words.append(stand_word)

all_words = list(set(train_set_words))
print(len(all_words))
print(all_words[0:50])

5580
[u'four', u'increase', u'granting', u'eligible', u'electricity', u'xto', u'chirally', u'sinking', u'regional', u'dell', u'pigment', u'appropriation', u'bringing', u'internally', u'bioplastics', u'specialties', u'straight', u'specially', u'paperage', u'feasibility', u'second', u'errors', u'contributed', u'bonuses', u'fossil', u'increasing', u'agreement', u'affiliates', u'reprogrammed', u'china', u'affiliated', u'borrowers', u'cyclical', u'k', u'deferring', u'reports', u'military', u'cancellation', u'appropriately', u'classification', u'explained', u'replace', u'brought', u'pound', u'unit', u'derivatives', u'dnl', u'century', u'therefore', u'strike']


### Baseline Model

In [164]:
def mda_features(mda):
    mda_words = word_tokenize(mda.text)
    features = {}
    stand_mda_words = [word.lower() for word in mda_words]
    for word in all_words:
        features['contains({})'.format(word)] = word in mda_words
    return (features, mda.pos)

#### Model training

In [193]:
feature_train_set = train_set.apply(mda_features,axis=1)
classifier = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [195]:
classifier.show_most_informative_features(20)

Most Informative Features
         contains(apply) = True                1 : 0      =      5.9 : 1.0
      contains(improved) = True                1 : 0      =      5.9 : 1.0
         contains(eight) = True                1 : 0      =      5.9 : 1.0
      contains(maturity) = True                1 : 0      =      5.9 : 1.0
    contains(reasonably) = True                1 : 0      =      5.9 : 1.0
      contains(reducing) = True                1 : 0      =      5.3 : 1.0
         contains(ratio) = True                1 : 0      =      5.3 : 1.0
contains(transportation) = True                1 : 0      =      5.3 : 1.0
      contains(unfunded) = True                1 : 0      =      5.3 : 1.0
  contains(successfully) = True                1 : 0      =      4.7 : 1.0
        contains(health) = True                1 : 0      =      4.7 : 1.0
           contains(gas) = True                1 : 0      =      4.7 : 1.0
        contains(expand) = True                1 : 0      =      4.7 : 1.0

In [196]:
test_set_pred = classifier.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.5
             precision    recall  f1-score   support

          0       0.62      0.62      0.62         8
          1       0.25      0.25      0.25         4

avg / total       0.50      0.50      0.50        12



### Improvement number 1: Bigrams

In [201]:
def mda_features_bis(mda):
    mda_words = word_tokenize(mda.text)
    stand_mda_words = [word.lower() for word in mda_words]
    bigram_finder = BigramCollocationFinder.from_words(stand_mda_words)
    score_fn = BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 100)
    return (dict([(ngram, True) for ngram in itertools.chain(stand_mda_words, bigrams)]),mda.pos)

#### Model training

In [202]:
feature_train_set = train_set.apply(mda_features_bis,axis=1)
classifier2 = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [200]:
classifier2.show_most_informative_features(20)

Most Informative Features
              reasonably = True                1 : 0      =      5.9 : 1.0
                    safe = True                1 : 0      =      5.9 : 1.0
                improved = True                1 : 0      =      5.9 : 1.0
                   apply = True                1 : 0      =      5.9 : 1.0
                   eight = True                1 : 0      =      5.9 : 1.0
                computer = True                0 : 1      =      5.4 : 1.0
             disclosures = None                0 : 1      =      5.4 : 1.0
                adjusted = True                1 : 0      =      5.3 : 1.0
                unfunded = True                1 : 0      =      5.3 : 1.0
            registration = True                1 : 0      =      5.3 : 1.0
          transportation = True                1 : 0      =      5.3 : 1.0
                reducing = True                1 : 0      =      5.3 : 1.0
                   ratio = True                1 : 0      =      5.3 : 1.0

In [191]:
test_set_pred = classifier2.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier2, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.333333333333
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         8
          1       0.33      1.00      0.50         4

avg / total       0.11      0.33      0.17        12



### Improvement number 2: Reducing feature space

In [192]:
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

In [167]:
for row in train_set.iterrows():
    mdatxt = row[1]['text']
    stand_mdatxt = mdatxt.lower()
    mda_words = word_tokenize(mdatxt)
    word_fd.update(mda_words)
    if (row[1]['pos'] ==1):
        label_word_fd['pos'].update(mda_words)
    else:
        label_word_fd['neg'].update(mda_words)
    
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

In [168]:
word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

In [169]:
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:50]
bestwords = set([w for w, s in best])
print bestwords

set([u'Folio', u'End', u'No', u'year', u'Field', u'Ely', u'type', u'Body', u'Begin', u'PRC', u'Company', u'recorded', u's', u'shares', u'payments', u'ASC', u'of', u'Hamilton', u'LLC', u'2006', u'2011', u'2010', u'Kinross', u'stock', u'DHI', u'December', u'Head', u'Mt', u'000', u'PBM', u'during', u'segment', u'properties', u'colindex', u'1997', u'ended', u'1999', u'1998', u'Sequence', u'MH', u'joint', u'Collar', u'2000', u'2001', u'exploration', u'2007', u'2004', u'2005', u'Table', u'the'])


In [170]:
def mda_features_ter(mda):
    mda_words = word_tokenize(mda.text)
    features = {}
    stand_mda_words = [word.lower() for word in mda_words]
    for word in all_words:
        if word in bestwords:
            features['contains({})'.format(word)] = word in mda_words
    return (features, mda.pos)

#### Model training

In [176]:
feature_train_set = train_set.apply(mda_features_ter,axis=1)
classifier3 = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [178]:
classifier3.show_most_informative_features(20)

Most Informative Features
      contains(colindex) = True                1 : 0      =      2.8 : 1.0
       contains(segment) = True                1 : 0      =      2.8 : 1.0
      contains(payments) = False               0 : 1      =      2.5 : 1.0
         contains(joint) = True                0 : 1      =      2.4 : 1.0
         contains(stock) = False               0 : 1      =      1.8 : 1.0
    contains(properties) = True                1 : 0      =      1.7 : 1.0
          contains(type) = True                1 : 0      =      1.7 : 1.0
        contains(shares) = False               1 : 0      =      1.7 : 1.0
       contains(segment) = False               0 : 1      =      1.6 : 1.0
         contains(joint) = False               1 : 0      =      1.4 : 1.0
    contains(properties) = False               0 : 1      =      1.3 : 1.0
          contains(type) = False               0 : 1      =      1.3 : 1.0
      contains(colindex) = False               0 : 1      =      1.3 : 1.0

In [179]:
test_set_pred = classifier3.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier3, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.666666666667
             precision    recall  f1-score   support

          0       0.75      0.75      0.75         8
          1       0.50      0.50      0.50         4

avg / total       0.67      0.67      0.67        12

