# HW 6

### Loading of the libraries and Dataset

In [7]:
import pandas as pd
import os
import re
import numpy as np
from bs4 import BeautifulSoup
import Scraper as scraper
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier as NB
from nltk import word_tokenize, FreqDist,classify, ConditionalFreqDist, pos_tag
from nltk.collocations import BigramCollocationFinder

import collections
import itertools

Apart from the 40 10K documents of last week, we ADDED 29 extra 10K documents in order to get a bigger dataset. In this case, we used the stock price annual return value of the 10K file in order to determine the sentiment [positive/negative]. If the annual return is negative, then the sentiment will be accordingly negative, and viceversa.

In [8]:
dataset = pd.read_excel('files/Labeling results.xlsx')
dataset.head()

Unnamed: 0,filename,companyname,year,pos
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0


### Extraction of the MDAs

In [10]:
def extract_MDA(filename):
    # Credits for the scraper package go to GROUP 7
    mda_text = ''
    with open('files/' + filename) as file:
        soup = BeautifulSoup(file, "lxml")
        
        # We first try the scrape by regex method
        try:
            mda_text = scraper.scrapeByRegex(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                mda_text = re.sub("\d+","",mda_text)
                return mda_text
        except:
            pass

        # We then try the scrapeByAnchorTag method if the previous method didn;t work
        try:
            mda_text = scraper.scrapeByAnchorTag(soup)
            if mda_text:
                mda_text = BeautifulSoup(mda_text, "html.parser").get_text()
                mda_text = re.sub('[^\w]', ' ', mda_text)
                mda_text = re.sub("\d+","",mda_text)
                return mda_text
        except:
            pass

In [11]:
dataset['text'] = dataset.filename.apply(extract_MDA)
dataset.dropna(inplace=True)
dataset.head()

Unnamed: 0,filename,companyname,year,pos,text
0,0000950123-10-115038.txt,American Pacific Corporation,2010,1,Item Management s Discussion and Analysi...
1,0000950129-06-002726.txt,FREMONT GENERAL CORP,2005,1,Item Management s Discussion and Analysis of...
2,0000950134-09-004608.txt,UNITED STATES LIME & MINERALS INC,2008,0,ITEM MANAGEMENT S DISCUSSION AND ANA...
3,0000950152-07-001610.txt,LUBRIZOL CORP,2006,1,ITEM MANAGEMENT S DISCUSSION AND ANALYSIS ...
4,0000807397-98-000073.txt,MARK SOLUTIONS INC,1998,0,Item Management s Discussion and Analysis o...


### Methods using Naive Bayes

In [12]:
# We split the dataset in train/test ratio: 0.30
train_set, test_set = train_test_split(dataset, test_size = 0.30)

In [13]:
train_set_words = []

for mda in train_set.text.values:
    words = word_tokenize(mda)
    for word in words:
        stand_word = word.lower() 
        train_set_words.append(stand_word)

all_words = list(set(train_set_words))
print(len(all_words))
print(all_words[0:50])

6199
[u'issuances', u'four', u'payoff', u'increase', u'granting', u'eligible', u'electricity', u'xto', u'chirally', u'sinking', u'regional', u'dell', u'pigment', u'hdtv', u'appropriation', u'bringing', u'internally', u'bioplastics', u'specialties', u'reliable', u'specially', u'paperage', u'feasibility', u'second', u'issuable', u'errors', u'contributed', u'bonuses', u'fossil', u'increasing', u'inducement', u'affiliates', u'reported', u'china', u'affiliated', u'borrowers', u'cyclical', u'k', u'deferring', u'reports', u'military', u'cancellation', u'appropriately', u'classification', u'explained', u'replace', u'brought', u'sizeable', u'unit', u'derivatives']


### Baseline Model

For the baseline model, we just create a bag of words for our documents.

In [14]:
def mda_features(mda):
    mda_words = word_tokenize(mda.text)
    features = {}
    stand_mda_words = [word.lower() for word in mda_words]
    for word in all_words:
        features['contains({})'.format(word)] = word in mda_words
    return (features, mda.pos)

#### Model training

In [15]:
feature_train_set = train_set.apply(mda_features,axis=1)
classifier = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [16]:
classifier.show_most_informative_features(20)

Most Informative Features
          contains(home) = True                1 : 0      =      8.3 : 1.0
       contains(options) = False               0 : 1      =      8.2 : 1.0
        contains(unpaid) = True                0 : 1      =      8.2 : 1.0
       contains(arising) = True                1 : 0      =      7.5 : 1.0
contains(interpretation) = True                1 : 0      =      7.5 : 1.0
        contains(hedges) = True                1 : 0      =      7.5 : 1.0
       contains(varying) = True                1 : 0      =      6.7 : 1.0
         contains(forma) = True                1 : 0      =      6.7 : 1.0
      contains(allocate) = True                1 : 0      =      6.7 : 1.0
       contains(weather) = True                1 : 0      =      6.7 : 1.0
     contains(estimable) = True                1 : 0      =      6.7 : 1.0
      contains(software) = True                0 : 1      =      5.9 : 1.0
contains(transportation) = True                1 : 0      =      5.9 : 1.0

In [17]:
test_set_pred = classifier.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.619047619048
             precision    recall  f1-score   support

          0       0.58      0.70      0.64        10
          1       0.67      0.55      0.60        11

avg / total       0.63      0.62      0.62        21



In [18]:
tab = pd.crosstab(test_set.pos, pd.DataFrame({'Prediction':test_set_pred}).Prediction, rownames=['Actual'], colnames=['Predicted'],margins=True) # Print confusion matrix
print(tab)

Predicted  0.0  1.0  All
Actual                  
0.0          5    1   10
1.0          1    0   11
All         12    9   35


### Improvement number 1: Bigrams

In [26]:
def mda_features_bis(mda):
    mda_words = word_tokenize(mda.text)
    stand_mda_words = [word.lower() for word in mda_words]
    stand_mda_words = list(set(stand_mda_words))
    bigram_finder = BigramCollocationFinder.from_words(stand_mda_words)
    score_fn = BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 50)
    return (dict([(ngram, True) for ngram in itertools.chain(stand_mda_words, bigrams)]),mda.pos)

#### Model training

In [27]:
feature_train_set = train_set.apply(mda_features_bis,axis=1)
classifier2 = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [28]:
classifier2.show_most_informative_features(20)

Most Informative Features
          interpretation = True                1 : 0      =      8.3 : 1.0
                  unpaid = True                0 : 1      =      8.2 : 1.0
                 options = None                0 : 1      =      7.6 : 1.0
                 arising = True                1 : 0      =      7.5 : 1.0
                  hedges = True                1 : 0      =      7.5 : 1.0
                 weather = True                1 : 0      =      6.7 : 1.0
                     fin = True                1 : 0      =      6.7 : 1.0
                   forma = True                1 : 0      =      6.7 : 1.0
                  annual = None                1 : 0      =      6.7 : 1.0
                 varying = True                1 : 0      =      6.7 : 1.0
               estimable = True                1 : 0      =      6.7 : 1.0
                allocate = True                1 : 0      =      6.7 : 1.0
               additions = True                0 : 1      =      6.5 : 1.0

In [29]:
test_set_pred = classifier2.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier2, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.47619047619
             precision    recall  f1-score   support

          0       0.48      1.00      0.65        10
          1       0.00      0.00      0.00        11

avg / total       0.23      0.48      0.31        21



In [30]:
tab = pd.crosstab(test_set.pos, pd.DataFrame({'Prediction':test_set_pred}).Prediction, rownames=['Actual'], colnames=['Predicted'],margins=True) # Print confusion matrix
print(tab)

Predicted  0.0  All
Actual             
0.0          6   10
1.0          1   11
All         21   35


### Improvement number 2: Reducing feature space

In [31]:
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

In [32]:
for row in train_set.iterrows():
    mdatxt = row[1]['text']
    stand_mdatxt = mdatxt.lower()
    mda_words = word_tokenize(mdatxt)
    word_fd.update(mda_words)
    if (row[1]['pos'] ==1):
        label_word_fd['pos'].update(mda_words)
    else:
        label_word_fd['neg'].update(mda_words)
    
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

In [33]:
word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

In [36]:
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:100]
bestwords = set([w for w, s in best])
print bestwords

set([u'ADD_GRID', u'photo', u'ended', u'paper', u'kiln', u'SFAS', u'PRC', u'Company', u'revenue', u'Gas', u'real', u'increased', u'Augusta', u'joint', u'royalty', u'during', u'colindex', u'WIDTH', u'Advanced', u'End', u'Lubrizol', u'gutter', u'rate', u'year', u'END', u'Type', u'BEGIN', u'lead', u'bottom', u'recorded', u'shares', u'PAGEBREAK', u'Hamilton', u'LLC', u'During', u'Kinross', u'common', u'body', u'Term', u'Natural', u'K', u'million', u'Mt', u'COMMAND', u'loans', u'properties', u'of', u'MH', u'US', u'Pagebreak', u'Arabic', u'No', u'Fremont', u'Ely', u'Arkansas', u'lime', u'Interests', u'RMB', u'pt', u'ASC', u'type', u'medium', u'corrugating', u'gas', u'volume', u'R', u'Noveon', u'solid', u'XBRL', u'project', u'printing', u'exploration', u'venture', u'December', u'property', u'PAGE', u'Lime', u'estate', u'sales', u'Field', u'Revolving', u'border', u'Begin', u'PageNo', u'payments', u'digital', u'stock', u'product', u'Name', u'DHI', u'hang', u'wells', u'segment', u'Page', u'Limes

In [37]:
def mda_features_ter(mda):
    mda_words = word_tokenize(mda.text)
    features = {}
    stand_mda_words = [word.lower() for word in mda_words]
    for word in all_words:
        if word in bestwords:
            features['contains({})'.format(word)] = word in mda_words
    return (features, mda.pos)

#### Model training

In [38]:
feature_train_set = train_set.apply(mda_features_ter,axis=1)
classifier3 = NB.train(feature_train_set)
feature_test_set = test_set.apply(mda_features,axis=1)

#### Results

In [39]:
classifier3.show_most_informative_features(20)

Most Informative Features
          contains(lime) = True                1 : 0      =      5.1 : 1.0
          contains(kiln) = True                1 : 0      =      5.1 : 1.0
        contains(during) = False               1 : 0      =      4.3 : 1.0
            contains(pt) = True                1 : 0      =      4.3 : 1.0
         contains(wells) = True                1 : 0      =      4.0 : 1.0
      contains(payments) = False               0 : 1      =      3.7 : 1.0
       contains(million) = False               0 : 1      =      3.7 : 1.0
          contains(body) = True                1 : 0      =      3.5 : 1.0
          contains(year) = False               1 : 0      =      3.5 : 1.0
       contains(natural) = True                1 : 0      =      3.3 : 1.0
           contains(gas) = True                1 : 0      =      3.1 : 1.0
        contains(border) = True                1 : 0      =      2.8 : 1.0
        contains(bottom) = True                1 : 0      =      2.8 : 1.0

In [40]:
test_set_pred = classifier3.classify_many([fs for (fs, l) in feature_test_set])
print("Accuracy: {}".format(classify.accuracy(classifier3, feature_test_set)))
print(classification_report(test_set.pos, test_set_pred))

Accuracy: 0.571428571429
             precision    recall  f1-score   support

          0       0.54      0.70      0.61        10
          1       0.62      0.45      0.53        11

avg / total       0.58      0.57      0.57        21



In [41]:
tab = pd.crosstab(test_set.pos, pd.DataFrame({'Prediction':test_set_pred}).Prediction, rownames=['Actual'], colnames=['Predicted'],margins=True) # Print confusion matrix
print(tab)

Predicted  0.0  1.0  All
Actual                  
0.0          5    1   10
1.0          1    0   11
All         13    8   35


### Improvement number 3: Multinomial Bayes Classifier 

In [48]:
# We split the dataset in train/test ratio: 0.30
train_set, test_set = train_test_split(dataset, test_size = 0.30)
# We initiate the classifier
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.text.values)
classifier4 = MultinomialNB(fit_prior="False") 

In [49]:
# We fit the training set
classifier4.fit(counts, train_set.pos.values)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior='False')

In [50]:
# Let's do some prediction on the test set
predictions = classifier4.predict(vectorizer.transform(test_set.text.values)) 
test_set_pred = pd.Series(predictions, index=test_set.index)

In [51]:
tab = pd.crosstab(test_set.pos, test_set_pred, rownames=['Actual'], colnames=['Predicted'], margins=True) # Print confusion matrix
print(tab)

Predicted   0  1  All
Actual               
0           8  3   11
1           5  5   10
All        13  8   21


In [66]:
print tab['All']['All']

21


In [71]:

print "Accuracy: %f" %(np.float(tab[0][0]+tab[1][1])/(tab['All']['All']))
print(classification_report(test_set.pos, test_set_pred)) # Print accuracy, precision, recall, F measure

Accuracy: 0.619048
             precision    recall  f1-score   support

          0       0.62      0.73      0.67        11
          1       0.62      0.50      0.56        10

avg / total       0.62      0.62      0.61        21



### Bonus: Improvement number 4: Use the Loughran McDonald dictionary

In [72]:
corp=[]
filenames=[]

for name in dataset['filename']:
      filenames.append(name)
      corp.append(extract_MDA(name))

filenames=[j for j in filenames if corp[filenames.index(j)] is not None]
corp = [i for i in corp if i is not None]

In [73]:
tf = TfidfVectorizer(analyzer='word', min_df = 0, stop_words = 'english')
tfidf_matrix =  tf.fit_transform(corp)
feature_names = tf.get_feature_names() 
tfidf_array = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_array)
tfidf_df.columns = [i.upper() for i in feature_names]

In [82]:
dict = pd.read_excel("LoughranMcDonald_MasterDictionary_2014.xlsx")
minidict = dict[dict['Word'].isin(tfidf_df.columns)] 
minidict = minidict.set_index('Word')
minidict.loc[minidict['Positive']>0, 'Positive'] = 1
minidict.loc[minidict['Negative']>0, 'Negative'] = -1
tfidf_df = tfidf_df.T 
tfidf_df.index.name='Word'

result_df = pd.merge(tfidf_df, minidict, how='left', left_index=True, right_index=True)
result_df = result_df[np.isfinite(result_df['Negative'])]
    

for i in range(0, len(corp)):
    print('Positive score for document ',i,' is: ', sum(result_df[i]*result_df['Positive']))

for i in range(0, len(corp)):
    print('Negative score for document ',i,' is: ', sum(result_df[i]*result_df['Negative']))        

('Positive score for document ', 0, ' is: ', 0.20953324040533489)
('Positive score for document ', 1, ' is: ', 0.20228559947637748)
('Positive score for document ', 2, ' is: ', 0.26371396239930872)
('Positive score for document ', 3, ' is: ', 0.0)
('Positive score for document ', 4, ' is: ', 0.2865921862948797)
('Positive score for document ', 5, ' is: ', 0.24003066534384965)
('Positive score for document ', 6, ' is: ', 0.22206673653978737)
('Positive score for document ', 7, ' is: ', 0.24122032881602809)
('Positive score for document ', 8, ' is: ', 0.15486669311879703)
('Positive score for document ', 9, ' is: ', 0.22035068466832977)
('Positive score for document ', 10, ' is: ', 0.24430912547070144)
('Positive score for document ', 11, ' is: ', 0.23427609002113481)
('Positive score for document ', 12, ' is: ', 0.21887508617275797)
('Positive score for document ', 13, ' is: ', 0.12418245071841545)
('Positive score for document ', 14, ' is: ', 0.39043351240235574)
('Positive score for d

In [83]:
Result=[]
for i in range(0, len(corp)):
    Result.append(sum(result_df[i]*result_df['Positive'])+sum(result_df[i]*result_df['Negative']))
    if Result[i]<0:
        Result[i]=0
    else:
        Result[i]=1

In [84]:
def accuracy(result,label):
    bib=0
    for i in range(1,len(result)):
        if (result[i]==label[i]):
            bib=bib+1
    acc=np.float(bib)/len(result)
    return acc,bib
labels=[]
for label in dataset['pos']:
    labels.append(label)
a=accuracy(Result,labels)  

In [86]:
labels=[]
for label in dataset['pos']:
    labels.append(label)
a=accuracy(Result,labels)  
print "The accuracy for the Loughran McDonald dictionary model  : %f" %a[0]

The accuracy for the Loughran McDonald dictionary model  : 0.507463
