## Import packages
Make sure you installed ***eli5***, ***tabulate***, ***sklearn***, ***matplotlib*** and ***numpy*** if you use your local machine

In [1]:
!pip3 install -U eli5



In [2]:
import eli5
import tabulate
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import spacy
import pandas as pd
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Fun with Spacy
NER/Part of speech tagging
https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/

In [3]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
   print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

displacy.render(doc, style="dep")
displacy.render(doc, style="ent")

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [4]:
doc = nlp("Thuan Pham, hired as Uber’s chief technology officer by former CEO Travis Kalanick back in 2013, is leaving the company in three weeks, the ride-share giant revealed today in an SEC filing that came out just as The Information reported that massive layoffs at Uber are being proposed to preserve some of the company’s dwindling capital reserves.")
displacy.render(doc, style="ent")

# Feature Engineering

## Prepare dataset and Pick two classes
Your two classes should be similar, but opposite in some sense

In [5]:
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'),)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'),)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


## Design your own features
This is also warm-up for HW2 :)

In [6]:
class CustomFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.feat_names = set()

    def fit(self, x, y=None):
        return self

    @staticmethod
    def features(review):
      return {
          'bias' : 4.0,
          'RAM' : test_binary_feature(review),
          'mac': mac_binary_feature(review),
          'apple': apple_binary_feature(review),
          'IBM': ibm_feature(review),
          'pc': pc_feature(review)
      }

    def get_feature_names(self):
        return list(self.feat_names)
      
    def transform(self, reviews):
      feats = []
      for review in reviews:
        f = self.features(review)
        [self.feat_names.add(k) for k in f] 
        feats.append(f)
      return feats
    
feats = make_pipeline(CustomFeats())
#feats = make_pipeline(CustomFeats(), DictVectorizer())
#feats = FeatureUnion([
#     ('custom', make_pipeline(CustomFeats(), DictVectorizer())),
#     ('bag_of_words', CountVectorizer())
# ])

In [7]:
def test_binary_feature(review):
  target_word = 'RAM'
  threshold = 0
  words = filter(lambda r: r.find(target_word) != -1, review.split(' '))
  count = len(list(words))
  return count > threshold

def mac_binary_feature(review):
  target_word = 'mac'
  threshold = 0
  words = filter(lambda r: r.find(target_word) != -1, review.split(' '))
  count = len(list(words))
  if count > threshold:
        return 1
  else:
        return 0

def ibm_feature(review):
  target_word = 'IBM'
  threshold = 0
  words = filter(lambda r: r.find(target_word) != -1, review.split(' '))
  count = len(list(words))
  return count

def apple_binary_feature(review):
  target_word = 'apple'
  threshold = 0
  words = filter(lambda r: r.find(target_word) != -1, review.split(' '))
  count = len(list(words))
  if count > threshold:
        return 1
  else:
        return 0
    
def pc_feature(review):
  target_word = 'pc'
  threshold = 0
  words = filter(lambda r: r.find(target_word) != -1, review.split(' '))
  count = len(list(words))
  return count

def show_table(train, Ω):
  matrix = np.zeros((2, 2))
  for i in range(len(train.data)):
    flag = Ω(train.data[i])
    index = 0 if flag else 1
    matrix[index][train.target[i]] += 1
  print(tabulate.tabulate([['True', matrix[0][0], matrix[0][1]], ['False', matrix[1][0], matrix[1][1]]], headers=['', train.target_names[0], train.target_names[1]]))


show_table(train, lambda r: r.find('apple') != -1)

         comp.sys.ibm.pc.hardware    comp.sys.mac.hardware
-----  --------------------------  -----------------------
True                            2                       24
False                         588                      554


In [8]:
train.custvector = feats.fit_transform(train.data)

In [9]:
train.custvector[10] # checking 10th document

{'bias': 4.0, 'RAM': False, 'mac': 0, 'apple': 0, 'IBM': 0, 'pc': 0}

## Number of Features
(#sample, #features)  

In [10]:
train.vecs = feats.fit_transform(train.data)
test.vecs = feats.transform(test.data)

In [11]:
feats.steps[0][1].get_feature_names()

['apple', 'mac', 'pc', 'RAM', 'IBM', 'bias']

In [12]:
print(test.vecs[1])

{'bias': 4.0, 'RAM': False, 'mac': 0, 'apple': 0, 'IBM': 0, 'pc': 0}


What if we add
- number-based feature with threshold
- number-based feature  

to ***features*** function?

In [13]:
lr_model = LogisticRegression(C=1)
lr_model.fit(pd.DataFrame.from_dict(train.vecs),train.target)
#lr_model = make_pipeline(CountVectorizer(), LogisticRegression())
#lr_model.fit(train.data, train.target)

train_preds = lr_model.predict(pd.DataFrame.from_dict(train.vecs))
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = lr_model.predict(pd.DataFrame.from_dict(test.vecs))
test_f1 = f1_score(test.target, test_preds, average='micro')
print(train_f1, test_f1)

0.5590753424657534 0.5868725868725869


In [14]:
eli5.show_weights(lr_model, top=10, vec=feats.steps[0][1], target_names=test.target_names)

Weight?,Feature
1.805,RAM
0.54,pc
0.322,mac
-0.0,IBM
-0.011,<BIAS>
-0.03,apple
-1.17,bias


## False negative and positive examples

In [15]:
def show_false_negative(test_preds, test):
  fn_idxs = list(filter(lambda idx: test_preds[idx] != test.target[idx] and test_preds[idx] == 0, range(len(test_preds))))
  if len(fn_idxs) == 0: return None
  fidx = np.random.randint(len(fn_idxs))
  return test.data[fn_idxs[fidx]]

def show_false_positive(test_preds, test, size=2):
  fn_idxs = list(filter(lambda idx: test_preds[idx] != test.target[idx] and test_preds[idx] == 1, range(len(test_preds))))
  if len(fn_idxs) == 0: return None
  fidx = np.random.randint(len(fn_idxs))
  return test.data[fn_idxs[fidx]]

In [16]:
show_false_negative(test_preds, test)

"What is the maximum rate of the 6882 FPU that Apple sells directly (Apple\nPart No. M6775 LL/A)?  The Apple literature labels the FPU for Classics and\nLC III's so I assume it will do at least 25MHz.  My question is can I put\nit in a Performa 600 (68030 @ 32MHz)?  The Apple price is cheap at $78\ncompared to ~$135 from mail order houses.  Any one know the answer to this\none?\n"

In [17]:
show_false_positive(test_preds, test)

'Is it possible to put more than 1 controller in a PC.  By this I mean of \ndifferent types.  ie.  RLL and MFM.  If so how do you access the drives\nin the CMOS setup.  Do they just show up to be configured or do you \nhave to do low level writes to the controller.  \n\nAs an example put 1 RLL controller with 2 drives in a machine.  Put\na MFM controller and 2 more drives connected to it.  I now have 4\ndrives with 2 controllers of different types.  Also can you \nput 2 controllers of the same type into a PC and again how do you access \nthem.  \n\nI was asked this question and never tried to do it so if anyone has\ndone this and can supply me with info I would very pleased\n\nThanx in advance for any info...\n\n'

# Error Analysis

In [18]:
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


In [19]:
lr_model = LogisticRegression(C=0.1)
vec = CountVectorizer()
pipe = make_pipeline(vec, lr_model)
pipe.fit(train.data, train.target)
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1  

(0.9991438356164384, 0.8970398970398971)

In [21]:
idx = 10
x = test.data[idx]
#print(test.data[idx])
print(test.target_names[test.target[idx]])
eli5.show_prediction(lr_model, test.data[idx], vec=vec, target_names=test.target_names)

comp.sys.mac.hardware


Contribution?,Feature
5.893,x10996
3.411,x3258
1.499,x15810
1.331,x11842
1.287,x8956
1.221,x7733
1.198,x4963
0.742,x12006
0.525,x10546
0.422,x8488


In [22]:
rf_model = RandomForestClassifier()
vec = CountVectorizer()
pipe = make_pipeline(vec, rf_model)
pipe.fit(train.data, train.target)
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1  

(1.0, 0.8983268983268984)

In [24]:
idx = 1
x = test.data[idx]
print(test.target_names[test.target[idx]])
eli5.show_prediction(rf_model, test.data[idx], vec=vec, target_names=test.target_names, top=10)

comp.sys.ibm.pc.hardware


Contribution?,Feature
+0.506,<BIAS>
+0.074,x7922
+0.032,x7616
+0.016,x14434
+0.015,x7797
+0.013,x10524
+0.012,x3258
+0.012,x5073
… 810 more positive …,… 810 more positive …
… 802 more negative …,… 802 more negative …
