## Import packages


In [1]:
!pip install eli5
!pip install tabulate



In [2]:
import eli5
import tabulate
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



# Feature Engineering

## Prepare dataset and Pick two classes


In [3]:
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'),)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'),)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


## Design My own features


In [4]:
class CustomFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.feat_names = set()

    def fit(self, x, y=None):
        return self

    @staticmethod
    def features(review):
      return {
          'bias' : 1.0,
          'RAM' : test_binary_feature(review),
          'mac': mac_binary_feature(review),
          'quadra':quadra_binary_feature(review),
          'apple': apple_binary_feature(review),
          'windows': windows_binary_feature(review),
          'pc': pc_binary_feature(review),
          'gateway':gateway_binary_feature(review),
          'centris':centris_binary_feature(review),
          'powerbook':powerbook_binary_feature(review),
          'macintosh':macintosh_binary_feature(review),
          'floppy': floppy_binary_feature(review),
          'motherboard':motherboard_binary_feature(review),
          'propn': propn(review)
          
      }

    def get_feature_names(self):
        return list(self.feat_names)
      
    def transform(self, reviews):
      feats = []
      for review in reviews:
        f = self.features(review)
        [self.feat_names.add(k) for k in f] 
        feats.append(f)
      return feats
    

feats = make_pipeline(CustomFeats(), DictVectorizer())
# FeatureUnion([
#     ('custom', make_pipeline(CustomFeats(), DictVectorizer())),
#     ('bag_of_words', CountVectorizer())
# ])

In [5]:
feats

Pipeline(memory=None,
         steps=[('customfeats', CustomFeats()),
                ('dictvectorizer',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True))],
         verbose=False)

In [6]:
def test_binary_feature(review):
  target_word = 'net'
  threshold = 0
  words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
  count = len(list(words))
  return count > threshold

def mac_binary_feature(review):
  target_word = 'mac'
  threshold = 0
  words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
  count = len(list(words))
  if count > threshold:
        return 1
  else:
        return 0
    

def quadra_binary_feature(review):
    target_word = 'quadra'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0
    

def apple_binary_feature(review):
    target_word = 'apple'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0
    
def windows_binary_feature(review):
    target_word = 'windows'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0
    
def pc_binary_feature(review):
    target_word = 'pc'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0

def gateway_binary_feature(review):
    target_word = 'gateway'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0

def centris_binary_feature(review):
    target_word = 'centris'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0

def controller_binary_feature(review):
    target_word = 'controller'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0    
    
def powerbook_binary_feature(review):
    target_word = 'powerbook'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0 

    
def macintosh_binary_feature(review):
    target_word = 'macintosh'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0    
    
def floppy_binary_feature(review):
    target_word = 'floppy'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0  

def motherboard_binary_feature(review):
    target_word = 'motherboard'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0  
    
def propn(review):
    target_word = 'propn'
    threshold = 0
    words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
    count = len(list(words))
    if count > threshold:
        return 1
    else:
        return 0   

    
    
def show_table(train, Ω):
  matrix = np.zeros((2, 2))
  for i in range(len(train.data)):
    flag = Ω(train.data[i])
    index = 0 if flag else 1
    matrix[index][train.target[i]] += 1
  print(tabulate.tabulate([['True', matrix[0][0], matrix[0][1]], ['False', matrix[1][0], matrix[1][1]]], headers=['', train.target_names[0], train.target_names[1]]))


show_table(train, lambda r: r.find('apple') is not -1)

         comp.sys.ibm.pc.hardware    comp.sys.mac.hardware
-----  --------------------------  -----------------------
True                            2                       24
False                         588                      554


In [7]:

!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K     |████████████████████████████████| 37.4MB 1.1MB/s eta 0:00:01    |██████▎                         | 7.3MB 1.3MB/s eta 0:00:24
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/suzuki96/Library/Caches/pip/wheels/54/7c/d8/f86364af8fbba7258e14adae115f18dd2c91552406edc3fdaa
Successfully built en-core-web-sm


## Number of Features
(#sample, #features)  

In [8]:
train.vecs = feats.fit_transform(train.data)
test.vecs = feats.fit_transform(test.data)
train.vecs.shape, test.vecs.shape, feats.steps[0][1].get_feature_names()

((1168, 14),
 (777, 14),
 ['centris',
  'mac',
  'RAM',
  'quadra',
  'macintosh',
  'powerbook',
  'gateway',
  'apple',
  'pc',
  'floppy',
  'propn',
  'windows',
  'motherboard',
  'bias'])

In [9]:
train.vecs[0]

<1x14 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [10]:
lr_model = make_pipeline(CountVectorizer(), LogisticRegression())
lr_model.fit(train.data, train.target)
train_preds = lr_model.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = lr_model.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1 

(0.9897260273972602, 0.8082368082368082)

## False negative and positive examples

In [11]:
def show_false_negative(test_preds, test):
  fn_idxs = list(filter(lambda idx: test_preds[idx] != test.target[idx] and test_preds[idx] == 0, range(len(test_preds))))
  if len(fn_idxs) == 0: return None
  fidx = np.random.randint(len(fn_idxs))
  return test.data[fn_idxs[fidx]]

def show_false_positive(test_preds, test, size=2):
  fn_idxs = list(filter(lambda idx: test_preds[idx] != test.target[idx] and test_preds[idx] == 1, range(len(test_preds))))
  if len(fn_idxs) == 0: return None
  fidx = np.random.randint(len(fn_idxs))
  return test.data[fn_idxs[fidx]]

In [12]:
show_false_negative(test_preds, test)



In [13]:
show_false_positive(test_preds, test)

"\tFirst you need to connect them with a null modem cable.\n\nAtleast thats what I've heard."

# Error Analysis

In [14]:
#categories = ['alt.atheism', 'soc.religion.christian']
#categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1197
test data size: 796


In [15]:
lr_model = LogisticRegression(C=0.1)
vec = CountVectorizer()
pipe = make_pipeline(vec, lr_model)
pipe.fit(train.data, train.target)
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1  

(1.0, 0.943467336683417)

In [16]:
eli5.show_weights(pipe, top=10, target_names=test.target_names)

Weight?,Feature
+0.729,hockey
+0.585,nhl
+0.391,playoff
+0.343,espn
+0.341,ca
+0.314,pittsburgh
… 9484 more positive …,… 9484 more positive …
… 9078 more negative …,… 9078 more negative …
-0.337,runs
-0.357,phillies


In [17]:
idx = 1
x = test.data[idx]
print(test.target_names[test.target[idx]])
eli5.show_prediction(lr_model, test.data[idx], vec=vec, target_names=test.target_names)

rec.sport.baseball


Contribution?,Feature
0.944,edu
0.574,sox
0.499,<BIAS>
0.253,list
0.247,me
0.243,if
0.223,mailing
0.2,re
0.169,info
0.167,is


In [18]:
rf_model = RandomForestClassifier()
vec = CountVectorizer()
pipe = make_pipeline(vec, rf_model)
pipe.fit(train.data, train.target)
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1  

(1.0, 0.9296482412060302)

In [19]:
eli5.show_weights(pipe, top=10, target_names=test.target_names)

Weight,Feature
0.0335  ± 0.1126,hockey
0.0162  ± 0.0595,nhl
0.0106  ± 0.0430,playoff
0.0103  ± 0.0465,baseball
0.0098  ± 0.0373,ca
0.0096  ± 0.0436,playoffs
0.0086  ± 0.0379,cup
0.0084  ± 0.0393,leafs
0.0082  ± 0.0373,wings
0.0075  ± 0.0330,goals


In [20]:
idx = 1
x = test.data[idx]
print(test.target_names[test.target[idx]])
eli5.show_prediction(rf_model, test.data[idx], vec=vec, target_names=test.target_names, top=10)

rec.sport.baseball


Contribution?,Feature
+0.501,<BIAS>
+0.030,sox
+0.020,hockey
+0.009,edu
+0.009,ca
+0.008,nhl
+0.008,very
… 1514 more positive …,… 1514 more positive …
… 486 more negative …,… 486 more negative …
-0.013,baseball
