# TfidfVectorizer


In [81]:
import spacy

In [82]:
# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is called on text
# further down to produce the tokens that are collected into `text_cleaned`.
nlp = spacy.load('en_core_web_lg')

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [84]:
# Four toy sentences used as the corpus for the TF-IDF demonstration.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [85]:
# Holds the token strings extracted from the spaCy doc below.
text_cleaned = []

In [86]:
# `text` was never defined in any cell of this notebook (it only existed as
# leftover kernel state), so this cell failed with NameError on a fresh kernel.
# NOTE(review): assumes the intent was to tokenize the demo corpus — confirm.
text = ' '.join(corpus)

# Run the spaCy pipeline over the text; iterating `doc` yields its tokens.
doc = nlp(text)

In [87]:
# Collect the string form of every token in `doc`.
# The list is rebuilt rather than appended to, so re-running this cell gives
# the same result instead of duplicating the tokens already collected.
text_cleaned = [f'{token}' for token in doc]

In [97]:
# Both n-gram bounds are 3, so only three-word sequences (trigrams) become features.
vectorizer = TfidfVectorizer(ngram_range=(3,3))

In [98]:
# Learn the trigram vocabulary from `corpus` and compute the TF-IDF matrix
# in one step (one row per document in the corpus).
vectors = vectorizer.fit_transform(corpus)

In [99]:
import pandas as pd

In [100]:
# Show the TF-IDF matrix as a table: one row per document, one column per trigram.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [101]:
df

Unnamed: 0,and this is,document is the,is the first,is the second,is the third,is this the,the first document,the second document,the third one,this document is,this is the,this the first
0,0.0,0.0,0.667679,0.0,0.0,0.0,0.526405,0.0,0.0,0.0,0.526405,0.0
1,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0
2,0.525473,0.0,0.0,0.0,0.525473,0.0,0.0,0.0,0.525473,0.0,0.414289,0.0
3,0.0,0.0,0.0,0.0,0.0,0.617614,0.486934,0.0,0.0,0.0,0.0,0.617614


# New practice

In [102]:
# The four newsgroups used for the rest of the notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [104]:
# Load the training split of the 20 Newsgroups data, limited to `categories`.
# Shuffling with a fixed random_state keeps the document order reproducible.
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [106]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [107]:
#check length

len(twenty_train.data)

2257

In [108]:
len(twenty_train.filenames)

2257

In [109]:
# Peek at the first three lines of the first training post.
print(*twenty_train.data[0].split('\n')[:3], sep='\n')

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [110]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [111]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [112]:
# Translate the first ten integer targets back into their category names.
label_names = twenty_train.target_names
for label_idx in twenty_train.target[:10]:
    print(label_names[label_idx])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


# Extracting Features from text

In [117]:
from sklearn.feature_extraction.text import CountVectorizer  , TfidfTransformer

In [114]:
# Bag-of-words step: a CountVectorizer with default settings.
count_vect = CountVectorizer()

# Learn the vocabulary from the training posts and build the count matrix
# in one call (one row per post, one column per vocabulary term).
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [115]:
X_train_counts.shape

(2257, 35788)

In [116]:
count_vect.vocabulary_.get('algorithm')

4690

In [118]:
# use_idf=False switches off the inverse-document-frequency weighting, so this
# transformer produces term-frequency features only (no IDF component).
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [119]:
X_train_tf = tf_transformer.transform(X_train_counts)

In [120]:
X_train_tf.shape

(2257, 35788)

In [121]:
#Training a classifier

In [122]:
from sklearn.naive_bayes import MultinomialNB

In [124]:
# Train a multinomial naive Bayes classifier on the term-frequency features,
# using the integer newsgroup labels as targets.
clf = MultinomialNB().fit(X_train_tf , twenty_train.target)

In [127]:
# Two unseen sentences used as a quick sanity check of the trained classifier.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [128]:
X_new_counts = count_vect.transform(docs_new)

In [129]:
# NOTE: despite the `_tfidf` suffix these are term-frequency features only —
# `tf_transformer` was created with use_idf=False. The same fitted transformer
# used for training is reused so the new features match the classifier's input.
X_new_tfidf = tf_transformer.transform(X_new_counts)

In [130]:
predicted = clf.predict(X_new_tfidf)

In [131]:
# Print each new sentence next to the name of its predicted category.
# The loop variable is deliberately not called `doc`: at notebook (module)
# scope it would overwrite the spaCy `doc` object created earlier.
for new_doc, category in zip(docs_new, predicted):
    print(f'{new_doc} => {twenty_train.target_names[category]}')

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


# Building a pipeline

- In order to make the vectorizer => transformer => classifier easier to work with, scikit-learn provides a Pipeline class that behaves like a compound classifier:

In [132]:
from sklearn.pipeline import Pipeline

In [133]:
# Chain the three stages into one estimator so raw text can be passed straight
# to fit/predict:
#   'vect'  - turns raw text into token counts
#   'tfidf' - re-weights the counts (default settings, so IDF is enabled here)
#   'clf'   - multinomial naive Bayes on the resulting features
text_clf = Pipeline([
    ('vect' , CountVectorizer()),
    ('tfidf' , TfidfTransformer()),
    ('clf' , MultinomialNB())
])

In [134]:
text_clf.fit(twenty_train.data , twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

# Evaluation of the performance on the test set

In [135]:
import numpy as np

In [136]:
# Held-out test split, restricted to the same categories and shuffled with
# the same fixed seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [137]:
docs_test = twenty_test.data

In [139]:
predicted = text_clf.predict(docs_test)

In [140]:
# Accuracy: the fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.8348868175765646

## Let's see if we can do better

In [142]:
from sklearn.linear_model import SGDClassifier

In [143]:
# Same vectorizer and TF-IDF stages as the previous pipeline, but the final
# classifier is a linear model trained with stochastic gradient descent using
# hinge loss and an L2 penalty. This rebinds `text_clf`, replacing the
# naive Bayes pipeline defined earlier.
# random_state=42 fixes the seed so the run is reproducible.
# NOTE(review): max_iter=5 with tol=None presumably forces exactly five passes
# over the data with no tolerance-based stopping — confirm against the
# SGDClassifier documentation.
text_clf= Pipeline([
    ('vect' , CountVectorizer()),
    ('tfidf' , TfidfTransformer()),
    ('clf' , SGDClassifier(loss='hinge' , 
                          penalty='l2' , 
                          alpha=1e-3 , 
                          random_state=42,
                          max_iter=5,
                          tol=None))
])

In [144]:
text_clf.fit(twenty_train.data , twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [146]:
predicted = text_clf.predict(docs_test)

In [147]:
np.mean(predicted == twenty_test.target)

0.9101198402130493

# Using metrics analysis

In [148]:
from sklearn import metrics

In [150]:
# Per-category precision, recall and F1 on the test split, labelled with the
# newsgroup names instead of integer ids.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



# Parameter tuning using Grid Search

In [151]:
from sklearn.model_selection import GridSearchCV

In [152]:
# Search space for the grid search. Each key is '<pipeline step>__<parameter>',
# where the step names ('vect', 'tfidf', 'clf') are those used in `text_clf`.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [153]:
# 5-fold cross-validated grid search over `parameters`.
# n_jobs=-1 lets the search use every available CPU core.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [154]:
# Run the search on the first 400 training posts only.
# NOTE(review): presumably a subset is used to keep the 5-fold search over all
# parameter combinations quick — confirm that 400 is the intended size.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])



In [155]:
gs_clf.best_score_

0.905

In [156]:
# List the winning value for each searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print(f'{param_name} , {best_value}')

clf__alpha , 0.001
tfidf__use_idf , True
vect__ngram_range , (1, 1)


In [157]:
gs_clf.cv_results_

{'mean_fit_time': array([0.37335768, 0.54320054, 0.1623991 , 0.53979788, 0.15100117,
        0.52039962, 0.14219966, 0.50479975]),
 'std_fit_time': array([0.10518793, 0.03778071, 0.00618362, 0.02664987, 0.0112594 ,
        0.02181301, 0.00331068, 0.03241374]),
 'mean_score_time': array([0.02800179, 0.07399907, 0.02960005, 0.06220016, 0.02759719,
        0.06560006, 0.03220048, 0.05619793]),
 'std_score_time': array([0.00346669, 0.00923098, 0.00287154, 0.00921897, 0.00241581,
        0.00755249, 0.00818252, 0.00730442]),
 'param_clf__alpha': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001, 0.001],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__use_idf': masked_array(data=[True, True, False, False, True, True, False, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range'