Adapted from the scikit-learn example "FeatureHasher and DictVectorizer Comparison":
https://scikit-learn.org/stable/auto_examples/text/plot_hashing_vs_dict_vectorizer.html#sphx-glr-auto-examples-text-plot-hashing-vs-dict-vectorizer-py

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup topics to load; only posts from these categories are requested.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]

print("Loading 20 newsgroups training data")
# return_X_y=True yields a (documents, targets) pair; the targets are not
# needed here, so they are discarded.
raw_data, _ = fetch_20newsgroups(
    subset="train",
    categories=categories,
    return_X_y=True,
)
# Total size of the corpus once UTF-8 encoded, in megabytes (1 MB = 1e6 bytes).
data_size_mb = sum(map(len, (doc.encode("utf-8") for doc in raw_data))) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

Loading 20 newsgroups training data
3803 documents - 6.245MB


In [4]:
# Show the first loaded document in full (headers, quoted replies and
# signature included) to see what the raw text looks like.
print(raw_data[0])

Subject: Re: Christian Daemons? [Biblical Demons, the u
From: stigaard@mhd.moorhead.msus.edu
Reply-To: stigaard@mhd.moorhead.msus.edu
Organization: Moorhead State University, Moorhead, MN
Nntp-Posting-Host: 134.29.97.2
Lines: 23

>>>667
>>>the neighbor of the beast
>>
>>No, 667 is across the street from the beast.  664 and 668 are the
>>neighbors of the beast.
>
>I think some people are still not clear on this:
>667 is *not* the neighbor of the beast, but, rather, across the
>street. It is, in fact, 668 which is the neighbor of the beast.

no, sheesh, didn't you know 666 is the beast's apartment?  667 is across the
hall from the beast, and is his neighbor along with the rest of the 6th floor.

>Justin (still trying to figure out what this has to do with alt.discordia)

This doesn't seem discordant to you?

-----------------------     ----------------------     -----------------------
	-Paul W. Stigaard, Lokean Discordian Libertarian
  !XOA!		internet:  stigaard@mhd1.moorhead.msus.edu
 

In [6]:
# Number of documents loaded into raw_data.
len(raw_data)

3803

In [5]:
import re


def tokenize(doc):
    """Lazily yield the lowercased word tokens found in ``doc``.

    A token is any maximal run of regex word characters (``\\w+``), so
    punctuation and whitespace act as separators and an apostrophe splits a
    contraction in two. This is a deliberately simple tokenizer; see
    CountVectorizer or TfidfVectorizer for a more principled approach.

    Parameters
    ----------
    doc : str
        Text to split into tokens.

    Returns
    -------
    generator of str
        The tokens in order of appearance, each converted to lowercase.
    """
    # The scan is started here (not inside the generator) so that an invalid
    # ``doc`` is rejected at call time rather than on first iteration.
    word_matches = re.finditer(r"\w+", doc)
    return (match.group(0).lower() for match in word_matches)


# Quick check of the tokenizer: tokenize() returns a generator, so wrap it in
# list() to display the tokens. Note that "isn't" is split into "isn" and "t".
list(tokenize("This is a simple example, isn't it?"))

['this', 'is', 'a', 'simple', 'example', 'isn', 't', 'it']

In [47]:
from sklearn.feature_extraction.text import HashingVectorizer
# Hash every document into a fixed-width vector with 2**9 = 512 columns.
vectorizer = HashingVectorizer(n_features=2**9)
# NOTE(review): scikit-learn documents HashingVectorizer as stateless, so this
# fit() presumably only validates parameters and learns nothing — confirm.
model = vectorizer.fit(raw_data)
# One row per document in raw_data.
X = model.transform(raw_data)
print(X.shape)

(3803, 512)


In [49]:
# Read back the n_features setting of the current vectorizer.
vectorizer.n_features

512

In [15]:
# X[0] is a single-row sparse slice; printing it lists each stored entry as
# "(row, column)  value". No loop is needed for a single row.
print(X[0])

  (0, 0)	0.03925343359894298
  (0, 10)	-0.03925343359894298
  (0, 19)	0.03925343359894298
  (0, 36)	-0.03925343359894298
  (0, 41)	-0.03925343359894298
  (0, 54)	0.07850686719788597
  (0, 55)	-0.03925343359894298
  (0, 60)	0.03925343359894298
  (0, 61)	-0.03925343359894298
  (0, 62)	0.07850686719788597
  (0, 76)	0.07850686719788597
  (0, 77)	-0.03925343359894298
  (0, 78)	0.11776030079682893
  (0, 88)	0.03925343359894298
  (0, 91)	0.15701373439577193
  (0, 107)	-0.03925343359894298
  (0, 108)	-0.11776030079682893
  (0, 110)	0.07850686719788597
  (0, 117)	-0.03925343359894298
  (0, 121)	-0.03925343359894298
  (0, 125)	-0.03925343359894298
  (0, 128)	-0.03925343359894298
  (0, 134)	0.03925343359894298
  (0, 135)	-0.03925343359894298
  (0, 141)	-0.07850686719788597
  :	:
  (0, 372)	0.11776030079682893
  (0, 380)	0.03925343359894298
  (0, 388)	-0.03925343359894298
  (0, 397)	0.03925343359894298
  (0, 412)	0.07850686719788597
  (0, 413)	-0.03925343359894298
  (0, 414)	0.03925343359894298
  

In [13]:
# Dimensions of the current feature matrix X as (rows, columns).
X.shape

(3803, 512)

In [50]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features restricted to at most 1024 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1024)
# fit_transform learns the vocabulary/IDF weights and vectorizes the corpus in
# one pass instead of processing the documents twice (fit, then transform).
X = vectorizer.fit_transform(raw_data)
# fit() returns the estimator itself, so `model` is simply the fitted
# vectorizer; the alias is kept because other cells call model.transform(...).
model = vectorizer
print(X.shape)

(3803, 1024)
CPU times: total: 609 ms
Wall time: 2.33 s


In [51]:
# Vectorize only the first document, wrapped in a list to form a one-document
# batch. NOTE: this rebinds X, replacing the full-corpus matrix.
X = model.transform([raw_data[0]])

# X has a single row, so print it directly; each line is a stored entry shown
# as "(row, column)  weight".
print(X)

  (0, 1021)	0.09151362661419961
  (0, 1009)	0.05957418391896208
  (0, 997)	0.09591874198152693
  (0, 988)	0.06879315809683535
  (0, 983)	0.060054776343078764
  (0, 936)	0.11217113221110285
  (0, 924)	0.11089580533067639
  (0, 910)	0.1346549959504493
  (0, 899)	0.1413787826048561
  (0, 897)	0.14795213507030558
  (0, 885)	0.5432076426016901
  (0, 856)	0.02927179011197917
  (0, 851)	0.18589827976725962
  (0, 845)	0.19788767789341993
  (0, 825)	0.08932128492367386
  (0, 824)	0.09624479859281847
  (0, 823)	0.065484307954295
  (0, 786)	0.11638318164109641
  (0, 754)	0.13068932020328572
  (0, 750)	0.07882097143877041
  (0, 736)	0.04288839098164043
  (0, 734)	0.10681560277206868
  (0, 728)	0.14053846489217006
  (0, 699)	0.05075509928124087
  (0, 672)	0.07979891449331092
  :	:
  (0, 470)	0.08066444821205487
  (0, 464)	0.2717565035939064
  (0, 462)	0.11296901067592482
  (0, 447)	0.03549931204019815
  (0, 440)	0.050278090516203286
  (0, 431)	0.05228524614717385
  (0, 427)	0.09181932783383094
  (0

In [52]:
# Map the stored (non-zero) columns of X back to the vocabulary terms they
# correspond to, one array of terms per row of X.
vectorizer.inverse_transform(X)

[array(['you', 'would', 'with', 'which', 'what', 'university', 'trying',
        'to', 'this', 'think', 'the', 'subject', 'still', 'state',
        'something', 'someone', 'some', 'seem', 'rest', 'reply', 're',
        'rather', 'quote', 'posting', 'people', 'paul', 'out',
        'organization', 'on', 'of', 'not', 'no', 'nntp', 'lines', 'left',
        'large', 'know', 'it', 'is', 'internet', 'in', 'if', 'host', 'his',
        'here', 'has', 'from', 'fact', 'edu', 'doesn', 'do', 'didn',
        'clear', 'christian', 'but', 'at', 'are', 'and', 'alt', 'along',
        '29', '23'], dtype='<U14')]

In [24]:
# Learned mapping from each vocabulary term to its column index.
# NOTE(review): the visible part of the saved output only shows indices up to
# 127, which looks inconsistent with max_features=1024 — probably left over
# from an earlier run; re-run the notebook top to bottom to refresh it.
vectorizer.vocabulary_

{'subject': 87,
 're': 80,
 'the': 92,
 'from': 34,
 'edu': 30,
 'to': 103,
 'organization': 74,
 'university': 105,
 'nntp': 65,
 'posting': 79,
 'host': 44,
 'lines': 55,
 'of': 69,
 'no': 66,
 'is': 49,
 'and': 6,
 'are': 9,
 'think': 99,
 'some': 85,
 'people': 77,
 'not': 67,
 'on': 70,
 'this': 100,
 'but': 16,
 'it': 50,
 'in': 47,
 'which': 118,
 'you': 126,
 'know': 53,
 'his': 43,
 'with': 122,
 'out': 76,
 'what': 116,
 'has': 39,
 'do': 26,
 'at': 12,
 'if': 46,
 'here': 42,
 'would': 124,
 'please': 78,
 'writes': 125,
 'have': 40,
 'about': 1,
 'for': 33,
 'your': 127,
 'can': 19,
 'world': 123,
 'there': 96,
 'they': 98,
 'their': 93,
 'why': 120,
 'be': 13,
 'more': 60,
 'than': 90,
 'car': 20,
 'us': 107,
 'drive': 29,
 'article': 10,
 'well': 114,
 'used': 109,
 'way': 112,
 'was': 111,
 'that': 91,
 'all': 2,
 'one': 71,
 'so': 84,
 'been': 15,
 'then': 95,
 'just': 52,
 'com': 21,
 'even': 31,
 'first': 32,
 'also': 3,
 'computer': 22,
 'by': 17,
 'who': 119,
 'up':

In [27]:
# X[0] is a single-row sparse slice; printing it lists each stored entry as
# "(row, column)  value". No loop is needed for a single row.
print(X[0])

  (0, 126)	0.11400721627511316
  (0, 124)	0.07421721902788872
  (0, 122)	0.11949508686860338
  (0, 118)	0.08570217074291997
  (0, 116)	0.07481593865537586
  (0, 105)	0.13974223296524052
  (0, 103)	0.1677525174536722
  (0, 100)	0.17612897708769276
  (0, 99)	0.1843180265648858
  (0, 92)	0.6767253520993782
  (0, 87)	0.03646664906118272
  (0, 85)	0.08158002185887603
  (0, 80)	0.053430142015340104
  (0, 79)	0.0632304476929496
  (0, 77)	0.09941308676916245
  (0, 76)	0.0797442267512413
  (0, 74)	0.03772474889206225
  (0, 70)	0.05681888145210847
  (0, 69)	0.2125532561801561
  (0, 67)	0.12314251178160003
  (0, 66)	0.1598032286916699
  (0, 65)	0.06558128862039751
  (0, 55)	0.0365050148185997
  (0, 53)	0.08464509240861498
  (0, 50)	0.10049136432756635
  (0, 49)	0.3385528868833151
  (0, 47)	0.04422486459253297
  (0, 46)	0.06263619256993903
  (0, 44)	0.06513669697113131
  (0, 43)	0.11438805731869899
  (0, 42)	0.10492687419879328
  (0, 39)	0.07856977553082634
  (0, 34)	0.10939994718354817
  (0, 30)	