In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abcd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.motorcycles']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
newsgroup = dataset.data

In [6]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class',  
               ])

In [7]:
newsgroup[0]

"From: thompson@apple.com (Paul Thompson)\nSubject: Re: Back Breaker, Near Hit!!\nOrganization: Apple Computer, Inc., Cupertino, California\nLines: 29\nNNTP-Posting-Host: apple.com\n\ninde7wv@Rosie.UH.EDU writes:\n\n>hear screeching tires.  I dart my eyes to my mirrors and realize it's the \n>moroon flying up right behind me, in my panic I pop my clutch and stall the\n>bike.  Luckily the guy stops a foot behind my rear wheel.\n>I understand why you theoretically stop so far behind a car but can you\n>really in actuality avoid such an incident?  Suggestions?\n\nThe experience you describe is why I don't like to sit with my bike in\ngear - I figure there's a chance that I'll be startled in some way and\nstall my bike.  And I figure this is more likely than the chance that\nI'll be unable to escape some situation because of the extra time\nneeded to put the bike in gear.\n\nSo I concentrate on avoiding situations rather than making split-second\nevasive manuvers.  I split lanes so I'm not

In [8]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(newsgroup)

In [9]:
X[0]


<1x156670 sparse matrix of type '<class 'numpy.float64'>'
	with 413 stored elements in Compressed Sparse Row format>

In [13]:
print (X[0])

  (0, 103558)	0.0449646200163
  (0, 33520)	0.0530455548115
  (0, 91014)	0.0530455548115
  (0, 149078)	0.0530455548115
  (0, 53120)	0.0530455548115
  (0, 48014)	0.0530455548115
  (0, 107314)	0.0530455548115
  (0, 134139)	0.0530455548115
  (0, 26576)	0.0530455548115
  (0, 106138)	0.0530455548115
  (0, 137037)	0.0530455548115
  (0, 66152)	0.0530455548115
  (0, 67126)	0.0530455548115
  (0, 26649)	0.0530455548115
  (0, 21447)	0.0530455548115
  (0, 111659)	0.0530455548115
  (0, 65156)	0.0530455548115
  (0, 14495)	0.0530455548115
  (0, 30430)	0.0530455548115
  (0, 156136)	0.0530455548115
  (0, 55900)	0.0530455548115
  (0, 131806)	0.0530455548115
  (0, 53993)	0.0530455548115
  (0, 29831)	0.0530455548115
  (0, 93032)	0.0530455548115
  :	:
  (0, 121595)	0.0449646200163
  (0, 66895)	0.0344673460151
  (0, 153892)	0.010502143776
  (0, 48389)	0.0117443100628
  (0, 144024)	0.0339012526248
  (0, 118978)	0.0412071993138
  (0, 72154)	0.0412071993138
  (0, 69657)	0.0120451379433
  (0, 107680)	0.011879526

In [14]:
X.shape

(996, 156670)

In [15]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [16]:
#first row for V
lsa.components_[0]

array([ 0.00645291,  0.00157301,  0.00157301, ...,  0.00054533,
        0.00054533,  0.00054533])

In [21]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print ("Concept %d:" % i)
    for term in sortedTerms:
        print (term[0])
    print (" ")

Concept 0:
com
sun
sun com
east sun
east sun com
east
edu
ed
ed green
green
 
Concept 1:
sun
east sun
east sun com
sun com
east
ed green
green
ed
egreen
egreen east
 
Concept 2:
reply
university
co uk
tony
subject
get
east sun
east sun com
morgan
really
 
Concept 3:
organization
one
00 22
would
said
like
host
morgan demon
morgan demon co
hydro ca
 
Concept 4:
00 23
right
nec
know
university
think
org
behanna
organization
anyone
 
Concept 5:
sun
would
subject
reply
get
lines
uk
least
egreen
egreen east
 
Concept 6:
dod
back
duke
would
lines
hydro ca
left
good
levine
got
 
Concept 7:
east
ca
ed green
edu
world
writes
article
subject
lines
bike
 
Concept 8:
ed green
world
go
got
infante
sun
chain
ride
bnr
good
 
Concept 9:
like
subject
bmw
green
east
get
bnr
cs
would
university
 
Concept 10:
helmet
even
green
well
ed
ed green
00 01 10
east
would
said
 
Concept 11:
east
well
00 23
bike
ride
little
one
last
stafford
feet
 
Concept 12:
edu
writes
00 22 22
subject
ed
00 22
much
sun
cs
biker
 