In [None]:
Word frequency arrays
● Rows represent documents, columns represent words
● Entries measure presence of each word in each document

Sparse arrays and csr_matrix
● Array is "sparse": most entries are zero
● Can use scipy.sparse.csr_matrix instead of NumPy array
● csr_matrix remembers only the non-zero entries (saves space!)

● scikit-learn PCA doesn't support csr_matrix
● Use scikit-learn TruncatedSVD instead

In [5]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']

In [3]:
# Create a TfidfVectorizer
tfidf = TfidfVectorizer() 
# Apply fit_transform to document
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method
print(csr_mat.toarray())

[[ 0.51785612  0.          0.          0.68091856  0.51785612  0.        ]
 [ 0.          0.          0.51785612  0.          0.51785612  0.68091856]
 [ 0.51785612  0.68091856  0.51785612  0.          0.          0.        ]]


In [4]:
# Get the words
words = tfidf.get_feature_names()
print(words)

['cats', 'chase', 'dogs', 'meow', 'say', 'woof']


In [6]:
# Create a TruncatedSVD instance
svd = TruncatedSVD(n_components=50)
# Create a KMeans instance
kmeans = KMeans(n_clusters=6)
# Create a pipeline
pipeline = make_pipeline(svd,kmeans)

In [83]:
df1 = pd.read_csv("..\data\Wikipedia articles\wikipedia-vectors.csv")
df1 = df1.iloc[:,1:]
df1 = df1.T
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [87]:
titles = df1.index
titles.shape

(60,)

In [89]:
df1.reset_index
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [91]:
articles = df1.iloc[:,1:]
articles.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [95]:
# Fit the pipeline to articles
pipeline.fit(articles)
# Calculate the cluster labels
labels = pipeline.predict(articles)

In [93]:
labels.shape

(60,)

In [96]:
# Create a DataFrame aligning labels and titles
df2 = pd.DataFrame({'label': labels, 'article': titles})
# Display df sorted by cluster label
print(df2.sort_values('label'))

                                          article  label
34                             Zlatan Ibrahimović      0
31                              Cristiano Ronaldo      0
35                Colombia national football team      0
36              2014 FIFA World Cup qualification      0
37                                       Football      0
38                                         Neymar      0
39                                  Franck Ribéry      0
33                                 Radamel Falcao      0
30                  France national football team      0
32                                   Arsenal F.C.      0
21                             Michael Fassbender      1
22                              Denzel Washington      1
23                           Catherine Zeta-Jones      1
24                                   Jessica Biel      1
25                                  Russell Crowe      1
26                                     Mila Kunis      1
20                             