In [234]:
#Import libraries
import pandas as pd
import numpy as np
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score


In [235]:
#Read in 'twitter_df'
twitter_df = pd.read_csv("../dataset/twitter_df.csv")

In [236]:
twitter_df.shape

(22153, 8)

In [237]:
twitter_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [238]:
# #Create our contextual stop words
# stops = ["online", "home", "remote", "work", "working","pandemic","business", \
#                "internet", "remotework", "online","team","office","company","hybrid","employee","looking"]

In [239]:
len(twitter_df[twitter_df.duplicated("tweets")])

4557

In [240]:
# originaltweets =twitter_df.drop_duplicates(subset=['tweets'], keep='first').reset_index(drop=True)

In [241]:
originaltweets = twitter_df[~twitter_df['tweets'].str.contains('RT')]

In [242]:
originaltweets = originaltweets.reset_index(drop=True)

In [243]:
originaltweets =originaltweets.drop_duplicates(subset=['cleaned_tweets'], keep='first').reset_index(drop=True)

In [244]:
originaltweets['cleaned_tweets'].value_counts().sort_values()

rise slowly becoming normal which mean need remote training also growing here benefit challenge best practice when come successfully training remote employee    1
need adapt your process remote work continue one designed office based work lean sigma best achieve this                                                         1
prevent your home router from being targeted hacker                                                                                                              1
good news open this excellent report from remote advert soaring this provides encouragement seeker australia share remote job indust                             1
four way energize post pandemic workforce                                                                                                                        1
                                                                                                                                                                ..
here stay even with va

In [245]:
X = originaltweets['cleaned_tweets']
X.head()

0    rise slowly becoming normal which mean need re...
1    opportunity join fantastic team tech fast pace...
2    good news open this excellent report from remo...
3            four way energize post pandemic workforce
4    these tool that will save your google meet too...
Name: cleaned_tweets, dtype: object

### CV

In [246]:
#https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document
#https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer
cv = CountVectorizer(max_features=5000, min_df=3, stop_words=stopwords.words('english')) 
X_cv = cv.fit_transform(X.values.astype('U'))  

In [247]:
X.shape

(10781,)

In [248]:
len(cv.get_feature_names())

3783

In [249]:
X_cv_df = pd.DataFrame(X_cv.toarray(), columns = cv.get_feature_names())

In [250]:
X_cv_df.shape

(10781, 3783)

In [251]:
df = pd.concat([originaltweets, X_cv_df], axis=1)

In [252]:
df = df.reset_index(drop=True)

In [253]:
df.head()

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,ability,able,abroad,...,yorker,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",US,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1416180635903868934,2021-07-16 23:39:09+00:00,Four Ways to Energize a Post-Pandemic Workforc...,four way energize post pandemic workforce,"Chicago, IL",US,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1416180231350759425,2021-07-16 23:37:33+00:00,🚑 These are the tools that will save your #Rem...,these tool that will save your google meet too...,"Duluth, GA",US,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [254]:
cv_col = df.columns[7:]
cv_col_names = df[cv_col]

In [255]:
X_cv=cv_col_names
sc = StandardScaler()
X_cv_sc = sc.fit_transform(X_cv)
X_cv_sc[0:1]

array([[-0.05370029, -0.06334373, -0.01668367, ..., -0.01668367,
        -0.05022671, -0.08211041]])

In [256]:
X_cv.head()

Unnamed: 0,ability,able,abroad,absence,absolutely,academia,accelerate,accelerated,accelerating,acceleration,...,yorker,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [257]:
from sklearn.cluster import KMeans
km_cv = KMeans(n_clusters=4, random_state=42)
km_cv.fit(X_cv_sc)

KMeans(n_clusters=4, random_state=42)

In [258]:
for n in range(3, 16):
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(X_cv_sc)
    print(n,silhouette_score(X_cv_sc, kmeans.labels_))

3 0.1923824839437538
4 0.04169385979130381
5 0.01768652567155152
6 0.014508429105991273
7 0.0762021777144707
8 0.07824298712028345
9 -0.2376116037641056
10 0.06771088483817042
11 -0.04676873164537919
12 0.07790537461570765
13 -0.017337324221319773
14 -0.032976036985163354
15 -0.11585384050410222


In [259]:
km_cv.cluster_centers_

array([[-0.05370029, -0.06334373, -0.01668367, ..., -0.01668367,
        -0.05022671, -0.08211041],
       [ 0.00043687,  0.00051533,  0.00013573, ...,  0.00013573,
         0.00040861,  0.000668  ],
       [-0.05370029, -0.06334373, -0.01668367, ..., -0.01668367,
        -0.05022671, -0.08211041],
       [-0.05370029, -0.06334373, -0.01668367, ..., -0.01668367,
        -0.05022671, -0.08211041]])

In [260]:
df['km']=km_cv.labels_

In [261]:
df['km'].value_counts()

1    10694
2       65
0       19
3        3
Name: km, dtype: int64

In [262]:
silhouette_score(X_cv_sc, km_cv.labels_)

0.04169385979130381

In [263]:
df[df['km'] == 3]

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,ability,able,abroad,...,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom,km
2906,1414648031744516102,2021-07-12 18:09:08+00:00,"Come help build a spot-market for hashrate, no...",come help build spot market hashrate dissimila...,Remote,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
7264,1416524641888256002,2021-07-17 22:26:07+00:00,"Come help build a spot-market for hashrate, no...",come help build spot market hashrate dissimila...,Remote,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
7266,1416523133180383237,2021-07-17 22:20:07+00:00,"Come help build a spot-market for hashrate, no...",come help build spot market hashrate dissimila...,London | New York,US,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [264]:
df[df['km'] == 3]['cleaned_tweets'].value_counts()

come help build spot market hashrate dissimilar compute power version henry remote cloud infrastructure engineer kubernetes                                1
come help build spot market hashrate dissimilar compute power version henry remote data engineer equity                                                    1
come help build spot market hashrate dissimilar compute power version henry check this remote data engineer role working with data elasticsearch hadoop    1
Name: cleaned_tweets, dtype: int64

In [265]:
word_frequencies_by_cluster = df.groupby('km').sum()

In [266]:
word_frequencies_by_cluster.head()

Unnamed: 0_level_0,tweet_id,polarity,ability,able,abroad,absence,absolutely,academia,accelerate,accelerated,...,yorker,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom
km,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.689141e+19,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.514698e+22,5244.0,31.0,45.0,3.0,4.0,12.0,5.0,4.0,13.0,...,3.0,19.0,4.0,3.0,39.0,3.0,35.0,3.0,29.0,76.0
2,9.207637e+19,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.247696e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [267]:
#https://stackoverflow.com/questions/27889873/clustering-text-documents-using-scikit-learn-kmeans-in-python
print("Top terms per cluster:")
order_centroids = km_cv.cluster_centers_.argsort()[:, ::-1]
terms = cv.get_feature_names()
for i in range(4):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print( ' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 warn
 horizon
 putting
 staying
 cyber
 potential
 secure
 risk
 expert
 contact
 anywhere
 pandemic
 tip
 help
 employee
 work
 miller
 corp
 consulting
 industrial
Cluster 1:
 apply
 hiring
 remotely
 today
 working
 manager
 developer
 product
 start
 customer
 great
 post
 stay
 money
 week
 look
 experience
 opportunity
 competitive
 lead
Cluster 2:
 small
 effort
 rabbitrun
 bureaucracy
 sized
 business
 decided
 baby
 owner
 patio
 enjoyable
 kitchen
 located
 teamwork
 purpose
 folding
 writes
 yellow
 newport
 scalable
Cluster 3:
 dissimilar
 hashrate
 compute
 henry
 version
 spot
 elasticsearch
 power
 market
 build
 come
 kubernetes
 data
 infrastructure
 equity
 help
 engineer
 cloud
 role
 remote


### TFIDF

In [268]:
#https://stackoverflow.com/questions/37593293/how-to-get-tfidf-with-pandas-dataframe
tfidf = TfidfVectorizer(max_features=5000,stop_words=stopwords.words('english'))

In [269]:
X_tfidf = tfidf.fit_transform(X.values.astype('U'))

In [270]:
len(tfidf.get_feature_names())

5000

In [271]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns = tfidf.get_feature_names())

In [272]:
df1 = pd.concat([originaltweets, X_tfidf_df], axis=1)

In [273]:
df1 = df1.reset_index(drop=True)

In [274]:
df1.head()

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,ability,able,abroad,...,yorker,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",US,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1416180635903868934,2021-07-16 23:39:09+00:00,Four Ways to Energize a Post-Pandemic Workforc...,four way energize post pandemic workforce,"Chicago, IL",US,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1416180231350759425,2021-07-16 23:37:33+00:00,🚑 These are the tools that will save your #Rem...,these tool that will save your google meet too...,"Duluth, GA",US,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [275]:
tfidf_col = df1.columns[7:]
tfidf_col_names = df1[tfidf_col]

In [276]:
X_tfidf=tfidf_col_names
sc = StandardScaler()
X_tfidf_sc = sc.fit_transform(X_tfidf)
X_tfidf_sc[0:1]

array([[-0.05189852, -0.0618364 , -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675]])

In [277]:
from sklearn.cluster import KMeans
km_tfidf = KMeans(n_clusters=15, random_state=42)
km_tfidf.fit(X_tfidf_sc)

KMeans(n_clusters=15, random_state=42)

In [278]:
for i in range(3, 16):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_tfidf_sc)
    print(i, kmeans.score(X_tfidf_sc), silhouette_score(X_tfidf_sc, kmeans.labels_))

3 -53853204.150502466 0.13619154215049453
4 -53784655.24951849 0.1403690832910357
5 -53822639.56548784 0.0025900435378184493
6 -53767219.906884596 -0.07658938270820985
7 -53788770.83686704 -0.07345027989711467
8 -53707294.91039878 -0.07294968911090385
9 -53686976.698391624 -0.0757667773964295
10 -53692503.896410026 -0.057694497174397244
11 -53696611.78304146 0.0043156890992473605
12 -53670807.706592105 -0.04295640326806475
13 -53652955.80997499 -0.08739204369348107
14 -53617808.062422514 -0.07361686569212382
15 -53546938.43827386 -0.06863490276000554


In [279]:
km_tfidf.cluster_centers_

array([[-0.05189852, -0.0618364 , -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675],
       [ 0.00622114,  0.00611376,  0.00189068, ...,  0.00198222,
        -0.00679986,  0.00945745],
       [-0.05189852, -0.0618364 , -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675],
       ...,
       [-0.05189852, -0.0618364 , -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675],
       [-0.05189852, -0.0618364 , -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675],
       [-0.05189852, -0.04845096, -0.01577257, ..., -0.01653625,
        -0.05011193, -0.07889675]])

In [280]:
df1['km']=km_tfidf.labels_

In [281]:
df1['km'].value_counts()

1     9627
14     934
4      156
2       17
8       12
11      10
13       9
0        3
12       3
10       2
7        2
3        2
6        2
9        1
5        1
Name: km, dtype: int64

In [282]:
silhouette_score(X_tfidf_sc, km_tfidf.labels_)

-0.06863490276000554

In [283]:
word_frequency_by_cluster = df1.groupby('km').sum()

In [284]:
word_frequency_by_cluster.head()

Unnamed: 0_level_0,tweet_id,polarity,ability,able,abroad,absence,absolute,absolutely,academia,accelerate,...,yorker,young,younger,youth,youtube,zdnet,zero,zoho,zone,zoom
km,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.24534e+18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.363577e+22,4861.0,8.955053,12.425575,1.504476,1.651445,1.192606,1.544211,0.726604,0.891511,...,1.438302,6.172018,1.925429,0.364912,6.458717,1.38673,7.313449,1.2647,7.458538,23.928749
2,2.407112e+19,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.833162e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.209219e+20,55.0,0.0,0.0,0.0,0.0,0.0,0.0,1.155528,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.205396,0.0


In [285]:
print("Top terms per cluster:")
order_centroids = km_tfidf.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(14):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print( ' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 designing
 elementor
 landing
 watching
 segment
 converting
 securely
 page
 return
 transition
 helping
 meet
 high
 expert
 shift
 leader
 making
 space
 office
 tip
Cluster 1:
 office
 working
 work
 employee
 home
 company
 hybrid
 time
 competitive
 meeting
 back
 productivity
 many
 compensation
 workplace
 benefit
 tip
 future
 keep
 read
Cluster 2:
 rule
 publication
 crash
 sunlight
 welcoming
 planned
 regulation
 rely
 dropping
 guideline
 minister
 toddler
 headline
 successful
 amount
 whose
 upgrade
 ground
 earnings
 updated
Cluster 3:
 loophole
 exploiting
 ordinary
 algorithm
 perpetual
 continuous
 clip
 secret
 month
 revenue
 exactly
 following
 right
 enjoy
 video
 income
 create
 people
 learn
 find
Cluster 4:
 alert
 market
 representative
 europe
 cvedia
 argyle
 italian
 development
 remotely
 europea
 compute
 hashrate
 dissimilar
 django
 sale
 unitedhealth
 telemarketing
 metabolic
 impacting
 outbound
Cluster 5:
 temora
 

### Word2Vec

In [286]:
corpus = api.load('text8')
model = Word2Vec(corpus)

In [287]:
#Code written by Caroline
def get_avg_vec(text):
    try:
        # make a list of only the words in the document that are in the word2vec vocabulary
        valid_words = [word for word in text.lower().split() if word in model.wv.key_to_index]
        print(model.wv.vocab)
        if len(valid_words) == 0:
            print();
            return np.zeros(shape=(100,))
        else:
            # return average word vector, for words in the document that exist in the vocab
            return np.mean([model.wv.get_vector(word) for word in valid_words], axis=0)
    except AttributeError:
        return np.zeros(shape=(100,))

In [288]:
avg_vecs = [get_avg_vec(doc) for doc in X]

In [289]:
vec_df = pd.DataFrame(avg_vecs)

In [290]:
vec_df.shape, originaltweets.shape

((10781, 100), (10781, 7))

In [291]:
df_vec = pd.concat([originaltweets, vec_df], axis=1)

In [292]:
df_vec = df_vec.reset_index(drop=True)

In [293]:
df_vec.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,0,1,2,...,90,91,92,93,94,95,96,97,98,99
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",US,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [294]:
vec_col = df_vec.columns[7:]
vec_col_names = df_vec[vec_col]

In [295]:
X_vec=vec_col_names
sc = StandardScaler()
X_vec_sc = sc.fit_transform(X_vec)
X_vec_sc[0:1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [296]:
for i in range(3, 16):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_vec_sc)
    print(i, kmeans.score(X_vec_sc), silhouette_score(X_vec_sc, kmeans.labels_))

  kmeans.fit(X_vec_sc)


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
from sklearn.cluster import KMeans
km_vec = KMeans(n_clusters=4, random_state=42)
km_vec.fit(X_vec_sc)

In [None]:
km_vec.cluster_centers_

In [None]:
df_vec['km']=km_vec.labels_

In [None]:
df_vec['km'].value_counts()

In [None]:
silhouette_score(X_vec_sc, km_vec.labels_)

In [None]:
frequency_by_cluster = df_vec.groupby('km').sum()

In [None]:
frequency_by_cluster.head()

In [None]:
print("Top terms per cluster:")
order_centroids = km_vec.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()
for i in range(14):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print( ' %s' % terms[ind])