In [1]:
#Import libraries
import pandas as pd
import numpy as np
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

In [2]:
#Read in 'twitter_df'
twitter_df = pd.read_csv("../dataset/twitter_df.csv")

In [3]:
#Check shape
twitter_df.shape

(22153, 8)

In [5]:
#Drop Unnamed: 0 column
twitter_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [6]:
twitter_df.head()

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",US,1
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1
3,1416180635903868934,2021-07-16 23:39:09+00:00,Four Ways to Energize a Post-Pandemic Workforc...,four way energize post pandemic workforce,"Chicago, IL",US,1
4,1416180231350759425,2021-07-16 23:37:33+00:00,🚑 These are the tools that will save your #Rem...,these tool that will save your google meet too...,"Duluth, GA",US,0


In [7]:
twitter_df['user_location_cleaned'].value_counts()

US      9346
None    5284
IN      2255
GB      2242
DE       560
        ... 
SK         1
HN         1
GI         1
GP         1
KH         1
Name: user_location_cleaned, Length: 103, dtype: int64

### Word2Vec

In [8]:
corpus = api.load('text8')
model = Word2Vec(corpus)

In [9]:
#Code written by Caroline
def get_avg_vec(text):
    try:
        # make a list of only the words in the document that are in the word2vec vocabulary
        valid_words = [word for word in text.lower().split() if word in model.wv.key_to_index]
        
        if len(valid_words) == 0:
            print();
            return np.zeros(shape=(100,))
        else:
            # return average word vector, for words in the document that exist in the vocab
            return np.mean([model.wv.get_vector(word) for word in valid_words], axis=0)
    except AttributeError:
        return np.zeros(shape=(100,))

In [10]:
avg_vecs = [get_avg_vec(doc) for doc in twitter_df['cleaned_tweets']]






















In [11]:
vec_df = pd.DataFrame(avg_vecs)

In [12]:
vec_df.shape, twitter_df.shape

((22153, 100), (22153, 7))

In [13]:
full_df = pd.concat([twitter_df, vec_df], axis=1)

In [14]:
full_df = full_df.reset_index(drop=True)

In [15]:
full_df.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,0,1,2,...,90,91,92,93,94,95,96,97,98,99
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",US,1,-0.366511,0.066547,-0.012032,...,-0.134987,0.125815,0.012553,-0.024908,0.229517,0.772759,0.232187,0.158478,-0.271655,1.288791
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1,-0.387211,0.172549,0.040014,...,-0.171617,-0.465732,0.021583,-0.030787,0.464399,0.715457,0.417859,0.225182,-0.151503,0.667928
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1,-0.608523,-0.127039,-0.130338,...,-0.169229,0.375155,0.471006,-0.146473,-0.019812,0.915159,0.55278,0.393013,0.338137,1.18799


In [16]:
vec_col_names = full_df.columns[8:]

In [17]:
vec_cols = full_df[vec_col_names]

In [18]:
X = vec_cols
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled[0:1]

array([[ 0.4360485 ,  0.15054379,  0.22978401, -1.02705133, -0.53777489,
        -0.31727761,  0.94128391,  0.25681323,  0.09255891, -1.11245274,
         0.56475061, -0.25580502,  0.04954726, -0.48167907,  0.10660003,
        -0.52084498,  0.16229769, -0.40134335,  0.07167905,  0.82075667,
         0.18847967,  0.49689648, -0.11746416,  0.22020288, -1.17468133,
        -0.04646247,  0.61409439,  0.63099828, -0.28273385, -0.95541762,
        -0.67556149,  0.6219114 , -0.06722877, -0.22591564,  0.49946466,
         0.76011354,  0.21551787,  0.70589374, -0.53077102, -0.83700565,
        -0.65915156,  0.22772845, -0.09480693,  0.03483273,  1.15074505,
         1.00578387,  0.62939908, -1.05074177,  0.02739189, -0.28345159,
         0.17219831,  0.60474159,  0.61623413, -0.37604447, -0.38755314,
         0.06753014,  0.31295518,  1.3185351 , -0.78556553,  0.90113898,
        -0.78019131, -0.31247801, -1.77872629,  0.02167562, -0.80059402,
         0.27924546, -0.51065044, -0.70072105,  0.3

### DBSCAN

In [19]:
db = DBSCAN(eps=0.9, min_samples=20)
db.fit(X_scaled)

DBSCAN(eps=0.9, min_samples=20)

In [20]:
full_df['cluster_labels'] = db.labels_

In [21]:
full_df['cluster_labels'].value_counts()

-1     19845
 16      175
 34      153
 17      120
 3        97
 33       78
 14       70
 40       66
 18       61
 46       59
 11       57
 39       53
 37       53
 43       49
 47       48
 51       46
 10       45
 0        44
 5        43
 42       42
 41       41
 4        38
 8        37
 24       36
 20       36
 50       35
 45       34
 22       33
 32       33
 30       32
 23       31
 19       31
 31       30
 38       30
 35       29
 44       29
 48       29
 13       28
 27       28
 6        27
 21       27
 12       27
 29       26
 25       25
 49       24
 26       23
 36       22
 7        22
 2        22
 1        22
 15       21
 28       21
 9        20
Name: cluster_labels, dtype: int64

In [22]:
full_df[full_df['cluster_labels'] == 45]

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,0,1,2,...,91,92,93,94,95,96,97,98,99,cluster_labels
13991,1416696529390391303,2021-07-18 09:49:08+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,Burkina Faso,BF,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14001,1416683094787039234,2021-07-18 08:55:45+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,USA|EUROPE|ASIA|GLOBAL✈️,US,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14004,1416680503231582209,2021-07-18 08:45:27+00:00,"Success is not how high you have climbed, but ...",success high have climbed make positive differ...,Hamburg,IN,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14010,1416677157825351684,2021-07-18 08:32:09+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,Turkey / İstanbul,TR,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14048,1416653011804372992,2021-07-18 06:56:12+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,USA|EUROPE|ASIA|GLOBAL✈️,US,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14049,1416650261494382595,2021-07-18 06:45:17+00:00,"Success is not how high you have climbed, but ...",success high have climbed make positive differ...,Hamburg,IN,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14163,1416531592168853505,2021-07-17 22:53:44+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,"Chicago, IL",US,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14167,1416529699480633345,2021-07-17 22:46:12+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,USA|EUROPE|ASIA|GLOBAL✈️,US,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
14169,1416529482710663179,2021-07-17 22:45:21+00:00,"Success is not how high you have climbed, but ...",success high have climbed make positive differ...,Hamburg,IN,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45
15297,1414450023052554241,2021-07-12 05:02:19+00:00,RT @stratorob: Success is not how high you hav...,success high have climbed make positive differ...,USA|EUROPE|ASIA|GLOBAL✈️,US,1,-1.526718,-0.77715,-0.589884,...,-0.399362,-0.893083,-0.08526,-0.011316,1.206246,-0.492683,-1.581161,-0.062743,0.950039,45


In [23]:
full_df['tweets'].loc[22137]

'Workplace Flexibility in a Post-COVID World - https://t.co/3PX749AhaJ (via @WF_Institute) #RemoteWork #HR https://t.co/WuZ8rD3CdK'

### k-means

In [24]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=25)
km.fit(X_scaled)

KMeans(n_clusters=25)

In [25]:
km.cluster_centers_

array([[ 0.18741546,  0.14028999,  0.48671516, ..., -0.61167897,
        -0.21875977, -0.00556535],
       [-0.18426562, -0.79683273, -0.73211998, ...,  0.14526671,
         0.01188764,  0.21234578],
       [ 0.55224105,  0.8059345 ,  1.33368483, ...,  0.10100481,
         0.22900602, -0.25194267],
       ...,
       [-0.88423144,  1.0161835 ,  0.90407901, ...,  0.47258249,
        -1.0294113 ,  0.4543533 ],
       [-0.86954562, -0.90739232, -1.09949364, ...,  0.09064102,
        -0.23044143, -0.73410661],
       [-0.0670875 , -0.14582176, -0.22304136, ..., -0.10677175,
        -0.43820131, -0.16804584]])

In [26]:
full_df['km']=km.labels_

In [27]:
full_df['km'].value_counts()

18    2089
24    2054
4     1524
3     1415
6     1402
0     1388
17    1143
21    1132
19    1059
7     1058
9      979
12     949
2      918
13     795
15     746
1      693
11     626
23     576
16     478
8      400
22     305
20     278
14      98
10      32
5       16
Name: km, dtype: int64

In [28]:
full_df[full_df['km'] == 0]

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity,0,1,2,...,92,93,94,95,96,97,98,99,cluster_labels,km
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1,-0.387211,0.172549,0.040014,...,0.021583,-0.030787,0.464399,0.715457,0.417859,0.225182,-0.151503,0.667928,-1,0
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,HK,1,-0.608523,-0.127039,-0.130338,...,0.471006,-0.146473,-0.019812,0.915159,0.552780,0.393013,0.338137,1.187990,-1,0
40,1416167772875366404,2021-07-16 22:48:02+00:00,👀👋 Is this you? @toptalllc is hiring for a Dat...,this hiring data engineer worldwide,Global,,1,-0.382177,-0.688483,-0.096223,...,0.569183,0.346180,0.578697,1.245957,-0.201975,0.503347,0.003088,1.450583,-1,0
44,1416167769930883073,2021-07-16 22:48:02+00:00,👀👋 Is this you? @toptalllc is hiring for a Rea...,this hiring react native developer worldwide,Global,,1,-0.356146,-0.278961,-0.001065,...,0.551312,0.323737,0.779059,1.110619,-0.489455,0.535345,-0.279451,0.586307,-1,0
80,1416154743504719876,2021-07-16 21:56:16+00:00,🚨 .@HRExecMag's @BenefitsConf is currently see...,currently seeking speaker proposal next year e...,"Denver, CO",US,1,-0.099488,-0.176510,0.122632,...,-0.245328,0.209891,-0.174113,0.842181,0.135426,0.474888,-0.131507,1.320519,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22129,1418970994862174216,2021-07-24 16:27:03+00:00,Workplace Flexibility in a Post-COVID World - ...,workplace flexibility post covid world,South Florida,US,1,-0.566998,-0.232238,0.398531,...,-0.835927,-0.114315,0.381887,0.397489,-0.868317,-1.159313,-0.386782,0.552838,-1,0
22130,1418970972523221009,2021-07-24 16:26:57+00:00,The New Workplace Paradigm: Taking a High Toll...,workplace paradigm taking high toll worker eco...,"State College, PA",US,0,-1.052118,-0.142085,0.216530,...,-0.298820,-0.305085,0.163034,0.260499,0.586487,-0.783797,0.037178,0.711473,-1,0
22135,1418968495216340999,2021-07-24 16:17:07+00:00,RT @hrbartender: Workplace Flexibility in a Po...,workplace flexibility post covid world,"Berlin, Germany",DE,1,-0.566998,-0.232238,0.398531,...,-0.835927,-0.114315,0.381887,0.397489,-0.868317,-1.159313,-0.386782,0.552838,-1,0
22136,1418968476429996037,2021-07-24 16:17:02+00:00,Workplace Flexibility in a Post-COVID World - ...,workplace flexibility post covid world,"Gainesville, FL",US,1,-0.566998,-0.232238,0.398531,...,-0.835927,-0.114315,0.381887,0.397489,-0.868317,-1.159313,-0.386782,0.552838,-1,0


In [29]:
from sklearn.metrics import silhouette_score
sil_score = silhouette_score(X_scaled, km.labels_)
sil_score

0.030396481633765574