In [15]:
#Import libraries
import pandas as pd
import numpy as np
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

In [16]:
#Read in 'final_df'
final_df = pd.read_csv("../dataset/final_df.csv")

In [17]:
#Check shape
final_df.shape

(22153, 9)

In [18]:
#Drop Unnamed: 0 column
final_df.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)

In [19]:
final_df.head()

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity,user_location_cleaned
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",1,US
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,1,US
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,1,HK
3,1416180635903868934,2021-07-16 23:39:09+00:00,Four Ways to Energize a Post-Pandemic Workforc...,four way energize post pandemic workforce,"Chicago, IL",1,US
4,1416180231350759425,2021-07-16 23:37:33+00:00,🚑 These are the tools that will save your #Rem...,these tool that will save your google meet too...,"Duluth, GA",0,US


### Word2Vec

In [20]:
corpus = api.load('text8')
model = Word2Vec(corpus)

In [21]:
#Code written by Caroline
def get_avg_vec(text):
    try:
        # make a list of only the words in the document that are in the word2vec vocabulary
        valid_words = [word for word in text.lower().split() if word in model.wv.key_to_index]
        
        if len(valid_words) == 0:
            print();
            return np.zeros(shape=(100,))
        else:
            # return average word vector, for words in the document that exist in the vocab
            return np.mean([model.wv.get_vector(word) for word in valid_words], axis=0)
    except AttributeError:
        return np.zeros(shape=(100,))

In [22]:
avg_vecs = [get_avg_vec(doc) for doc in final_df['cleaned_tweets']]






















In [23]:
vec_df = pd.DataFrame(avg_vecs)

In [24]:
vec_df.shape, final_df.shape

((22153, 100), (22153, 7))

In [25]:
full_df = pd.concat([final_df, vec_df], axis=1)

In [26]:
full_df = full_df.reset_index(drop=True)

In [27]:
full_df.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity,user_location_cleaned,0,1,2,...,90,91,92,93,94,95,96,97,98,99
0,1416181616846811137,2021-07-16 23:43:03+00:00,🏡 #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal which mean need re...,"New York, NY",1,US,-0.232892,0.045137,0.252022,...,-0.559527,0.526009,-0.370559,0.203635,0.576165,0.588381,-0.521967,-0.337719,-0.641744,1.233857
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,1,US,-0.406955,-0.010148,0.524925,...,-0.321837,0.015106,-0.219817,0.072689,0.741292,0.199893,0.198371,-0.229032,-0.277164,0.456395
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open this excellent report from remo...,Sydney | Hong Kong | Singapore,1,HK,-0.540556,-0.015105,-0.010053,...,-0.773558,0.117311,0.255292,-0.049898,0.209365,0.392741,0.298359,-0.129394,-0.109167,1.18649


In [28]:
vec_col_names = full_df.columns[8:]

In [29]:
vec_cols = full_df[vec_col_names]

In [30]:
X = vec_cols
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled[0:1]

array([[ 0.45933389,  0.11893154,  0.02325688, -0.75199593, -0.67682415,
         0.50934007,  0.59468718,  0.71066326, -0.3708681 , -0.51910236,
         0.67516206,  0.43541813,  0.41514716, -0.9015428 , -0.20261201,
        -0.26893491, -0.38616375,  0.41263822,  0.13694739,  0.77472625,
         0.01615374,  0.02214339,  0.23288378,  0.45427049, -0.59009405,
         0.92692091,  0.70781464,  0.20497093, -0.51845983, -1.4574654 ,
        -0.02473626,  0.77509429, -0.04896273, -0.66941777,  0.68397035,
         0.16092113,  0.19443621,  0.62227035, -0.33920267, -1.041773  ,
        -0.14086848,  0.48966783,  0.07168247, -0.15173519,  1.16124908,
         0.89558642,  0.84417229, -0.44070965,  0.5420418 , -0.34129092,
        -0.29176251,  0.74713209,  0.48391114, -0.07007924, -0.90146416,
         0.0716813 ,  0.16994103,  0.96911958, -0.43569408,  1.19528877,
        -0.88284201, -0.0197148 , -0.85535173,  0.31247135, -0.81075215,
         0.38221127, -0.52140022, -0.47575838,  0.4

In [31]:
db = DBSCAN(eps=0.9, min_samples=20)
db.fit(X_scaled)

DBSCAN(eps=0.9, min_samples=20)

In [32]:
full_df['cluster_labels'] = db.labels_

In [33]:
full_df['cluster_labels'].value_counts()

-1     19845
 16      175
 45      153
 17      120
 3        97
 33       78
 14       70
 39       66
 18       61
 48       59
 11       57
 38       53
 36       53
 42       49
 49       48
 47       46
 10       45
 0        44
 5        43
 41       42
 40       41
 4        38
 8        37
 20       36
 24       36
 46       35
 44       34
 22       33
 32       33
 30       32
 23       31
 19       31
 31       30
 37       30
 34       29
 43       29
 50       29
 27       28
 13       28
 12       27
 6        27
 21       27
 29       26
 25       25
 51       24
 26       23
 35       22
 7        22
 2        22
 1        22
 28       21
 15       21
 9        20
Name: cluster_labels, dtype: int64

In [40]:
full_df[full_df['cluster_labels'] == 3]

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity,user_location_cleaned,0,1,2,...,91,92,93,94,95,96,97,98,99,cluster_labels
106,1416142942436302850,2021-07-16 21:09:22+00:00,RT @IncluzionJobs: .@mkristensen tweeted a new...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
107,1416142907959128069,2021-07-16 21:09:14+00:00,RT @IncluzionJobs: .@SEOcopy tweeted a new rem...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
108,1416142847770832896,2021-07-16 21:09:00+00:00,RT @IncluzionJobs: .@sdsnyouthusa tweeted a ne...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
109,1416142813482430464,2021-07-16 21:08:52+00:00,RT @IncluzionJobs: .@AgilityCMS tweeted a new ...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
110,1416142804280123396,2021-07-16 21:08:49+00:00,RT @IncluzionJobs: .@OSFramework tweeted a new...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5173,1414663122258997252,2021-07-12 19:09:06+00:00,RT @IncluzionJobs: .@bruvik tweeted a new remo...,tweeted remote,"Ontario, CA",0,US,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
5174,1414663057586933764,2021-07-12 19:08:50+00:00,RT @IncluzionJobs: .@Kierstensharris tweeted a...,tweeted remote,Everywhere,0,,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
5505,1416127198239526920,2021-07-16 20:06:49+00:00,RT @IncluzionJobs: .@NextRecNow tweeted a new ...,tweeted remote,Everywhere,0,,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3
12630,1415359945214730240,2021-07-14 17:18:01+00:00,RT @IncluzionJobs: .@CODEmagazine tweeted a ne...,tweeted remote,"Berlin, Deutschland",0,,-0.330702,1.24837,-0.322343,...,0.584491,0.00468,0.035828,1.486194,0.205414,0.690353,-1.712818,-0.462781,1.824123,3


In [41]:
full_df['tweets'].loc[5174]

'RT @IncluzionJobs: .@Kierstensharris tweeted a new remote job.\n\n#incluzion #remotejobs #wfh #ReignyDayJobs #remotework #workfromhome #remot…'