In [None]:
#Cluster Docs
#The task here is whether we can cluster the tweets into meaningful categories
# uses some concepts from vectorize_pca.ipynb

#To Do:
# filter out near-duplicate tweets (same text, different shortened URL)
# optimize DBSCAN

In [16]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import spacy
import pandas as pd
import numpy as np

In [3]:
# Load the labelled tweets from train.csv (path is relative to the working directory).
train_df = pd.read_csv("train.csv")
# Load spaCy's "en_core_web_md" pipeline; it is used further down to turn each tweet into a document vector.
nlp = spacy.load("en_core_web_md")

In [4]:
# Discard the columns that are not used for clustering.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
train_df = train_df.drop(columns=['keyword', 'location'], errors='ignore')

In [5]:
train_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
def vectorize(item):
    """Return the document vector for a single piece of text.

    Passes ``item`` through the module-level ``nlp`` pipeline and returns the
    ``.vector`` attribute of the resulting document.
    """
    return nlp(item).vector

In [7]:
# Get document vectors: one vector per tweet, stacked row-wise into a 2-D array.
# Reuse train_df's index so every row keeps the label of the tweet it came from.
# The frame is later combined column-wise with pd.concat(axis=1), which aligns on
# index labels rather than position; a fresh 0..n-1 index would misalign rows as
# soon as train_df is filtered (e.g. after removing duplicates).
train_vectorized = pd.DataFrame(
    np.vstack([vectorize(text) for text in train_df['text']]),
    index=train_df.index,
)

In [11]:
# Put each tweet's id beside its document-vector columns (column-wise, index-aligned concat).
final_df = pd.concat(
    objs=[train_df.loc[:, ['id']], train_vectorized],
    axis='columns',
)

In [12]:
final_df

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,1,-0.278979,0.103926,-0.099896,-0.033250,0.006406,-0.127988,0.015576,0.162036,0.019861,...,-0.308163,0.127427,-0.153923,-0.059484,0.200914,-0.077234,-0.056179,0.045646,-0.063259,-0.056399
1,4,-0.064835,0.060354,-0.166393,-0.344203,0.306547,0.060459,0.205219,-0.072670,-0.099532,...,-0.063258,0.121803,0.005085,-0.031410,0.151826,0.067330,0.063014,-0.075697,-0.233015,0.096144
2,5,-0.024668,0.059286,-0.071322,-0.074969,0.103867,-0.124814,0.033703,-0.014304,-0.062268,...,-0.208227,0.020313,0.125149,-0.040904,-0.067651,-0.032820,-0.193037,-0.054479,0.056526,0.039146
3,6,-0.167660,0.244378,0.077987,-0.135558,-0.028106,-0.211794,-0.039319,0.010273,0.064539,...,-0.182993,0.181690,0.172492,-0.032437,-0.200355,-0.176960,-0.162615,-0.040300,0.026279,-0.030382
4,7,-0.068673,0.062337,0.030137,0.077085,0.084789,0.005437,-0.042100,-0.084338,-0.041983,...,-0.056272,0.148307,0.154113,0.114311,0.023254,-0.064449,-0.169256,-0.071676,-0.166943,0.135751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.159007,-0.104432,-0.148163,0.053784,0.262106,-0.094258,-0.237223,0.077176,0.130976,...,-0.307486,0.030227,0.256498,-0.094045,-0.289813,0.033338,-0.003776,-0.099587,0.093601,0.087183
7609,10870,-0.028086,0.120067,-0.007104,-0.130004,0.085086,0.040323,0.000081,0.057641,0.009555,...,-0.226201,0.097639,-0.012005,-0.141960,-0.135343,0.050099,-0.030713,-0.038417,-0.127949,0.020050
7610,10871,0.042328,0.088143,-0.060020,-0.255870,0.167321,0.142800,0.004191,0.035462,-0.009555,...,0.100125,0.076964,-0.051817,0.002984,0.185786,0.048712,-0.043791,0.093991,-0.041435,0.056906
7611,10872,-0.030696,0.192540,-0.218216,0.058709,0.194752,-0.069732,0.053346,-0.148824,-0.168566,...,-0.141955,-0.063845,0.088926,-0.177305,-0.063511,0.014212,0.155658,-0.077130,0.040479,0.062095


In [13]:
# KMeans starts from randomly chosen centroids, so without a fixed seed the
# cluster labels differ between runs. Pin random_state so that
# Restart & Run All reproduces the same clustering. All other
# hyper-parameters (including the number of clusters) stay at the library defaults.
kmeans = KMeans(random_state=42)

In [14]:
# Fit on the vector columns only; 'id' is an identifier, not a feature.
kmeans.fit(final_df.drop(columns=['id']))

KMeans()

In [15]:
kmeans.labels_


array([5, 0, 5, ..., 0, 1, 1], dtype=int32)

In [None]:
# We do get clusters, but scikit-learn's KMeans only supports Euclidean distance, whereas ideally we want to use cosine similarity.

In [30]:
# Density-based clustering using cosine distance between document vectors.
# eps is the neighbourhood distance threshold; 0.1 is a hand-picked value (still to be tuned).
dbscan = DBSCAN(eps=0.1, metric='cosine')

In [31]:
# Cluster on the vector columns only; 'id' is an identifier, not a feature.
dbscan.fit(final_df.drop(columns=['id']))

DBSCAN(eps=0.1, metric='cosine')

In [41]:
# Map every tweet id to its DBSCAN label (-1 marks points DBSCAN left unclustered).
# labels_ is a plain array in the same row order as the data passed to fit(), so
# attach it positionally with assign() instead of wrapping it in a second
# DataFrame and relying on pd.concat's index alignment, which would misalign
# (and introduce NaNs) if final_df ever carries a non-default index.
cluster_map = final_df[['id']].assign(cluster=dbscan.labels_)

In [42]:
cluster_map.head()

Unnamed: 0,id,cluster
0,1,0
1,4,-1
2,5,0
3,6,-1
4,7,-1


In [48]:
# Join the cluster labels back onto the tweets.
# Name the key explicitly: without `on`, merge joins on every column the two
# frames have in common, so the result would silently change if either frame
# later gained another shared column.
master_df = pd.merge(cluster_map, train_df, on='id')

In [50]:
master_df.head()

Unnamed: 0,id,cluster,text,target
0,1,0,Our Deeds are the Reason of this #earthquake M...,1
1,4,-1,Forest fire near La Ronge Sask. Canada,1
2,5,0,All residents asked to 'shelter in place' are ...,1
3,6,-1,"13,000 people receive #wildfires evacuation or...",1
4,7,-1,Just got sent this photo from Ruby #Alaska as ...,1


In [70]:
master_df[master_df['cluster'] == 5]
# This is really interesting: there seem to be duplicates, and yet target is inconsistent across them

Unnamed: 0,id,cluster,text,target
246,349,5,U.S National Park Services Tonto National Fore...,0
250,355,5,U.S National Park Services Tonto National Fore...,0
251,356,5,U.S National Park Services Tonto National Fore...,1
253,360,5,U.S National Park Services Tonto National Fore...,1
259,370,5,U.S National Park Services Tonto National Fore...,0
264,381,5,Stop the Annihilation of the Salt River Wild H...,1
266,384,5,U.S National Park Services Tonto National Fore...,0
270,393,5,U.S National Park Services Tonto National Fore...,0
271,394,5,U.S National Park Services Tonto National Fore...,1


In [69]:
# To check the observation above, look the same tweets up in the original dataframe
# (confirmed in the original --> the duplicates need to be removed).
# The ids are taken from cluster 5 of master_df rather than typed in by hand, so
# this cell stays correct if the clustering is re-run and the membership changes.
cluster_5_ids = master_df.loc[master_df['cluster'] == 5, 'id']
train_df[train_df['id'].isin(cluster_5_ids)]

Unnamed: 0,id,text,target
246,349,U.S National Park Services Tonto National Fore...,0
250,355,U.S National Park Services Tonto National Fore...,0
251,356,U.S National Park Services Tonto National Fore...,1
253,360,U.S National Park Services Tonto National Fore...,1
259,370,U.S National Park Services Tonto National Fore...,0
264,381,Stop the Annihilation of the Salt River Wild H...,1
266,384,U.S National Park Services Tonto National Fore...,0
270,393,U.S National Park Services Tonto National Fore...,0
271,394,U.S National Park Services Tonto National Fore...,1


In [71]:
# Show the full, untruncated text of every tweet in cluster 5.
master_df.loc[master_df['cluster'] == 5, 'text'].tolist()

['U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... http://t.co/6LoJOoROuk via @Change',
 'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/m8MvDSPJp7 via @Change',
 'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/sW1sBua3mN via @Change',
 'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... http://t.co/KPQk0C4G0M via @Change',
 'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/MatIJwkzbh via @Change',
 'Stop the Annihilation of the Salt River Wild Horses! http://t.co/wVobVVtXKg via @Change',
 'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/x2Wn7O2a3w via @Change',
 'U.S National Park Services Tonto National Fo

Interestingly, almost every tweet in this cluster is the same text, differing only in its shortened URL (the one exception is a shorter version of the same petition title). We need some sort of rule to filter these near-duplicates out.