In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import string
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction import text

INTRODUCTION:
    
My project analyzes tweets using a workflow that looks like this: 
read in data --> clean text data (regex/nltk) --> tokenize words --> 
reduce dimensions (sklearn TruncatedSVD, sklearn PCA, sklearn TfidfVectorizer) --> KMeans clustering --> check the topic of each cluster from its most common words.


The goal of this project was to use unsupervised learning techniques to draw some meaning from a large corpus of tweets about Covid-19. This project sparked my interest because I lived through the whole pandemic and saw how the panic of 2020 turned into nonchalant, casual mentions of Covid-19 by 2022. I wanted to revisit the early days of the pandemic and see what people were most concerned about by examining their tweets. 



In [None]:
#reading in data
df2 = pd.read_csv('Corona_NLP_test.csv')
df3 = pd.read_csv('Corona_NLP_train.csv', encoding='latin-1')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own 0-based index, so row labels are duplicated and
# any label-based lookup or index-aligned operation matches two rows.
df_comb = pd.concat([df2, df3], ignore_index=True)

In [None]:
# Sanity check on the concatenation: print the shape of the combined frame
# followed by the shapes of its two inputs, then preview the first rows.
for frame in (df_comb, df2, df3):
    print(frame.shape)
df_comb.head(5)

In [None]:
#renaming df to simply 'df'
# NOTE: this is an alias, not a copy -- changes to df are visible through df_comb too.
df = df_comb
#35977 after re
#8 after min_df =.1
#191 after min_df = .01
# (presumably vocabulary sizes observed at each stage -- TODO confirm)


def _strip_tweet_noise(tweet):
    """Return one tweet with mentions, links, hashtags and punctuation removed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with @mentions, http(s) links and #hashtags replaced by a
        space, apostrophe suffixes ('s, 't, ...) dropped, every punctuation
        character replaced by a space, and runs of whitespace collapsed.
    """
    tweet = re.sub(r'@\S+', ' ', tweet)
    tweet = re.sub(r'https*\S+', ' ', tweet)
    tweet = re.sub(r'#\S+', ' ', tweet)
    tweet = re.sub(r"'\w+", '', tweet)
    tweet = re.sub('[%s]' % re.escape(string.punctuation), ' ', tweet)
    tweet = re.sub(r'\s{2,}', ' ', tweet)
    return tweet


# Clean row by row. The previous loop assigned a single cleaned string to the
# whole column on every iteration, which left every row equal to the last
# tweet in the frame.
df['OriginalTweet'] = df['OriginalTweet'].apply(_strip_tweet_noise)

In [None]:
#cleaning tweets using regex

# English stop-word collection exposed by sklearn.feature_extraction.text;
# kept at module level so the cleaning code in this cell can filter against it.
stop_words = text.ENGLISH_STOP_WORDS
def text_preproc(x, stopword_set=None):
    """Normalise one tweet into lower-case, ASCII-only, stop-word-free text.

    Steps, in order: lower-case; drop non-ASCII characters; replace links,
    @mentions and #hashtags with a space; drop apostrophe suffixes ('s, 't);
    replace punctuation with spaces; split on any whitespace; remove stop
    words; re-join with single spaces.

    Stop words are removed last. Filtering them before punctuation and
    contraction stripping (and splitting only on ' ') let tokens such as
    "it?", "it's" or a stop word followed by a newline slip through.

    Parameters
    ----------
    x : str
        Tweet text.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        Cleaned tweet; words are separated by exactly one space and there is
        no leading or trailing whitespace.
    """
    if stopword_set is None:
        stopword_set = stop_words

    x = x.lower()
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    # split() with no argument splits on every whitespace run (spaces, tabs,
    # newlines), so no separate whitespace-collapsing step is required.
    return ' '.join(word for word in x.split() if word not in stopword_set)
# Run the cleaner over every tweet and keep the result in its own column.
cleaned_series = df['OriginalTweet'].apply(text_preproc)
df['CleanTweet'] = cleaned_series



In [None]:

#tokenizing the tweets
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text


my_stop_words = text.ENGLISH_STOP_WORDS
# stop_words is passed as a list: recent scikit-learn releases validate the
# parameter and accept only 'english', a list, or None (not a frozenset).
# min_df=.01 keeps only terms that occur in at least 1% of the tweets.
vectorizer = TfidfVectorizer(stop_words=list(my_stop_words), min_df = .01)
X = vectorizer.fit_transform(df['CleanTweet'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use the replacement when it exists and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term frame: one row per tweet, one column per kept term.
tokens = pd.DataFrame(X.toarray(), columns=feature_names)
tokens.head()

In [None]:
#CLEANING TOKENS WITH STEMMER AND MORE

In [None]:
from nltk.stem import SnowballStemmer

In [None]:
#putting stemmed words into a list called 'stem_words'
snow_stemmer = SnowballStemmer(language='english')

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on installs that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# One stem per vocabulary term, in vocabulary order.
stem_words = [snow_stemmer.stem(w) for w in words]
print(stem_words)

In [None]:
#Doing some LSA and Topic Modeling to try and isolate topics
from sklearn.decomposition import TruncatedSVD

In [None]:
#note this one takes a bit to run, fitting the lsa model
# random_state pins the randomized SVD solver so the 2-component projection
# (and everything plotted from it) is the same on every Restart & Run All.
lsa = TruncatedSVD(n_components=2, random_state=1)
lsa.fit(tokens)

In [None]:
#fitting x2 to use in kmeans graph display
# Refits `lsa` on the token frame and keeps the transformed rows; with
# n_components=2 each row gets two coordinates for a scatter plot.
X2 = lsa.fit_transform(tokens)

In [None]:
# Show both values: a notebook cell only displays its final expression, so the
# component count used to be computed and silently thrown away.
len(lsa.components_), X2.shape

In [None]:
# Vocabulary kept by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it is available.
list(vectorizer.get_feature_names_out()) if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names()

In [None]:
#k means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# n_clusters=8 is KMeans' default, written out so the choice is visible;
# random_state makes cluster numbering repeatable, which matters because later
# cells select rows by a specific label value.
km = KMeans(n_clusters=8, random_state=1)

In [None]:
# Fit the KMeans model on the TF-IDF token frame (one row per tweet).
km.fit(tokens)

In [None]:
#displaying kmeans in graph using this guide, note df = X2, instead of pca transform i used lsa 
#https://www.askpython.com/python/examples/plot-k-means-clusters-python
X2 = lsa.fit_transform(tokens)

# Cluster the same 2-D points that are being plotted.
label = km.fit_predict(X2)

# Read the centroids only AFTER refitting on X2. Taking them before the refit
# returned the centres of the earlier fit on the full token matrix, whose
# first two columns are unrelated to the LSA axes drawn here.
centroids = km.cluster_centers_

u_labels = np.unique(label)
for i in u_labels:
    plt.scatter(X2[label == i, 0], X2[label == i, 1], label = i)
plt.scatter(centroids[:,0], centroids[:,1], s = 80, color = 'k')
plt.legend()
plt.show()

In [None]:
#reducing X_test dimensions (cols) using PCA

In [None]:
# Dense array version of X (the vectorizer output), for estimators below
# that are given a dense array.
X_test = X.toarray()
# Last expression of the cell: displays the array's shape.
X_test.shape

In [None]:
# Print the container type of the dense array, the token frame and the raw
# vectorizer output, in that order.
for container in (X_test, tokens, X):
    print(type(container))

In [None]:
#trying to reduce X_test from (44944,191)
from sklearn.decomposition import PCA
# Keep only the first two principal components so the data can be drawn in 2-D.
pca = PCA(n_components=2)
# Fit the PCA on the dense matrix and project every row in one step.
X_PCA = pca.fit_transform(X_test)
# Last expression of the cell: displays the shape of the projected array.
X_PCA.shape

In [None]:
# trying to plot the data
xs, ys = X_PCA[:,0], X_PCA[:,1]
fig, ax = plt.subplots(figsize=(9,6))
ax.margins(.05)
ax.plot(xs, ys, marker='+', linestyle='', ms=3)
ax.set_aspect('auto')

plt.show()
#fig.savefig('tweets_after_PCA.png', dpi=fig.dpi) #to save the figure if you want 

#kmeans attempt 

# Cluster the PCA-projected points first, then read the centroids. Reading
# cluster_centers_ before this refit returned centres from a previous fit on
# different data, so the black markers did not belong to this plot.
label = km.fit_predict(X_PCA)
centroids = km.cluster_centers_
u_labels = np.unique(label)
for i in u_labels:
    plt.scatter(X_PCA[label == i, 0], X_PCA[label == i, 1], label = i)
plt.scatter(centroids[:,0], centroids[:,1], s = 80, color = 'k')
plt.legend()
plt.show()

In [None]:
#kmeans attempt 2
km2 = KMeans(n_clusters=8, random_state=1)

# Fit once, on the 2-D PCA projection that is plotted below. The earlier extra
# km2.fit(tokens) was discarded by this refit, and its 191-dimensional centres
# were what ended up being plotted as centroids.
# NOTE(review): if the clusters are meant to live in the full TF-IDF space,
# fit on `tokens` instead and draw pca.transform(km2.cluster_centers_).
label2 = km2.fit_predict(X_PCA)
centroids2 = km2.cluster_centers_
u_labels2 = np.unique(label2)
for i in u_labels2:
    fig = plt.scatter(X_PCA[label2 == i, 0], X_PCA[label2 == i, 1], label = i)
# Same marker size as the other centroid plots; s=.001 made them invisible.
plt.scatter(centroids2[:,0], centroids2[:,1], s = 80, color = 'k')
plt.legend()
plt.show()

#fig.figure.savefig('kmeans_8k.png') #to save the image if you want

In [None]:
# Rows of the combined tweet frame whose `km` cluster label equals 1.
# NOTE(review): km.labels_ reflects whichever data `km` was fitted on last --
# confirm that is the clustering meant to be inspected here.
df_comb[km.labels_==1]

In [None]:
# Map the 2-D PCA coordinates back through the fitted PCA's inverse transform.
back_track_one = pca.inverse_transform(X_PCA)

In [None]:
# Inspect the Python type of the reconstructed data.
type(back_track_one)

In [None]:
# Display the reconstructed values.
back_track_one

In [None]:
# Slice the token matrix into one frame per km2 cluster (labels 0 through 7),
# selecting rows with a boolean mask built from km2.labels_.
cluster_frames = [tokens[km2.labels_ == cluster_id] for cluster_id in range(8)]
(df_clust0, df_clust1, df_clust2, df_clust3,
 df_clust4, df_clust5, df_clust6, df_clust7) = cluster_frames


In [None]:
# Column-wise totals of the TF-IDF weights inside each cluster frame.
# Topic notes taken from the highest-weighted words:
#   cluster 0: supermarket shortages with words: sanitizer, panic, toilet, paper, buying, food
#   cluster 1: changes since covid-19 with words: covid-19, consumer, prices, pandemic, online, shopping, impact, oil, crisis
#   cluster 2: people concerned about going to grocery store: store, grocery, workers, people, 'just like going', stores, employees
#   clusters 3-7: not annotated
(clust0_words, clust1_words, clust2_words, clust3_words,
 clust4_words, clust5_words, clust6_words, clust7_words) = [
    frame.sum()
    for frame in (df_clust0, df_clust1, df_clust2, df_clust3,
                  df_clust4, df_clust5, df_clust6, df_clust7)
]

In [None]:
#printing 10 most common words in each cluster based on tfidf value
print('Cluster0 Words' + '\n',  clust0_words.sort_values(ascending=False)[:10], '\n')
print('Cluster1 Words' + '\n',  clust1_words.sort_values(ascending=False)[:10], '\n')
print('Cluster2 Words' + '\n',  clust2_words.sort_values(ascending=False)[:10], '\n')
print('Cluster3 Words' + '\n',  clust3_words.sort_values(ascending=False)[:10], '\n')
print('Cluster4 Words' + '\n',  clust4_words.sort_values(ascending=False)[:10], '\n')
print('Cluster5 Words' + '\n',  clust5_words.sort_values(ascending=False)[:10], '\n')
print('Cluster6 Words' + '\n',  clust6_words.sort_values(ascending=False)[:10], '\n')
print('Cluster7 Words' + '\n',  clust7_words.sort_values(ascending=False)[:10], '\n')

In [None]:
#testing with different number of clusters
#trying to use inertia/elbow plot with km3 as our tester

In [None]:
# Elbow-method sweep: fit KMeans for k = 1..19 on the token matrix and record
# each model's inertia (higher = worse).
inertia = list()
for num_clusters in range(1, 20):
    km3 = KMeans(n_clusters=num_clusters, random_state=1).fit(tokens)
    inertia += [km3.inertia_]
inertia

In [None]:
#plotting inertia (y) to number of clusters in kmeans (x)
# The sweep starts at k=1, so the i-th stored inertia belongs to k=i+1.
# Plotting the bare list put it at x=i, shifting the elbow left by one cluster.
cluster_counts = range(1, len(inertia) + 1)
inertia_plot = plt.plot(cluster_counts, inertia)

plt.xlabel('Num Clusters')
plt.ylabel('Inertia')
plt.savefig('inertia_plot.jpeg')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Sweep k = 2..10 and collect, per k, the KMeans inertia and the euclidean
# silhouette score so the two lists can be compared when choosing k.
sil_score, inertia = [], []

for num_clusters in range(2, 11):
    km3 = KMeans(n_clusters=num_clusters, random_state=11).fit(tokens)
    inertia += [km3.inertia_]
    score = silhouette_score(tokens, km3.labels_, metric='euclidean')
    sil_score += [score]

print(inertia)
print(sil_score)

In [None]:
#trying a DBSCAN cluster to see if it clusters in a better way than kmeans
from sklearn.cluster import DBSCAN
from random import randint
from sklearn.preprocessing import StandardScaler

In [None]:

#scaling X_PCA to X_DB
X_DB = StandardScaler().fit_transform(X_PCA)

DB_cluster = DBSCAN(eps=.1, min_samples=55).fit(X_DB)
y_pred = DB_cluster.labels_

# Build one colour per discovered cluster instead of a fixed palette of 50,
# which raised IndexError (or reused a colour) once DBSCAN found more.
# DBSCAN numbers clusters 0..k-1 and marks noise as -1; black is appended as
# the LAST palette entry so that index -1 (noise) deliberately maps to it.
n_found = int(y_pred.max()) + 1 if y_pred.size else 0
palette_rng = np.random.RandomState(0)  # fixed seed: same colours on every run
colors = np.array(
    ['#%06X' % palette_rng.randint(0, 0x1000000) for _ in range(n_found)]
    + ['#000000']
)
# A single scatter call: the earlier uncoloured scatter of the same points
# was completely covered by this one.
plt.scatter(X_DB[:, 0], X_DB[:, 1], color=colors[y_pred])

#checking how many clusters/labels the DBSCAN has because of tuning parameters
np.unique(DB_cluster.labels_)

In [None]:
# Most common words for the DBSCAN clusters... it seems like 15 clusters may
# be best... but there are only 2-3 main topics and a lot of noise.
# Only labels 0-5 are inspected; the names for clusters 6 and 7 are left
# untouched by this cell.
(df_clust0, df_clust1, df_clust2,
 df_clust3, df_clust4, df_clust5) = [
    tokens[DB_cluster.labels_ == cluster_id] for cluster_id in range(6)
]

# Column-wise tf-idf totals inside each DBSCAN cluster.
(clust0_words, clust1_words, clust2_words,
 clust3_words, clust4_words, clust5_words) = [
    frame.sum()
    for frame in (df_clust0, df_clust1, df_clust2,
                  df_clust3, df_clust4, df_clust5)
]

# Ten highest-weighted terms per cluster.
dbscan_totals = (clust0_words, clust1_words, clust2_words,
                 clust3_words, clust4_words, clust5_words)
for cluster_id, totals in enumerate(dbscan_totals):
    top_ten = totals.sort_values(ascending=False).head(10)
    print(f'Cluster{cluster_id} Words\n', top_ten, '\n')


In [None]:
#checking the most used words from the whole tweet dataset, not just a single cluster
most_pop_words_overall = tokens.sum()
# Printed explicitly: this is not the last expression of the cell, so as a
# bare expression its result was computed and then discarded.
print(most_pop_words_overall.sort_values(ascending=False)[:10])

#gathering some stuff for the presentation

# One-row Series holding the first tweet, wrapped in a list.
proj_tweet = [df['OriginalTweet'].head(1)]

# split() + join collapses all whitespace (including newlines) to single
# spaces so each example prints on one line.
test_tweet = ' '.join(df['OriginalTweet'].iloc[1].split())
print(test_tweet)

test_clean_tweet = ' '.join(df['CleanTweet'].iloc[1].split())
print(test_clean_tweet)