In [None]:
!python -m pip install pymongo
!python -m pip install pandas
!python -m spacy download en_core_web_sm


In [None]:
import itertools
import pymongo
import pandas as pd
import spacy
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [None]:
# This code was taken from Stack Overflow: https://stackoverflow.com/a/16255680/10392851
# It connects to a MongoDB database and reads data from it into a pandas DataFrame.

def _connect_mongo(host, port, username, password, db):
    """Open a connection to MongoDB and return a handle to one database.

    Args:
        host: Host name or address of the MongoDB server.
        port: Port of the MongoDB server.
        username: User name (unescaped). If falsy, no credentials are sent.
        password: Password (unescaped). If falsy, no credentials are sent.
        db: Name of the database to return.

    Returns:
        The database object obtained as ``client[db]``.
    """
    # Imported locally so the function does not depend on another cell.
    from urllib.parse import quote_plus

    if username and password:
        # Credentials must be percent-escaped before they are placed in the URI;
        # characters such as '@', ':' or '/' would otherwise corrupt it.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = pymongo.MongoClient(mongo_uri)
    else:
        conn = pymongo.MongoClient(host, port)

    return conn[db]

def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run a query against a MongoDB collection and return the result as a DataFrame.

    Args:
        db: Name of the database.
        collection: Name of the collection inside ``db``.
        query: Filter passed to ``find``; the default ``{}`` is passed unchanged.
        host: Host name or address of the MongoDB server.
        port: Port of the MongoDB server.
        username: Optional user name, forwarded to ``_connect_mongo``.
        password: Optional password, forwarded to ``_connect_mongo``.
        no_id: If true, the ``_id`` column is removed from the result.

    Returns:
        A ``pandas.DataFrame`` with one row per returned document. It is empty
        (no rows, no columns) when the query returns nothing.
    """
    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = database[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id. An empty result produces a frame without any columns, so the
    # column is only removed when it exists (an unconditional `del` raised KeyError).
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [None]:
# Load the documents of one MongoDB collection into a pandas DataFrame.

# Replace the placeholders below with the names of the database and the
# collection on your MongoDB server.
database_name = "<INPUT>"
collection_name = "<INPUT>"
data = read_mongo(db=database_name, collection=collection_name)

In [None]:
# Keep only original tweets, i.e. rows that are neither retweets nor quotes.
# A row counts as a retweet / quote when the corresponding column is non-null.
is_retweet = data['retweeted_status'].notna()
is_quote = data['quoted_status'].notna()

# .copy() detaches the result from `data`, so a column can be assigned on it
# later without pandas raising SettingWithCopyWarning.
data_only_selftweets = data[~is_retweet & ~is_quote].copy()

data_only_selftweets


In [None]:
# The next few cells reshape the data so that one column holds the full text of
# every tweet. Shorter tweets keep their text in "text", whereas longer ones store
# it under the "full_text" key of the dict in "extended_tweet".

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of data_only_selftweets (SettingWithCopyWarning).
data_only_selftweets_text = data_only_selftweets[['text', 'extended_tweet']].copy()

# Unpack each dict to its full text; every other value is left unchanged.
data_only_selftweets_text['extended_tweet'] = data_only_selftweets_text['extended_tweet'].apply(
    lambda x: x["full_text"] if isinstance(x, dict) else x
)

In [None]:
# Fill the gaps in "extended_tweet" with the value from "text", so that
# "extended_tweet" carries a text for every row.
short_text = data_only_selftweets_text['text']
full_text = data_only_selftweets_text['extended_tweet'].fillna(short_text)
data_only_selftweets_text['extended_tweet'] = full_text

# Keep only the combined column.
data_justtext = data_only_selftweets_text.drop(columns=['text'])

In [None]:
# This is the filtered dataframe (no retweets, no quotes) with the full text of
# every tweet in its "text" column, which is the input for the clustering.
# data_justtext has a single column ("extended_tweet"); it is selected explicitly
# as a Series, and .assign() builds a new frame instead of setting an attribute on
# what may be a slice of `data`.
data_only_selftweets = data_only_selftweets.assign(text=data_justtext['extended_tweet'])

data_only_selftweets

In [None]:
# Load the small English spaCy model without the named-entity recognizer.
nlp = spacy.load('en_core_web_sm', disable=['ner'])
print(nlp.pipeline)
print(nlp.pipe_names)

# Remove two more components from the pipeline.
for component_name in ('tagger', 'parser'):
    nlp.remove_pipe(component_name)

# Print the pipeline again to check that the components are gone.
print(nlp.pipeline)


In [None]:
# Tokenization and normalization helpers, used to improve the clustering

def spacy_tokenize(string):
    """Process ``string`` with the module-level ``nlp`` pipeline and return its tokens as a list."""
    return list(nlp(string))

def normalize(tokens):
    """Turn tokens into a list of normalized strings.

    Tokens that are neither alphabetic nor digits are skipped. A kept token
    contributes its lower-cased, stripped lemma, unless the lemma is the
    placeholder "-PRON-", in which case the lower-cased token text is used.
    """
    result = []
    for tok in tokens:
        if not (tok.is_alpha or tok.is_digit):
            continue
        if tok.lemma_ == "-PRON-":
            result.append(tok.lower_)
        else:
            result.append(tok.lemma_.lower().strip())
    return result

def tokenize_normalize(string):
    """Tokenize ``string`` with ``spacy_tokenize`` and return the result of ``normalize`` on the tokens."""
    tokens = spacy_tokenize(string)
    return normalize(tokens)
# Given a query vector, return the top-k most similar rows of a matrix,
# ranked by their cosine similarity.
def find_similar(query_vector, td_matrix, top_k = 5):
    """Rank the rows of ``td_matrix`` by cosine similarity to ``query_vector``.

    Args:
        query_vector: The query; expected to hold a single row, because the
            similarity matrix is flattened into one score per row of ``td_matrix``.
        td_matrix: Matrix whose rows are compared with the query.
        top_k: Number of best matches to return.

    Returns:
        A list of ``(row_index, similarity)`` tuples, most similar first.
    """
    cosine_similarities = cosine_similarity(query_vector, td_matrix).flatten()
    # Cut the ranking down to top_k before building the tuples, so only top_k
    # pairs are created instead of one per row.
    best_indices = cosine_similarities.argsort()[::-1][:top_k]
    return [(index, cosine_similarities[index]) for index in best_indices]

In [None]:
# Run the spaCy tokenizer over the "text" column of every tweet.
tweets_tokenized = data_only_selftweets['text'].apply(spacy_tokenize)

In [None]:
# Vectorize the tweet texts as TF-IDF weighted unigrams and bigrams, using the
# custom tokenizer and keeping at most 50000 features.
post_vals = data_only_selftweets['text']
ngram_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_normalize,
    sublinear_tf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
ngram_document_term_matrix = ngram_vectorizer.fit_transform(post_vals)

In [None]:
# This section clusters the collected tweets with k-means on their TF-IDF vectors.

document_matrix = ngram_document_term_matrix
num_clusters = 10
# init='random' draws the starting centroids at random, so without a fixed seed
# every run produces different clusters. A fixed random_state makes the result
# reproducible.
random_seed = 42
kmeans = KMeans(
    n_clusters=num_clusters,
    init='random',
    n_init=5,
    verbose=10,
    random_state=random_seed,
)
kmeans.fit(document_matrix)

In [None]:
# Print the 20 highest-weighted terms of every cluster centroid.

# For each centroid: feature indices sorted by descending weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back only on older versions.
if hasattr(ngram_vectorizer, 'get_feature_names_out'):
    terms = ngram_vectorizer.get_feature_names_out()
else:
    terms = ngram_vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i)

    for ind in order_centroids[i, :20]:
        print(' %s' % terms[ind])