# Please rememeber to put the path of file in the "with open section"


In [1]:
import pymongo
from pymongo import MongoClient
# you have to install mongodb using command line before using the code below

In [2]:
client = MongoClient('localhost', 27017)
db = client['twitter_db']
collection = db['twitter_collection']
import json
with open("tweets", "r") as f1:
    for line in f1:
        try:
            data = json.loads(line)
            # if you want to see a specific field, you can print it. 
            # if your file is big, there may be too many of these printed
            # print(data['text'])
            # insert data into MongoDB
            collection.insert_one(data)
        except:
            continue

FileNotFoundError: [Errno 2] No such file or directory: 'tweets'

# Process Data in MongoDB

In [1]:
# import packages 
import pymongo
from pymongo import MongoClient

# Define client, db, collection
client = MongoClient('localhost', 27017)
db = client['twitter_db']
collection = db['twitter_collection']

# find one record in twitter_collection
x = collection.find_one()
# print(x)

In [2]:
# check how many tweets have been retweeted 
cursor = db.twitter_collection.find({"retweeted_status": {"$exists": True}})
num_of_tweets_retweeted = 0 
for doc in cursor:
    num_of_tweets_retweeted += 1
print(num_of_tweets_retweeted)

15633


## Create several indexes in this table for fast access

If not creating indexes, it may require traversal the whole table to find the information. With index, (MongoDB stores indexes using BTree data structure), the search time complexity can be reduced a lot. (Generally, can from O(N) to O(logN))

In [3]:
# create a single filed index for users' followers_count to speed up the access for each single tweet based 
# on the number of their followers
collection.create_index([("user.followers_count", pymongo.DESCENDING)])

# create an index on "created_at"
collection.create_index([("created_at", pymongo.DESCENDING)])

# create an index on "retweeted_status.retweet_count"
collection.create_index([("retweeted_status.retweet_count", pymongo.DESCENDING)])

# create an index on "retweeted_status.reply_count"
collection.create_index([("retweeted_status.reply_count", pymongo.DESCENDING)])

# look at executionTimeMillis for search time, the smaller executionTimeMillis is better 
# cursor.explain()

'retweeted_status.reply_count_-1'

## Create another collection named "first_100_tweets_table" 

In [4]:
# Create another collection named first_100_tweets_table
collection_2 = db['first_100_tweets_table']

### The following chunk of code can only be run once, and it insert the first 100 tweets into this collection one by one

In [5]:
#first_100_tweets = db.twitter_collection.find().sort("created_at", -1).limit(100)
#for doc in first_100_tweets:
#    collection_2.insert_one(doc)

In [6]:
# find one doc in the new collection "first_100_tweets_table"
y = collection_2.find_one()
#print(y)

## Make a deep copy of the first_100_tweets_table in this program, and it will be used as a in program database, which can be accessed faster than the database

In [7]:
# make a deep copy of the first_100_tweets_table in this program, and it can be accessed faster than the database
cache_in_program = []
cursor = db.first_100_tweets_table.find()
for doc in cursor:
    cache_in_program.append(doc)

## Make some queries using MongoDB

In [8]:
# find the number of tweets in this database 
cursor = db.twitter_collection.find()
num_of_tweets = 0 
for doc in cursor:
    num_of_tweets += 1
print(num_of_tweets)

19171


In [9]:
# find the content of the newest tweet
newest_tweet = db.first_100_tweets_table.find().limit(1) 
for doc in newest_tweet:
    content_of_newest_tweet = doc['text']
    print(content_of_newest_tweet)

RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…


In [10]:
# find the time of the latest tweet created in this database
newest_tweet = db.first_100_tweets_table.find().limit(1) 
for doc in newest_tweet:
    created_time_of_newest_tweet = doc['created_at']
print(created_time_of_newest_tweet)

Wed Apr 15 00:56:34 +0000 2020


In [11]:
# find the user id of the user who has the largest number of followers in this database
user_with_largest_num_of_followers = db.twitter_collection.find({}, {"user.id", "user.name", "user.followers_count"}).sort("user.followers_count", -1).limit(1)
for doc in user_with_largest_num_of_followers:
    user_id_with_largest_num_of_followers = doc['user']['id']
    user_name_with_largest_num_of_followers = doc['user']['name']
print("user id: " + str(user_id_with_largest_num_of_followers) + "\nuser name: " + user_name_with_largest_num_of_followers)

user id: 1115874631
user name: CGTN


In [12]:
# find how many users in this database have more than 100k followers
cursor = db.twitter_collection.find({"user.followers_count": {"$gt": 100000}}, {"user.id", "user.followers_count"})
num_of_users_with_gt100k_followers = 0 
for doc in cursor:
    num_of_users_with_gt100k_followers += 1
print(num_of_users_with_gt100k_followers)

167


In [13]:
# find the tweets with most retweets
most_retweets = db.twitter_collection.find({}, {"retweeted_status.text", "retweeted_status.retweet_count"}).sort("retweeted_status.retweet_count", -1).limit(5)
for doc in most_retweets:
    tweets_with_most_retweets = doc['retweeted_status']['text']
    print(tweets_with_most_retweets)

All the other countries are making the US look like it’s being ran by a moron
All the other countries are making the US look like it’s being ran by a moron
All the other countries are making the US look like it’s being ran by a moron
All the other countries are making the US look like it’s being ran by a moron
All the other countries are making the US look like it’s being ran by a moron


In [14]:
# find tweets made by users who are verified
verified = db.twitter_collection.find({"user.verified": {"$exists": True}})
for doc in verified:
    verified_tweets = doc['text']
    verified_tweets_name = doc['user']['name']
    #print("Name:", verified_tweets_name, "\nTweet:", verified_tweets)

In [15]:
# get the average character length of all the tweets
all_tweets = db.twitter_collection.find()
sum = 0
for doc in all_tweets:
    tweet = doc['text']
    sum += len(tweet)
    
average_length = sum/num_of_tweets
print("The average length of a tweet in this database is",average_length, "characters.")

The average length of a tweet in this database is 123.95425382087528 characters.


In [16]:
# find tweets with the most replies (most controversial)
most_replies = db.twitter_collection.find({}, {"retweeted_status.text", "retweeted_status.reply_count"}).sort("retweeted_status.reply_count", -1).limit(5)
for doc in most_replies:
    length_of_tweets_with_most_replies = len(doc['retweeted_status']['text'])
    print(length_of_tweets_with_most_replies)
    tweets_with_most_replies = doc['retweeted_status']['text']
    print(tweets_with_most_replies)
    print()

140
Everyone, young and old, needs to act now to slow the spread of COVID-19.  The best thing Americans can to do fight… https://t.co/t9BooCw8QS

140
Everyone, young and old, needs to act now to slow the spread of COVID-19.  The best thing Americans can to do fight… https://t.co/t9BooCw8QS

140
Everyone, young and old, needs to act now to slow the spread of COVID-19.  The best thing Americans can to do fight… https://t.co/t9BooCw8QS

140
Preliminary investigations conducted by the Chinese authorities have found no clear evidence of human-to-human tran… https://t.co/1GHUbI2YXm

140
Preliminary investigations conducted by the Chinese authorities have found no clear evidence of human-to-human tran… https://t.co/1GHUbI2YXm



In [17]:
# # getting tweets that are blow a certain character length
# length = 40
# all_tweets = db.twitter_collection.find({}, {"retweeted_status", "retweeted_status.text"})
# for doc in all_tweets:
#     # if tweet is less than desired length, add it
#     if (len(doc['retweeted_status']['text']) < length):
#         short_tweets = doc['retweeted_status']['text']
#         #print(short_tweets)


In [18]:
# find tweets from people with the most followers
most_followers = db.twitter_collection.find({}, {"user.id", "user.followers_count", "text"}).sort("user.followers_count", -1).limit(2)
for doc in most_followers:
    user_id_with_most_followers = doc['user']['id']
    follower_count = doc['user']['followers_count']
    tweets_by_most_followers = doc['text']
    tweets_from_people_with_most_followers = "User ID: " + str(user_id_with_most_followers) + "\nNumber of followers: " + str(follower_count) + "\nTweet: " + tweets_by_most_followers
    print(tweets_from_people_with_most_followers)

User ID: 1115874631
Number of followers: 14024195
Tweet: Numbers from the Chinese mainland on Tuesday: one new #COVID19 death in #Hubei; 46 new cases (36 originating abroad… https://t.co/Fw6RKZoDQf
User ID: 37034483
Number of followers: 12616711
Tweet: Here's how drones helping India in fight against #coronavirus https://t.co/ATdiBectqs https://t.co/Cx6dH0Hc2O


## Prepare answers to some common questions in the program which will be put into cache later 

In [19]:
common_questions = {
    "Number of tweets in database?": num_of_tweets,
    "What is the content of the newest tweet?": content_of_newest_tweet,
    "What time is the latest tweet created in this database?": created_time_of_newest_tweet,
    "What is the user id of the user who has the largest number of followers in this database?": "user id: " + str(user_id_with_largest_num_of_followers) + "\nuser name: " + user_name_with_largest_num_of_followers,
    "What is the content of the tweet with most retweets? ": tweets_with_most_retweets,
    "How many users in this database have more than 100k followers?": num_of_users_with_gt100k_followers,
    "What is the average length of a tweet in this database?": average_length,
    "What is the tweet with most replies, and how many replies it gets?": tweets_with_most_replies + "\n" + str(length_of_tweets_with_most_replies) + " replies",
    "What is the tweet from people with most followers? ": tweets_from_people_with_most_followers
    
}

In [20]:
for i, val in common_questions.items():
    print(i, val)

Number of tweets in database? 19171
What is the content of the newest tweet? RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…
What time is the latest tweet created in this database? Wed Apr 15 00:56:34 +0000 2020
What is the user id of the user who has the largest number of followers in this database? user id: 1115874631
user name: CGTN
What is the content of the tweet with most retweets?  All the other countries are making the US look like it’s being ran by a moron
How many users in this database have more than 100k followers? 167
What is the average length of a tweet in this database? 123.95425382087528
What is the tweet with most replies, and how many replies it gets? Preliminary investigations conducted by the Chinese authorities have found no clear evidence of human-to-human tran… https://t.co/1GHUbI2YXm
140 replies
What is the tweet from people with most followers?  User ID: 37034483
Numbe

# LRU Cache for questions asked
### LRU Cache is a cache replacement algorithm that removes the least recently used data in order to make room for new data.

In [21]:
# LRU Cache requires the linkedlist structure, but Python does not have it, so we have to create a LinkedNode class
# key is the question, value is the answer to that question, next is the next question of this question in the cache
class LinkedNode:
    
    def __init__(self, key=None, value=None, next=None):
        self.key = key
        self.value = value
        self.next = next

In [22]:
# LRU Cache is a cache replacement algorithm that removes the least recently used data in order to make room for 
# new data.
class LRUCache:
    
        # This is the initial function to define 
        # Dictionary(key_to_prev): key is the question, value is the linkednode of the previous question
        # LinkedNode(dummy): the linkednode without value to point out the memory address of the linkedlist
        # tail: the tail of the linkedlist, and its initial value is dummy
        # Integer(capacity): define of the size of the cache, which is the same as the length of the linkedlist
        def __init__(self, capacity):
            self.key_to_prev = {}
            self.dummy = LinkedNode()
            self.tail = self.dummy
            self.capacity = capacity
        
        # The function push_back is to put the question node at the tail of the linkedlist
        def push_back(self, node):
            self.key_to_prev[node.key] = self.tail
            self.tail.next = node
            self.tail = node
        
        # The function pop_front is to delete the head node from the linkedlist and the next node becomes the new head
        def pop_front(self):
            head = self.dummy.next
            del self.key_to_prev[head.key]
            self.dummy.next = head.next
            self.key_to_prev[head.next.key] = self.dummy
            
        # The function kick is to move the prev node's next to the tail of the linkedlist
        def kick(self, prev): 
            node = prev.next
            if node == self.tail:
                return
        
            # remove the current node from linked list
            prev.next = node.next
            # update the previous node in hash map
            self.key_to_prev[node.next.key] = prev
            node.next = None

            self.push_back(node)
        
        # The function get is to get the value(answer) of the key(question)
        # If the question is not in the linkedlist, it will return -1, and we need go to the database to find answers
        # Else: it will return the value(answer) of the key(question)
        def get(self, key):
            if key not in self.key_to_prev:
                return -1
        
            prev = self.key_to_prev[key]
            current = prev.next
        
            self.kick(prev)
            return current.value
        
        # The function put is to put a question into the linkedlist
        def set(self, key, value):
            # If the question is in the cache, it will move the question from the original node position to the 
            # tail of the original linkedlist
            if key in self.key_to_prev:
                self.kick(self.key_to_prev[key])
                self.key_to_prev[key].next.value = value
                return
            
            # If the question is not in cache, it will be inseretd to the linkedlist
            self.push_back(LinkedNode(key, value))
            # In addition, if the cache reached its capacity, it will invaliaded the lease recently used question 
            # before inserting the new question 
            if len(self.key_to_prev) > self.capacity:
                self.pop_front()     

## Apply LRU Cache algorithm to cache of questions asked in this project

In [23]:
if __name__ == "__main__":
    # set the size of the cache be 20 questions
    LRU_Cache = LRUCache(20)

    # put several common questions into cache firstly
    for question in common_questions:
        LRU_Cache.set(question, common_questions[question])

In [24]:
LRU_Cache.get("Number of tweets in database?")

19171

In [25]:
LRU_Cache.get("What is the content of the newest tweet?")

'RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…'

In [26]:
LRU_Cache.get("What is LRU Cache?")

-1

# Search Application

## Initial attempt: Design a basic serach application UI with the questions and answers based on common_questions

In [27]:
# import some packages for user interface design 
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

# define a function f to return the answer to the input question 
def f(Question):
    print(common_questions[Question])

In [28]:
# interface for user to do question search 
def search_application():
    interact(f, Question = [x for x in common_questions])
search_application()
# import timeit
# print(timeit.timeit(search_application, number=1))

interactive(children=(Dropdown(description='Question', options=('Number of tweets in database?', 'What is the …

In [29]:
# w = widgets.Dropdown(
#     options = common_questions,
#     description = 'Please pick a question:', 
#     style = style, 
#     layout = {'width': 'max-content'})
# display(w)

## Check running time of the common questions for search application

In [30]:
def common_question_in_cache():
    print(common_questions["What is the content of the newest tweet?"])

import timeit
print(timeit.timeit(common_question_in_cache, number=1))

RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…
5.926200000061499e-05


In [31]:
def common_question_in_cache():
    print(common_questions["Number of tweets in database?"])

import timeit
print(timeit.timeit(common_question_in_cache, number=1))

19171
4.860399999984111e-05


## Improvement on Search Application based on LRU Cache and More Questions 

In [32]:
import pandas as pd
import numpy as np

In [45]:
# NewTweet is to find the created_time, user_id, user_name and content of newest tweets
class NewTweet:
    
    def __init__(self, number):
        self.number = number
    
    def find_tweet_content(self):
        newest_tweet = db.first_100_tweets_table.find().limit(self.number)
        index = 1
        result = []
        for doc in newest_tweet:
            temp = []
            content = doc['text']
            created_time = doc['created_at']
            user_id = doc['user']['id']
            user_name = doc['user']['name']
            temp.append("No. " + str(index))
            temp.append(created_time + " User ID: " + str(user_id) + " User Name: " + user_name)
            temp.append(content_of_newest_tweet)
            result.append(temp)
            index += 1
        return result

# FamousUsers is to find the user_id, user_name and number_of_followers of users with most followers
class FamousUsers:
    def __init__(self, number):
        self.number = number
    
    def find_famous_users(self):
        # find the user id of the user who has the largest number of followers in this database
        user_with_largest_num_of_followers = db.twitter_collection.find({}, {"user.id", "user.name", "user.followers_count"}).sort("user.followers_count", -1).limit(self.number)
        index = 1
        result = []
        for doc in user_with_largest_num_of_followers:
            temp = []
            user_id = doc['user']['id']
            user_name = doc['user']['name']
            number_of_followers = doc['user']['followers_count']
            temp.append("No. " + str(index))
            temp.append("User ID: " + str(user_id) + "\nUser Name: " + user_name + "\nFollowers: " + str(number_of_followers))
            result.append(temp)
            index += 1
        return result

            
from collections import Counter
# PopularWords is to find words which have been used most in this dataset
class PopularWords:
    def __init__(self, number):
        self.number = number
    
    def find_popular_words(self):
        sentences = db.twitter_collection.find({}, {"text"})
        word_dictionary = {}
        for doc in sentences:
            sentence = doc['text']
            for word in sentence.split():
                if word in word_dictionary:
                    word_dictionary[word] += 1
                else:
                    word_dictionary[word] = 1
        k = Counter(word_dictionary)
        high = k.most_common(self.number)
        index = 1
        result = []
        for word in high:
            temp = []
            temp.append("No. " + str(index))
            temp.append(word[0] + ": " + str(word[1]))
            result.append(temp)
            index += 1
        return result

# TweetsFromFamousUsers is to find tweets from people with the most followers
class TweetsFromFamousUsers:
    def __init__(self, number):
        self.number = number
        
    def find_tweets_from_famous_users(self):
        most_followers = db.twitter_collection.find({}, {"user.id", "user.followers_count", "text"}).sort("user.followers_count", -1).limit(self.number)
        index = 1
        result = []
        for doc in most_followers:
            user_id = doc['user']['id']
            follower_count = doc['user']['followers_count']
            tweet = doc['text']
            tweets_from_famous_users = "User ID: " + str(user_id) + "\nNumber of Followers: " + str(follower_count) + "\nTweet: " + tweet
            temp = []
            temp.append("No. " + str(index))
            temp.append(tweets_from_famous_users)
            result.append(temp)
            index += 1
        return result

# ValidQuestion is to find the question to its related query 
class ValidQuestion:
    
    def __init__(self, question, number):
        self.question = question
        self.number = number
    
    def get_question(self):
        if self.question == "Find Newest Tweets":
            new_tweet = NewTweet(self.number)
            return new_tweet.find_tweet_content()
        
        elif self.question == "Find Famous Users":
            famous_users = FamousUsers(self.number)
            return famous_users.find_famous_users()

        elif self.question == "Find popular words":
            popular_words = PopularWords(self.number)
            return popular_words.find_popular_words()
        
        elif self.question == "Find tweets from famous users":
            tweets_from_famous_users = TweetsFromFamousUsers(self.number)
            return tweets_from_famous_users.find_tweets_from_famous_users()
        
        else:
            print("We don't have this question in search application!")
            return

In [46]:
import ipywidgets as widgets
dropdown_count = widgets.Dropdown(options = [x for x in range(100)])

In [47]:
def dropdown_count(change):
    valid_question 
    display(df_london[df_london.year == change.new])

In [48]:
valid_question = ValidQuestion("Find Newest Tweets", 5)
valid_question.get_question()

[['No. 1',
  'Wed Apr 15 00:56:34 +0000 2020 User ID: 22091137 User Name: Basu Ghosh Das',
  'RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…'],
 ['No. 2',
  'Wed Apr 15 00:56:34 +0000 2020 User ID: 531629036 User Name: Creeds Cannon',
  'RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…'],
 ['No. 3',
  'Wed Apr 15 00:56:34 +0000 2020 User ID: 1042203452212948992 User Name: christy��️\u200d��',
  'RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…'],
 ['No. 4',
  'Wed Apr 15 00:56:34 +0000 2020 User ID: 3551287573 User Name: david lee',
  'RT @TarekFatah: Pakistanis in Karachi defying orders not to congregate in mosques by creating makeshift mosques on rooftops. Working hard t…'],
 ['No. 5',
  'Wed Apr 15 00:56:33 +0

In [49]:
# from IPython.display import display
class SearchApplicationOne:
    def __init__(self, capacity):
        self.LRU_Cache = LRUCache(capacity)
    
    def user_interface(self):
        @interact(options = widgets.Dropdown(options=['Find Newest Tweets', 'Find Famous Users', 'Find popular words', 
                                              'Find tweets from famous users'],
            value='Find Newest Tweets',
            description='Question:',
            disabled=False), 
            Top = widgets.IntSlider(min = 1, max = 100, step = 1, description = "Top: ", value = 1))

        def f(options, Top):
            if self.LRU_Cache.get((options, Top)) != -1:
                answers = self.LRU_Cache.get((options, Top))
                for answer in answers:
                    for val in answer:
                        print(val)
                    print()
            else:
                valid_question = ValidQuestion(options, Top)
                answers = valid_question.get_question()
                self.LRU_Cache.set((options, Top), answers)
                for answer in answers:
                    for val in answer:
                        print(val)
                    print()
            print(self.LRU_Cache.key_to_prev)

In [50]:
serach_appliaction = SearchApplicationOne(5)
serach_appliaction.user_interface()

interactive(children=(Dropdown(description='Question:', options=('Find Newest Tweets', 'Find Famous Users', 'F…