In [1]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import pprint
from datetime import datetime
from pymongo import MongoClient
from lrucache import LRUCache

In [3]:
# Connection string for the MongoDB server used by this notebook
MONGODB_URL = "mongodb://localhost:27017"
client = MongoClient(MONGODB_URL)
# Handle to the `tweets` collection inside the `dbms_project` database
tweets_collection = client["dbms_project"]["tweets"]

In [4]:
# Clear the entire collection
# tweets_collection.delete_many({})
# print("Collection cleared. Ready to start anew.")

In [5]:
# Unique ascending index on tweet_id: inserting a second document with the
# same tweet_id raises an error instead of creating a duplicate.
tweets_collection.create_index([("tweet_id", 1)], unique=True)

'tweet_id_1'

In [6]:
# For a retweet, read the text of the original (retweeted) tweet,
# preferring its extended tweet's full text when one is available.
def get_tweet_text(data):
    """Return the most complete text available for a tweet.

    For a retweet (text starts with 'RT' and a ``retweeted_status`` payload is
    present) the text of the original tweet is returned. Otherwise the tweet's
    own text is returned. In both cases ``extended_tweet['full_text']`` is
    preferred over the plain ``text`` field.

    Args:
        data: Tweet dictionary as parsed from JSON.

    Returns:
        The selected tweet text.

    Raises:
        KeyError: If ``data`` has no ``'text'`` key, or an ``extended_tweet``
            entry has no ``'full_text'`` key.
    """
    def _best_text(status):
        # Prefer the untruncated text when the status carries an extended payload.
        if 'extended_tweet' in status:
            return status['extended_tweet']['full_text']
        return status.get('text')

    own_text = data['text']  # deliberately raises KeyError for incomplete tweets

    if own_text.startswith('RT'):
        # `or {}` also covers an explicit null retweeted_status.
        retweet = data.get('retweeted_status') or {}
        original_text = _best_text(retweet)
        if original_text is not None:
            return original_text
        # No usable retweeted_status: fall through and treat the tweet as a
        # regular one, so its own extended text is not lost.

    return _best_text(data)


In [7]:
def get_hashtags(data):
    """Return the hashtag texts of a tweet as a list of strings.

    When the tweet text starts with 'RT' and a ``retweeted_status`` is present,
    hashtags are taken from the retweeted tweet; otherwise from the tweet
    itself. Hashtags under ``extended_tweet.entities`` take precedence over
    those under ``entities``; an empty list is returned when neither exists.
    """
    is_retweet = data['text'].startswith('RT') and 'retweeted_status' in data
    source = data['retweeted_status'] if is_retweet else data

    # Hashtags from the standard entities serve as the fallback.
    standard_tags = source.get('entities', {}).get('hashtags', [])
    extended_entities = source.get('extended_tweet', {}).get('entities', {})
    tags = extended_entities.get('hashtags', standard_tags)

    return [tag['text'] for tag in tags]

In [8]:
def read_and_insert(file_name):
    """Load tweets from a line-delimited JSON file into ``tweets_collection``.

    Each line is parsed as one JSON document. A tweet is inserted only when no
    document with the same ``tweet_id`` exists yet. Lines that are blank, are
    not valid JSON, are not JSON objects, lack a required field, or carry an
    unparsable ``created_at`` value are skipped.

    Relies on the module-level ``tweets_collection``, ``get_tweet_text``,
    ``get_hashtags`` and ``parse_date``.

    Args:
        file_name: Path of the file to read.

    Returns:
        The number of documents inserted by this call.
    """
    inserted = 0
    # Read as UTF-8 explicitly; the platform default encoding (e.g. cp1252 on
    # Windows) can fail or garble non-ASCII tweet text.
    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # Blank separator line
            try:
                data = json.loads(line)
                if not isinstance(data, dict):
                    continue  # Valid JSON, but not a tweet object

                if tweets_collection.count_documents({"tweet_id": data["id"]}) != 0:
                    continue  # Already stored

                tweet_document = {
                    "tweet_id": data["id"],
                    "text": get_tweet_text(data),
                    "hashtags": get_hashtags(data),
                    "user": {
                        "user_id": data['user']['id'],
                        "name": data['user']['name'],
                        "screen_name": data['user']['screen_name']
                    },
                    "created_at": parse_date(data['created_at'])
                }

                tweets_collection.insert_one(tweet_document)
                inserted += 1
            except (json.JSONDecodeError, KeyError, ValueError):
                # ValueError also covers a created_at string that does not
                # match the expected date format.
                continue  # Skip invalid or incomplete lines
    return inserted


In [10]:
def parse_date(date_str):
    """Convert a tweet ``created_at`` string into a timezone-aware datetime.

    The expected layout is e.g. ``'Sat Apr 25 12:21:41 +0000 2020'``;
    ``datetime.strptime`` raises ``ValueError`` for anything else.
    """
    created_at_format = '%a %b %d %H:%M:%S %z %Y'
    return datetime.strptime(date_str, created_at_format)

# Input files, each read line by line as JSON by read_and_insert
file_1 = "C:/Users/lpnhu/Downloads/694-2024-team-13/data/corona-out-2"
file_2 = "C:/Users/lpnhu/Downloads/694-2024-team-13/data/corona-out-3"

# Load every file in turn
for tweet_file in (file_1, file_2):
    read_and_insert(tweet_file)

print("Documents inserted")

Documents inserted


# Integrate cache with MongoDB 

In [11]:
tweets_collection = client.dbms_project.tweets

def fetch_tweet_from_mongodb(tweet_id):
    """Fetch one tweet document from ``tweets_collection`` by its tweet id.

    The id is matched both as given and, when it is a string holding an
    integer, in its integer form. Documents are stored with ``tweet_id`` taken
    straight from the parsed tweet JSON, so a string key would otherwise never
    match an id that was stored as a number.

    Args:
        tweet_id: Tweet id as a string or an integer.

    Returns:
        The matching document, or ``None`` when no document matches.
    """
    candidates = [tweet_id]
    if isinstance(tweet_id, str):
        try:
            numeric_id = int(tweet_id)
        except ValueError:
            numeric_id = None
        # Only add values that fit a signed 64-bit integer, the largest
        # integer type BSON can encode.
        if numeric_id is not None and -2**63 <= numeric_id < 2**63:
            candidates.append(numeric_id)

    return tweets_collection.find_one({"tweet_id": {"$in": candidates}})

# Settings passed to the LRUCache constructor
cache_options = {
    'capacity': 100,
    'ttl': 3600,
    'persistence_path': 'cache.json',
}
cache = LRUCache(**cache_options)
# NOTE(review): restore() presumably reloads entries from persistence_path — confirm in lrucache
cache.restore()

# Replacement for LRUCache.get that falls back to MongoDB on a cache miss.
# Defined as a plain function outside the LRUCache class; it is bound to a
# cache instance separately.
def get_with_mongo_fallback(self, key):
    """Return the value for ``key`` from the cache, else from MongoDB.

    A value found in MongoDB is stored in the cache before being returned.
    Returns ``None`` when the key is in neither place.
    """
    cached = LRUCache.get(self, key)  # the class's original lookup
    if cached is not None:
        return cached

    fetched = fetch_tweet_from_mongodb(key)
    if fetched is None:
        return None

    # Remember the document so the next lookup is served from the cache.
    self.put(key, fetched)
    return fetched

# Bind the fallback function to this cache instance so that cache.get(key)
# calls get_with_mongo_fallback; the function itself still reaches the class's
# original lookup through LRUCache.get.
import types
cache.get = types.MethodType(get_with_mongo_fallback, cache)

In [12]:
# Usage
# Look up one tweet through the cache; get_with_mongo_fallback queries MongoDB
# when the cache has no value for the key.
# NOTE(review): the id is a string here, while read_and_insert stores
# data["id"] exactly as parsed from JSON — confirm the stored type matches.
tweet_id = '1254022772558368768'
tweet_data = cache.get(tweet_id)

# Set up timing and logging mechanism 

In [19]:
import time
import logging

# Set up basic logging to a file
# Records at INFO level and above go to cache_performance.log.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers — confirm the file is actually written in this kernel.
logging.basicConfig(filename='cache_performance.log', level=logging.INFO)

def log_performance(start_time, end_time, operation, key, hit_or_miss=None):
    """Log how long a cache operation took, at INFO level.

    Args:
        start_time: Timestamp taken before the operation (seconds).
        end_time: Timestamp taken after the operation (seconds).
        operation: Short label for the operation, e.g. "Access".
        key: Cache key the operation was performed on.
        hit_or_miss: Optional outcome label (e.g. "hit" / "miss"); left out of
            the message when not given.
    """
    duration = end_time - start_time
    # Include the key and outcome so individual log lines can be told apart.
    details = f"key={key}"
    if hit_or_miss is not None:
        details += f", result={hit_or_miss}"
    logging.info(f"{operation} took {duration:.6f} seconds ({details})")

# Modify cache methods to include timing

In [14]:
def get_with_mongo_fallback(self, key):
    start_time = time.time()
    data = LRUCache.get(self, key)  # Call the original get method of LRUCache
    if data is not None:
        # Cache hit, log the performance
        end_time = time.time()
        log_performance(start_time, end_time, "Cache hit", key)
    else:
        # Cache miss, fetch from MongoDB and then put it in the cache
        data = fetch_tweet_from_mongodb(key)
        if data is not None:
            self.put(key, data)
        end_time = time.time()
        log_performance(start_time, end_time, "MongoDB fetch", key)
    return data


In [16]:
# Replace the get method in the LRUCache instance
import types
cache = LRUCache(capacity=100, ttl=3600, persistence_path='cache.json')
cache.restore()


In [17]:
# Function to test the cache
def test_cache_performance(cache, test_keys):
    """Time four consecutive ``cache.get`` calls per key and log each one.

    The first access of a key is labelled "miss" (no pre-loading is assumed)
    and the three repeats are labelled "hit"; the labels are expectations, not
    measured outcomes.
    """
    expected_outcomes = ("miss", "hit", "hit", "hit")
    for key in test_keys:
        for outcome in expected_outcomes:
            began = time.time()
            cache.get(key)
            finished = time.time()
            log_performance(began, finished, "Access", key, outcome)


In [20]:
# Example test_keys taken from the MongoDB 
# NOTE(review): the keys are strings, while read_and_insert stores data["id"]
# exactly as parsed from JSON — confirm the stored tweet_id type matches.
test_keys = [
    '1249403767108668930', 
    '1249403768023678982', 
    '1249403769193779202'
]

# Runs four timed lookups per key; timings are written through log_performance.
test_cache_performance(cache, test_keys)