In [1]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [39]:
import json
import os
import pprint
from datetime import datetime

from pymongo import MongoClient

In [40]:
# The connection string is read from the MONGODB_URL environment variable so
# that credentials never have to be written into the notebook. When the
# variable is not set, a MongoDB instance on localhost is used.
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodb://localhost:27017")
client = MongoClient(MONGODB_URL)
# Tweets are stored in the "tweets" collection of the "dbms_project" database.
tweets_collection = client.dbms_project.tweets

In [41]:
# Clear the entire collection: the empty filter {} matches every document.
# This is destructive and happens every time this cell is executed.
tweets_collection.delete_many({})
print("Collection cleared. Ready to start anew.")

Collection cleared. Ready to start anew.


In [42]:
# Unique index on tweet_id so that duplicate documents cannot be inserted:
# inserting a second document with an existing tweet_id raises an error.
tweets_collection.create_index("tweet_id", unique=True)

'tweet_id_1'

In [43]:
# For a retweet, read the text of the original (retweeted) tweet,
# preferring the extended tweet's full text when it is available.
def get_tweet_text(data):
    """Return the most complete text available for a tweet.

    A tweet is treated as a retweet only when its text starts with ``RT``
    *and* it carries a non-empty ``retweeted_status`` (the same rule that
    ``get_hashtags`` applies). For a retweet the text of the original tweet
    is used; otherwise the tweet's own text is used. In both cases the
    ``extended_tweet.full_text`` field is preferred over ``text`` when an
    ``extended_tweet`` is present.

    Args:
        data: A tweet as a dict (parsed JSON).

    Returns:
        The selected text string.

    Raises:
        KeyError: If ``data`` has no ``text`` field, or an ``extended_tweet``
            is present without ``full_text``.
    """
    # Looked up first so a missing 'text' raises KeyError for every input.
    text = data['text']

    # `or {}` also covers an explicit null retweeted_status.
    original = data.get('retweeted_status') or {}

    if text.startswith('RT') and original:
        if 'extended_tweet' in original:
            return original['extended_tweet']['full_text']
        # Fall back to the retweet's own text if the original has none.
        return original.get('text', text)

    # Not a real retweet (including text that merely starts with "RT"):
    # use the tweet's own extended text when it exists.
    if 'extended_tweet' in data:
        return data['extended_tweet']['full_text']
    return text


In [44]:
def get_hashtags(data):
    """Return the hashtag strings attached to a tweet.

    When the tweet text starts with ``RT`` and a ``retweeted_status`` is
    present, hashtags are read from the retweeted tweet; otherwise from the
    tweet itself. Hashtags under ``extended_tweet.entities`` take precedence
    over those under ``entities``; an empty list is returned if neither exists.

    Args:
        data: A tweet as a dict (parsed JSON).

    Returns:
        A list with the ``text`` value of each hashtag entry.
    """
    # Pick the status object whose entities should be inspected.
    is_retweet = data['text'].startswith('RT') and 'retweeted_status' in data
    status = data['retweeted_status'] if is_retweet else data

    extended_entities = status.get('extended_tweet', {}).get('entities', {})
    standard_tags = status.get('entities', {}).get('hashtags', [])
    tags = extended_entities.get('hashtags', standard_tags)

    names = []
    for tag in tags:
        names.append(tag['text'])
    return names

In [45]:
def read_and_insert(file_name):
    """Load newline-delimited tweet JSON from ``file_name`` into ``tweets_collection``.

    Each line is parsed as one JSON tweet and reduced to a document holding
    ``tweet_id``, ``text``, ``hashtags``, ``user`` (``user_id``, ``name``,
    ``screen_name``) and ``created_at``. A document is inserted only when no
    stored document already has the same ``tweet_id``.

    Lines that are blank, are not valid JSON, are not JSON objects, lack a
    required field, or carry an unparseable ``created_at`` are skipped.
    Errors raised by the database calls are not suppressed.

    Args:
        file_name: Path to a text file containing one JSON tweet per line.

    Returns:
        None.
    """
    # Explicit UTF-8: tweet text contains emoji and other non-ASCII characters,
    # which the platform-default encoding cannot always decode.
    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # Blank separator line

            # Only parsing/field extraction is guarded; the database calls
            # below stay outside so their failures are never silently dropped.
            try:
                data = json.loads(line)
                if not isinstance(data, dict):
                    continue  # Valid JSON, but not a tweet object

                tweet_document = {
                    "tweet_id": data["id"],
                    "text": get_tweet_text(data),
                    "hashtags": get_hashtags(data),
                    "user": {
                        "user_id": data['user']['id'],
                        "name": data['user']['name'],
                        "screen_name": data['user']['screen_name']
                    },
                    "created_at": parse_date(data['created_at'])
                }
            except (ValueError, KeyError):
                # ValueError covers json.JSONDecodeError (its subclass) and
                # malformed dates; KeyError covers missing fields.
                continue  # Skip invalid or incomplete lines

            already_stored = tweets_collection.count_documents(
                {"tweet_id": tweet_document["tweet_id"]}
            )
            if already_stored == 0:
                tweets_collection.insert_one(tweet_document)


In [46]:
def parse_date(date_str):
    """Parse a timestamp such as ``'Sat Apr 25 12:40:39 +0000 2020'``.

    Args:
        date_str: Timestamp laid out as weekday, month, day, time,
            UTC offset, year.

    Returns:
        A ``datetime`` carrying the UTC offset found in the string.

    Raises:
        ValueError: If ``date_str`` does not match the expected layout.
    """
    timestamp_format = '%a %b %d %H:%M:%S %z %Y'
    parsed = datetime.strptime(date_str, timestamp_format)
    return parsed

# Input files, one JSON tweet per line
file_1, file_2 = '../data/corona-out-2', '../data/corona-out-3'

# Load the files one after another, in order
for tweet_file in (file_1, file_2):
    read_and_insert(tweet_file)

print("Documents inserted")

Documents inserted


In [48]:
# Text index on the tweet body, for the $text search query.
tweets_collection.create_index([("text", "text")])
# Index on the hashtags array, for lookups by hashtag.
tweets_collection.create_index([("hashtags", 1)])
# Index on the author's screen name, for lookups by user.
tweets_collection.create_index([("user.screen_name", 1)])



'user.screen_name_1'

In [49]:
# Full-text search over the indexed "text" field for the given term.
query = "holiday"
results = tweets_collection.find({"$text": {"$search": query}})
# Print every matching document.
for result in results:
    print(result)

{'_id': ObjectId('66256f62ecffc84c2647253d'), 'tweet_id': 1254027543868510208, 'text': 'Corona declared 2020 a public holiday.', 'hashtags': [], 'user': {'user_id': 198743397, 'name': 'Wallflower (@ 🏡)', 'screen_name': 'TheAaronAlasa'}, 'created_at': datetime.datetime(2020, 4, 25, 12, 40, 39)}
{'_id': ObjectId('66256f6eecffc84c2647f017'), 'tweet_id': 1254046481507422210, 'text': '"Corona\'s holiday homework" is gotta do now, but I\'m not motivated', 'hashtags': [], 'user': {'user_id': 1252284793066143744, 'name': 'sii', 'screen_name': 'sii1313131313E'}, 'created_at': datetime.datetime(2020, 4, 25, 13, 55, 54)}
{'_id': ObjectId('66256f66ecffc84c2647714a'), 'tweet_id': 1254034781517557760, 'text': 'Looks like we r off to Poland for our summer holidays @b_wisey82', 'hashtags': [], 'user': {'user_id': 105126289, 'name': 'Paul Ley', 'screen_name': 'paul_ley'}, 'created_at': datetime.datetime(2020, 4, 25, 13, 9, 25)}
{'_id': ObjectId('66256f5eecffc84c2646e75d'), 'tweet_id': 12494077923913031

In [50]:
# Look up tweets whose hashtags list contains this exact tag.
query = "COVID19InTurkeysPrisons"
results = tweets_collection.find({"hashtags": query})

# Print every matching document.
for result in results:
    print(result)

{'_id': ObjectId('66256f5aecffc84c2646ae55'), 'tweet_id': 1249403768023678982, 'text': 'In Turkey, there are 300 thousand prisoners and 150 thousand prison employees in prisons. \nPrisons are the most risky places in case of corona virus.\nThe lives of thousands of people are at risk.\n\n#COVID19InTurkeysPrisons\n\n@SusannaCeccardi', 'hashtags': ['COVID19InTurkeysPrisons'], 'user': {'user_id': 1225145123920588805, 'name': 'efe09', 'screen_name': 'efe0927183508'}, 'created_at': datetime.datetime(2020, 4, 12, 18, 27, 25)}
{'_id': ObjectId('66256f5aecffc84c2646ae57'), 'tweet_id': 1249403769567227906, 'text': 'Turkey is so stubborn to change their mind, they want innocent babies and their mothers, journalists, lawyers, doctors, human rights activists to die! Corona virus is spreading rapidly in Turkish prisons. They need to be released immediately! @hrw \n\n#COVID19InTurkeysPrisons', 'hashtags': ['COVID19InTurkeysPrisons'], 'user': {'user_id': 1230170166614482944, 'name': 'Carpe diem', 'sc

In [51]:
# NOTE: despite its name, user_id holds a screen name; it is matched against
# the user.screen_name field below.
user_id = "MetinKa49194941"  # Example screen name
tweets_by_user = list(tweets_collection.find({'user.screen_name': user_id}))
print("Tweets by user:", tweets_by_user)

Tweets by user: [{'_id': ObjectId('66256f5becffc84c2646b97c'), 'tweet_id': 1249404589624958979, 'text': 'Waiting for the evacuation, there are\n\n✅780 babies\n✅2,500 children\n✅Thousands of political prisoners\n✅1,333 Patients\n✅457 Severe patients\n\nLet these people be evacuated before they are infected by corona virus.\n\n#COVID19InTurkeysPrisons https://t.co/27PF0A1NZR', 'hashtags': ['COVID19InTurkeysPrisons'], 'user': {'user_id': 1245408169456807936, 'name': 'Metin Kara', 'screen_name': 'MetinKa49194941'}, 'created_at': datetime.datetime(2020, 4, 12, 18, 30, 41)}, {'_id': ObjectId('66256f5becffc84c2646ba2e'), 'tweet_id': 1249404645258256387, 'text': 'Sanchez-Amor EP Rapporteur for Turkey:\n\'\' Due to the corona virus,the Turkish government\'s plan to bring amnesty is positive.\n\nIt\'s time to release hundreds of journalists, academics, lawyers and human rights defenders who have been unjustly arrested. "\n\n#COVID19InTurkeysPrisons', 'hashtags': ['COVID19InTurkeysPrisons'], 'use