# Import Libraries & Connect to MongoDB

In [1]:
import glob
import configparser
import os
import csv
import time
import datetime
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Connection settings and output paths are read from the DEFAULT section of config.ini.
config = configparser.ConfigParser()
config.read('config.ini')
defaults = config['DEFAULT']
ip = defaults['IP']
port = defaults['MongoDB-Port']
output_file_aus = defaults['Output-File-Aus']
output_file_world = defaults['Output-File-World']

from pymongo import MongoClient
# Values read from the INI file are strings, so the port is converted before connecting.
client = MongoClient(ip, int(port))

In [2]:
# Handle to the "Twitter" database on the connected server.
db_twitter = client["Twitter"]
# list_collection_names() replaces collection_names(), which PyMongo deprecated
# in 3.7 and removed in 4.0.
collections_twitter = db_twitter.list_collection_names()

# Current year and week

In [3]:
# Origin of the week counter in epoch milliseconds: 2018-12-31 00:00:00 UTC.
WEEK_ONE_START_MS = 1546214400000
SECONDS_PER_WEEK = 604800

current_timestamp = int(time.time() * 1000)
current_year = datetime.datetime.now().year
print("current year : " + str(current_year))

# Whole weeks elapsed since the origin, 1-based (the first week is week 1).
# NOTE(review): the origin is fixed, so this number keeps growing past 52 in
# later years instead of restarting each year.
elapsed_ms = current_timestamp - WEEK_ONE_START_MS
current_week = int(elapsed_ms / 1000 / SECONDS_PER_WEEK) + 1
print("current week : " + str(current_week))

current year : 2019
current week : 23


# Collection : Number of records

In [4]:
# Map each already-finished weekly Australian collection to its formatted record count.
dic_collection = {}
for name in collections_twitter:
    # Only names such as "2019_W5_Twitter_Australia" are considered.
    if not (name.startswith("20") and "Australia" in name):
        continue
    try:
        year = int(name[0:4])
    except ValueError:
        continue  # the first four characters are not a year

    if year < current_year:
        # Every week of an earlier year is finished.
        is_complete = True
    else:
        # The week is the first "_..._" token minus its leading letter ("W5" -> "5").
        token = re.search('_(.+?)_', name)
        try:
            is_complete = token is not None and int(token.group(1)[1:]) < current_week
        except ValueError:
            # Non-numeric week token: skip the collection. Only this parsing
            # error is tolerated; database errors are allowed to surface.
            is_complete = False

    if is_complete:
        # count_documents() replaces Cursor.count(), which PyMongo deprecated
        # in 3.7 and removed in 4.0.
        dic_collection[name] = "{:,}".format(db_twitter[name].count_documents({}))

for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

2018_W52_Twitter_Australia: 38,065
2019_W1_Twitter_Australia: 40,880
2019_W20_Twitter_Australia: 1
2019_W2_Twitter_Australia: 37,645
2019_W4_Twitter_Australia: 59,625
2019_W5_Twitter_Australia: 61,617
2019_W6_Twitter_Australia: 61,435
2019_W7_Twitter_Australia: 28,017
2019_W8_Twitter_Australia: 60,035


# Pipeline

In [5]:
# Stage 1: keep documents whose entities.hashtags field exists and is not an empty list.
has_hashtags = {"$match": {"entities.hashtags": {"$exists": True, "$ne": []}}}
# Stage 2: keep documents whose lang field is "en".
is_english = {"$match": {"lang": "en"}}
# Stage 3: group on the whole entities.hashtags value and count the documents per group.
count_per_hashtag_list = {
    "$group": {
        "_id": "$entities.hashtags",
        "count": {"$sum": 1},
    }
}

pipeline = [has_hashtags, is_english, count_per_hashtag_list]

# Supporting Functions

In [6]:
def get_dic(dic_hashtag, data, h, i):
    """Add the count stored in ``data[i]`` to the running total for hashtag ``h``.

    Args:
        dic_hashtag: dict mapping hashtag text to its accumulated count;
            it is updated in place.
        data: sequence of records, each providing a ``"count"`` entry.
        h: hashtag text used as the dictionary key.
        i: index into ``data`` of the record whose count is added.

    Returns:
        The same ``dic_hashtag`` object, after the update.
    """
    occurrences = data[i]["count"]
    if h in dic_hashtag:
        dic_hashtag[h] += occurrences
    else:
        # First time this hashtag is seen (this also covers an empty dictionary).
        dic_hashtag[h] = occurrences
    return dic_hashtag

In [7]:
def write_csv(output_file, top_100_htag):
    """Write a hashtag -> count mapping to ``output_file`` as a two-column CSV.

    The file is overwritten and starts with a ``hashtag,count`` header row,
    followed by one row per entry in the mapping's iteration order.

    Args:
        output_file: path of the CSV file to create.
        top_100_htag: dict mapping hashtag text to its count.
    """
    csv_columns = ['hashtag','count']
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; without it the header terminator is translated again on Windows.
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()
        # Rows go through the writer (not f.write) so they share the header's
        # line terminator and values containing commas or quotes are escaped.
        writer.writerows(
            {'hashtag': hashtag, 'count': count}
            for hashtag, count in top_100_htag.items()
        )
    print(output_file + " is ready.")

# Producing the hashtag list with count

In [8]:
# Accumulate hashtag counts across all selected collections, one dictionary per region.
dic_hashtag_aus = {}
dic_hashtag_world = {}
for collection in sorted(dic_collection):
    print("-------------------")
    print("Processing on collection: " + collection)
    data = list(db_twitter[collection].aggregate(pipeline, allowDiskUse=True))
    for idx, group in enumerate(data):
        # Each group's _id is a list of hashtag entries; every entry has a "text" field.
        for entry in group["_id"]:
            h = entry["text"]
            # Skip hashtags that contain anything other than ASCII letters and digits.
            if not re.match("^[a-zA-Z0-9]*$", h):
                continue
            if "Australia" in collection:
                dic_hashtag_aus = get_dic(dic_hashtag_aus, data, h, idx)
            elif "Other" in collection:
                # NOTE(review): dic_collection appears to hold only names containing
                # "Australia", so this branch looks unreachable — confirm.
                dic_hashtag_world = get_dic(dic_hashtag_world, data, h, idx)
    print("hashtag dictionary for collection " + collection + " is finished")
    print("-------------------")

-------------------
Processing on collection: 2018_W52_Twitter_Australia
hashtag dictionary for collection 2018_W52_Twitter_Australia is finished
-------------------
-------------------
Processing on collection: 2019_W1_Twitter_Australia
hashtag dictionary for collection 2019_W1_Twitter_Australia is finished
-------------------
-------------------
Processing on collection: 2019_W20_Twitter_Australia
hashtag dictionary for collection 2019_W20_Twitter_Australia is finished
-------------------
-------------------
Processing on collection: 2019_W2_Twitter_Australia
hashtag dictionary for collection 2019_W2_Twitter_Australia is finished
-------------------
-------------------
Processing on collection: 2019_W4_Twitter_Australia
hashtag dictionary for collection 2019_W4_Twitter_Australia is finished
-------------------
-------------------
Processing on collection: 2019_W5_Twitter_Australia
hashtag dictionary for collection 2019_W5_Twitter_Australia is finished
-------------------
------------

# Get Top 100 Hashtags

In [9]:
# Rank hashtags by descending count; sorted() is stable, so ties keep dictionary order.
aus_ranked = sorted(dic_hashtag_aus.items(), key=lambda pair: pair[1], reverse=True)
world_ranked = sorted(dic_hashtag_world.items(), key=lambda pair: pair[1], reverse=True)

# Keep the first 100 (hashtag, count) pairs of each ranking as a dictionary.
aus_top_100_htag = dict(aus_ranked[:100])
world_top_100_htag = dict(world_ranked[:100])
print("Top 100 hashtags dictionary is ready")

Top 100 hashtags dictionary is ready


# Export CSV File

In [10]:
# Write one CSV per region, using the output paths read from config.ini.
for csv_path, hashtag_counts in (
    (output_file_aus, aus_top_100_htag),
    (output_file_world, world_top_100_htag),
):
    write_csv(csv_path, hashtag_counts)

aus_top_100_hashtag.csv is ready.
world_top_100_hashtag.csv is ready.
