## Import Libraries

In [1]:
import csv
import re
import warnings
# Ignore every DeprecationWarning for the rest of the session (the filter is process-wide).
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read Config File

In [None]:
import configparser

# Connection settings are read from `config.ini` in the current working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not loaded.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found in the current working directory")

# Every setting is looked up in the [DEFAULT] section; a missing key raises KeyError.
ip = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']  # still a string here; converted to int where it is used
db_name = config['DEFAULT']['DB-Name']

## Connect to MongoDB

In [None]:
from pymongo import MongoClient

# Open a client against the configured host; the port was read from the config
# file as text, so it is converted to an integer for the driver.
client = MongoClient(host=ip, port=int(port))

In [2]:
# Select the database whose name was read from the config file.
db_twitter = client[db_name]
# Database.collection_names() is deprecated (and removed in PyMongo 4);
# list_collection_names() is the supported replacement.
collections_twitter = db_twitter.list_collection_names()

## Number of Records per Collection

In [3]:
# Map each collection name to its document count, formatted with thousands
# separators. Cursor.count() is deprecated (and removed in PyMongo 4), so the
# count is taken with Collection.count_documents({}) instead.
dic_collection = {
    name: "{:,}".format(db_twitter[name].count_documents({}))
    for name in collections_twitter
}

# Report the counts in alphabetical order of collection name.
for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

Twitter_2017: 25,153
Twitter_2018: 25,456
Twitter_2019: 7,997


## Pipeline

In [4]:
# Aggregation stages run against each collection:
#   1. keep documents whose `entities.hashtags` field exists and is not an empty list,
#   2. keep documents whose `lang` is "en",
#   3. group by the complete `entities.hashtags` value and count the documents per group.
pipeline = [
    {"$match": {"entities.hashtags": {"$exists": True, "$ne": []}}},
    {"$match": {"lang": "en"}},
    {"$group": {"_id": "$entities.hashtags", "count": {"$sum": 1}}},
]

## Supporting Functions

In [5]:
def get_dic(dic_hashtag, data, h, i):
    """Add the count of aggregation result ``data[i]`` to hashtag ``h``.

    Args:
        dic_hashtag: Dict mapping hashtag text to its running total; updated in place.
        data: Sequence of records, each exposing a ``"count"`` entry.
        h: Hashtag text used as the dictionary key.
        i: Index into ``data`` of the record whose count is added.

    Returns:
        The same ``dic_hashtag`` object, after the update.
    """
    occurrences = data[i]["count"]
    if h in dic_hashtag:
        dic_hashtag[h] += occurrences
    else:
        # First sighting of this hashtag: start its total at this record's count.
        dic_hashtag[h] = occurrences
    return dic_hashtag

In [6]:
def write_csv(output_file, top_100_htag):
    """Write hashtag counts to a CSV file with ``hashtag`` and ``count`` columns.

    Args:
        output_file: Path (str) of the CSV file to create; an existing file is
            overwritten.
        top_100_htag: Mapping of hashtag text to its count. Rows are written in
            the mapping's iteration order.

    Side effects:
        Creates/overwrites ``output_file`` and prints a completion message.
    """
    csv_columns = ['hashtag', 'count']
    # newline='' lets the csv module own the line terminators, so the header
    # and the data rows end the same way; utf-8 keeps the output independent
    # of the platform's default encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()
        # Write rows through the DictWriter (not f.write) so values containing
        # commas, quotes or newlines are quoted correctly.
        for key, value in top_100_htag.items():
            writer.writerow({'hashtag': key, 'count': value})
    print(output_file + " is ready.")

## Get Top 100 Hashtags CSV File Per Collection

In [7]:
for collection in sorted(dic_collection):

    print("-------------------")
    print("Processing on collection: " + collection)

    # Run the aggregation and accumulate per-hashtag totals. Each result's
    # `_id` is iterated as a list of hashtag entries carrying a "text" field.
    dic_hashtag = {}
    data = list(db_twitter[collection].aggregate(pipeline, allowDiskUse=True))
    for i, group in enumerate(data):
        for j in group["_id"]:
            h = j["text"]
            # Only hashtags made up solely of ASCII letters and digits are counted.
            if not re.match("^[a-zA-Z0-9]*$", h):
                continue
            dic_hashtag = get_dic(dic_hashtag, data, h, i)
    print("hashtag dictionary for collection " + collection + " is finished")

    # Rank hashtags by total count, highest first, and keep the first 100.
    ranked = sorted(dic_hashtag.items(), key=lambda x: x[1], reverse=True)
    top_100_htag = dict(ranked[:100])

    # Write one CSV per collection, named after the collection.
    output_file = collection + ".csv"
    write_csv(output_file, top_100_htag)
    print("-------------------")

-------------------
Processing on collection: Twitter_2017
hashtag dictionary for collection Twitter_2017 is finished
Twitter_2017.csv is ready.
-------------------
-------------------
Processing on collection: Twitter_2018
hashtag dictionary for collection Twitter_2018 is finished
Twitter_2018.csv is ready.
-------------------
-------------------
Processing on collection: Twitter_2019
hashtag dictionary for collection Twitter_2019 is finished
Twitter_2019.csv is ready.
-------------------
