## Import Libraries

In [1]:
import os
import glob
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read Config File

In [None]:
import configparser

# Load the notebook settings from the DEFAULT section of config.ini.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; check it so a missing config fails with a clear
# message instead of an opaque KeyError on the first lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")
defaults = config['DEFAULT']
ip = defaults['IP']                          # MongoDB host
port = defaults['MongoDB-Port']              # MongoDB port (string; cast to int on connect)
db_name = defaults['DB-Name']                # database to read collections from
contain_string = defaults['Contain-String']  # substring a collection name must contain
output_path = defaults['Output-Path']        # directory the per-collection CSVs go to

## Connect MongoDB

In [None]:
from pymongo import MongoClient

# Open a client against the host/port read from the config file. The port is
# read as text, so it is converted to an integer here.
client = MongoClient(host=ip, port=int(port))

## Get Collection Names

In [2]:
# Select the configured database and list the names of its collections.
# list_collection_names() replaces Database.collection_names(), which was
# deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
db_twitter = client[db_name]
collections_twitter = db_twitter.list_collection_names()

In [3]:
# Map every collection whose name contains `contain_string` to its document
# count. The count is stored as a string, exactly as before.
dic_collection = {}
for collection_name in collections_twitter:
    if contain_string in collection_name:
        # count_documents({}) replaces find({}).count(); Cursor.count() was
        # deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
        dic_collection[collection_name] = str(db_twitter[collection_name].count_documents({}))

# Report the selected collections in name order.
for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

Twitter_2014: 3811
Twitter_2015: 19659
Twitter_2016: 25764
Twitter_2017: 25153
Twitter_2018: 25456
Twitter_2019: 7997


## Pipeline

In [4]:
# Aggregation pipeline run against each selected collection.
# Each output document has the form
#   {"_id": {"hashtags": [...], "date": "<6 chars>"}, "count": <int>}
pipeline = [
    # Keep only documents that have a non-empty entities.hashtags array.
    {"$match": { "entities.hashtags": {"$exists":True,"$ne":[]}}},
    # Keep only documents whose lang field is "en".
    {"$match": { "lang" : "en"}},
    # Group by the full hashtags array plus a 6-character slice of created_at
    # starting at offset 4, and count the documents in each group.
    # NOTE(review): the slice is presumably the "Mon DD" part of the
    # created_at string -- confirm against the stored date format.
    { "$group": {
        "_id": {
            "hashtags": "$entities.hashtags",
            "date": {"$substr": [ "$created_at", 4, 6 ]}
        },
        "count": { "$sum": 1 },
        }
    }
]

## Supporting Functions

In [5]:
# create the output folder when it is missing
def create_folder(output_path):
    """Create ``output_path`` (including parent directories) if nothing exists there.

    Does nothing when the path already exists.
    """
    if os.path.exists(output_path):
        return
    os.makedirs(output_path)

In [6]:
# drop collections that already have a CSV in output_path from dic_collection
def delete_collection(output_path,dic_collection):
    """Remove already-exported collections from ``dic_collection``.

    Every ``*.csv`` file directly inside ``output_path`` is treated as the
    export of the collection named like the file (without the extension).
    Matching keys are deleted from ``dic_collection`` in place and a message
    is printed for each one.

    Args:
        output_path: directory holding the exported CSV files.
        dic_collection: dict keyed by collection name; mutated in place.

    Returns:
        The same ``dic_collection`` object, after the deletions.
    """
    # glob.escape keeps glob metacharacters in the directory name literal.
    pattern = os.path.join(glob.escape(output_path), '*.csv')
    for input_file in glob.glob(pattern):
        # Derive the name from the path itself rather than from a regex built
        # out of output_path: that regex broke when output_path had no trailing
        # separator or contained regex metacharacters.
        collection_name = os.path.splitext(os.path.basename(input_file))[0]
        if collection_name in dic_collection:
            print("Existed collection: " + collection_name)
            del dic_collection[collection_name]
    return dic_collection

In [7]:
def check_exist(data_format,h,date_year,exist,dic,count):
    """Add ``count`` to the record for (``h``, ``date_year``), or append ``dic``.

    Args:
        data_format: list of ``{"hashtag", "date", "count"}`` dicts; mutated in place.
        h: hashtag text to look up.
        date_year: date string to look up.
        exist: initial "found" flag; ``dic`` is appended only if it is still 0
            after the search.
        dic: record to append when no existing record matches.
        count: amount added to the matching record's ``"count"``.

    Returns:
        The same ``data_format`` list.
    """
    for d in data_format:
        # Compare for equality: a substring test would merge e.g. "cat" into
        # "category" and never record "cat" itself.
        if d["hashtag"] == h and d["date"] == date_year:
            d["count"] += count
            exist = 1
            # At most one record is kept per (hashtag, date) pair.
            break
    if exist == 0:
        data_format.append(dic)
    return data_format

In [8]:
# fold one aggregated group's hashtags into the per-day hashtag counts
def count_hashtag(length,hashtags,date_year,count,data_format):
    """Merge the hashtags of one aggregated group into ``data_format``.

    Args:
        length: number of leading entries of ``hashtags`` to process.
        hashtags: sequence of dicts, each with a ``"text"`` key.
        date_year: date string stored on every record created here.
        count: number of documents in the group; added once per hashtag.
        data_format: list of ``{"hashtag", "date", "count"}`` dicts; mutated in place.

    Returns:
        ``(data_format, num_delete)`` where ``num_delete`` is how many of the
        processed hashtags were skipped by the character filter.
    """
    num_delete = 0
    # range(length), not range(0, length-1): the old bound skipped the last
    # hashtag, so single-hashtag groups contributed nothing at all.
    for i in range(length):
        exist = 0
        # hashtags are counted case-insensitively
        h = hashtags[i]["text"].lower()
        # keep only hashtags made of ASCII letters and digits
        if(re.match("^[a-zA-Z0-9]*$",h)):
            dic = {"hashtag": h, "date":date_year, "count" : count}
            if len(data_format) > 0 :
                # bump the existing record or append a new one
                data_format = check_exist(data_format,h,date_year,exist,dic,count)
            else:
                data_format.append(dic)
        else:
            num_delete += 1
    return data_format, num_delete

In [9]:
# build the per-day hashtag records for one collection
def create_list(data_list,data_format,year):
    """Fold every aggregated group in ``data_list`` into ``data_format``.

    Args:
        data_list: aggregation results; each item has ``_id.hashtags``,
            ``_id.date`` and ``count``.
        data_format: list of ``{"hashtag", "date", "count"}`` dicts; mutated in place.
        year: string appended to each group's date (separated by a space).

    Returns:
        ``(data_format, num_delete)`` where ``num_delete`` is the total number
        of hashtags skipped across all groups.
    """
    # Accumulate across groups. Previously only the last group's value was
    # returned, and an empty data_list left the name unbound.
    num_delete = 0
    for data in data_list:
        hashtags = data["_id"]["hashtags"]
        date_year = data["_id"]["date"] + " " + year
        count = data["count"]
        length = len(hashtags)
        data_format,skipped = count_hashtag(length,hashtags,date_year,count,data_format)
        num_delete += skipped
    return data_format,num_delete

In [10]:
# write one collection's hashtag counts to <output_path>/<collection>.csv
def write_csv(collection,data_format,output_path):
    """Write ``data_format`` as a CSV file named after ``collection``.

    Args:
        collection: collection name; the file is ``<collection>.csv``.
        data_format: list of dicts with ``"hashtag"``, ``"date"`` and ``"count"``.
        output_path: directory the file is written into.
    """
    # os.path.join works with or without a trailing separator on output_path
    # and matches how delete_collection looks the files up again.
    file_name = os.path.join(output_path, collection + ".csv")
    with open(file_name, 'w') as f:
        # header
        f.write('hashtag,date,count\n')
        for data in data_format:
            line = data['hashtag'] + ',' + data['date'] + ',' + str(data["count"]) + '\n'
            f.write(line)

## Count Hashtags per Day

In [11]:
# make sure the output folder exists
create_folder(output_path)

# skip collections that already have a CSV in the output folder
dic_collection = delete_collection(output_path,dic_collection)

for collection in sorted(dic_collection):
    print("-----------------------\n")
    print("Processing on collection: " + collection)

    data_format = []
    # Reset per collection: if the aggregation returns nothing, the report
    # below would otherwise hit an unbound name or reuse the previous
    # collection's value.
    num_delete = 0
    data_list = list(db_twitter[collection].aggregate(pipeline,allowDiskUse=True))
    # the last four characters of the collection name are used as the year
    year = collection[-4:]

    if len(data_list) > 0:
        data_format, num_delete = create_list(data_list,data_format,year)

    print("hashtag and date list is finished")
    print(str(num_delete) + " non-English hashtags have been deleted.")

    write_csv(collection,data_format,output_path)

    print ("hashtag csv file for collection " + collection + ' is finished.')
    print("-----------------------\n")

Existed collection: Twitter_2017
Existed collection: Twitter_2016
Existed collection: Twitter_2014
Existed collection: Twitter_2015
Existed collection: Twitter_2018
Existed collection: Twitter_2019
