## Import Libraries

In [None]:
from multiprocessing import Pool
from functools import partial
import pandas as pd
import os
import re
import glob
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read Config File

In [None]:
import configparser

# Load runtime settings from config.ini (resolved against the current working
# directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of hitting a
# confusing KeyError on the first lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable")

# All settings are read from the [DEFAULT] section and kept as strings; they
# are converted where they are used (e.g. int(port), int(cpu_number)).
ip = config['DEFAULT']['IP']                          # MongoDB host
port = config['DEFAULT']['MongoDB-Port']              # MongoDB port
db_name = config['DEFAULT']['DB-Name']                # database to read from
contain_string = config['DEFAULT']['Contain-String']  # substring a collection name must contain
output_path = config['DEFAULT']['Output-Path']        # folder that receives the CSV files
cpu_number = config['DEFAULT']['CPU-Number']          # number of worker processes

## Connect MongoDB

In [None]:
from pymongo import MongoClient

# Open a client for the MongoDB server named in the config. The port is read
# from config.ini as text, hence the int() conversion.
client = MongoClient(host=ip, port=int(port))

## Get Collection Name

In [None]:
# Select the configured database and list the names of its collections.
# Database.collection_names() is deprecated since PyMongo 3.7 and removed in
# PyMongo 4.0; list_collection_names() (PyMongo >= 3.6) is its replacement.
db = client[db_name]
collections_twitter = db.list_collection_names()

In [None]:
# Map every collection whose name contains `contain_string` to its document
# count. The count is stored as a string, exactly as before, because it is
# later written verbatim into the CSV output.
# Cursor.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4.0;
# Collection.count_documents({}) is the supported way to count all documents.
dic_collection = {
    name: str(db[name].count_documents({}))
    for name in collections_twitter
    if contain_string in name
}

## Pipeline

In [None]:
# pipeline for aggregation
# Stage 1: keep only documents that have a non-empty entities.hashtags array.
_match_has_hashtags = {
    "$match": {"entities.hashtags": {"$exists": True, "$ne": []}}
}

# Stage 2: keep only documents whose lang field is "en".
_match_english = {"$match": {"lang": "en"}}

# Stage 3: count documents per (hashtags array, date slice, geoname).
# The date key is the 6-character slice of created_at starting at index 4
# (presumably the "Mon DD" part of the timestamp -- TODO confirm the format).
_group_by_hashtags_date_geoname = {
    "$group": {
        "_id": {
            "hashtags": "$entities.hashtags",
            "date": {"$substr": ["$created_at", 4, 6]},
            "geoname": "$geoname",
        },
        "count": {"$sum": 1},
    }
}

pipeline = [
    _match_has_hashtags,
    _match_english,
    _group_by_hashtags_date_geoname,
]

## Supporting Functions

In [None]:
# check if a string is English (i.e. contains only ASCII characters)
def isEnglish(s):
    """Return True when *s* can be encoded as ASCII, False otherwise.

    ASCII-encodability is used as the test for "English" text.
    """
    ascii_only = True
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        # At least one character falls outside the ASCII range.
        ascii_only = False
    return ascii_only

In [None]:
# create folder if it does not exist
def create_folder(output_path):
    """Create the directory *output_path*, including missing parents.

    Does nothing when the directory already exists. ``exist_ok=True`` replaces
    the former exists()-then-makedirs() sequence, which could raise if another
    process created the directory between the check and the creation.

    Raises:
        FileExistsError: if *output_path* exists but is not a directory.
    """
    os.makedirs(output_path, exist_ok=True)

In [None]:
# remove collections that already have an output CSV from dic_collection
def delete_collection(output_path,dic_collection):
    """Remove collections that already have a CSV file in *output_path*.

    Every ``<name>.csv`` found directly inside *output_path* is treated as the
    output of the collection ``<name>``; matching keys are deleted from
    *dic_collection* and a message is printed for each one.

    Args:
        output_path: folder to scan for ``*.csv`` files (with or without a
            trailing path separator).
        dic_collection: dict keyed by collection name; modified in place.

    Returns:
        The same *dic_collection* object, with the matching keys removed.
    """
    for input_file in glob.glob(os.path.join(output_path, '*.csv')):
        # Derive the name from the file name itself rather than from a regex
        # built out of output_path: that regex broke when the path lacked a
        # trailing separator or contained regex metacharacters.
        collection_name = os.path.splitext(os.path.basename(input_file))[0]
        if collection_name in dic_collection:
            print("Existed collection: " + collection_name)
            del dic_collection[collection_name]
    return dic_collection

In [None]:
# create data list
def create_list(data,year):
    """Expand one aggregation result into one row per ASCII hashtag.

    Args:
        data: one document produced by the aggregation pipeline. Its ``_id``
            holds ``hashtags`` (a list of dicts with a ``text`` key), ``date``
            (a string) and optionally ``geoname`` (a dict that may carry
            ``city``, ``state`` and ``country``); ``count`` holds the number
            of documents in the group.
        year: year string appended to the date, separated by a space.

    Returns:
        A tuple ``(rows, num_delete)``. ``rows`` is a list of
        ``[hashtag, date_year, count, city, state, country]`` lists, one per
        hashtag accepted by ``isEnglish``; ``num_delete`` is the number of
        hashtags that were rejected.
    """
    group_key = data["_id"]
    date_year = group_key["date"] + " " + year

    # A missing or null geoname, or one lacking a field, falls back to empty
    # strings instead of raising.
    geoname = group_key.get("geoname") or {}
    city = geoname.get("city", '')
    state = geoname.get("state", '')
    country = geoname.get("country", '')

    count = data["count"]
    data_format = []
    num_delete = 0
    for tag in group_key["hashtags"]:
        # Hashtags are compared case-insensitively, so store them lower-cased.
        h = tag["text"].lower()
        if isEnglish(h):
            data_format.append([h, date_year, count, city, state, country])
        else:
            num_delete += 1
    return data_format,num_delete

## Count the number of hashtags per day

In [None]:
# Make sure the output folder exists before any CSV is written.
create_folder(output_path)

# Skip collections that already have a CSV in the output folder.
dic_collection = delete_collection(output_path,dic_collection)

column_name = ['hashtag','date','hashtag_count','city','state','country']

for collection in sorted(dic_collection):
    print("-----------------------\n")
    print("Processing on collection: " + collection)
    start = time.time()

    data_format = []
    num_delete = []
    total_tweet_count = dic_collection[collection]
    data_list = list(db[collection].aggregate(pipeline,allowDiskUse=True))

    # The first four characters of the collection name are used as the year,
    # and the text after "_W" in its second "_"-separated part as the week
    # number (presumably names look like "<YYYY>_W<week>_..." -- TODO confirm).
    y = collection[:4]
    y_week = "_".join(collection.split("_", 2)[:2])
    week = y_week.split("_W")[1]

    if data_list:
        print("Datalist length",len(data_list))
        print("Number of using CPU: " + cpu_number)
        worker_count = int(cpu_number)
        creat_l = partial(create_list, year=y)
        # Spread the work over all workers: a fixed chunk of 200000 would hand
        # any smaller result set to a single process. 200000 stays the cap.
        chunksize = max(1, min(200000, -(-len(data_list) // worker_count)))
        # The context manager shuts the workers down after each collection;
        # previously every iteration leaked a pool of processes. The results
        # are fully consumed by zip() before the pool is torn down.
        with Pool(processes=worker_count) as pool:
            data_format,num_delete = zip(*pool.imap(creat_l,data_list, chunksize=chunksize))

    # Flatten the per-document row lists into one list of rows.
    data_result = [row for rows in data_format for row in rows]
    non_english_total = sum(num_delete)

    print("list is finished")
    print(str(non_english_total) + " non-English hashtags have been deleted.")

    df = pd.DataFrame(data_result,columns=column_name)

    # Convert the key columns to strings so that no rows are lost when
    # grouping; the count column must be an integer for the sum.
    df = df.astype({
        'hashtag': str,
        'date': str,
        'city': str,
        'state': str,
        'country': str,
        'hashtag_count': int,
    })

    # Merge identical (hashtag, date, city, state, country) rows by summing
    # their counts, then attach the per-collection totals to every row.
    group_df = df.groupby(['hashtag','date','city','state','country'])['hashtag_count'].sum().reset_index()
    group_df["non_english_hashtag_count"] = non_english_total
    group_df["total_tweet_count"] = total_tweet_count
    group_df["week_number"] = week

    # Build the path with os.path.join so the file lands inside output_path
    # whether or not the configured path ends with a separator; this matches
    # how delete_collection looks for existing CSV files.
    csv_path = os.path.join(output_path, collection + ".csv")
    group_df.to_csv(csv_path, sep=',',index = False, encoding='UTF-8')

    print ("csv file for collection " + collection + ' is finished.')
    end = time.time()
    print("Time used: " + str(end-start))
    print("-----------------------\n")