## Import Libraries

In [1]:
import csv
import glob
import os
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read Config File

In [2]:
import configparser

# Load the run settings from config.ini in the current working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini is missing.
# Fail here with a clear message instead of a confusing KeyError further down.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")

# All settings are read from the DEFAULT section and are plain strings.
ip = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']
db_name = config['DEFAULT']['DB-Name']
contain_string = config['DEFAULT']['Contain-String']
output_path = config['DEFAULT']['Output-Path']

## Connect MongoDB

In [3]:
from pymongo import MongoClient

# The port comes out of config.ini as text, so convert it to an integer here.
client = MongoClient(host=ip, port=int(port))

## Get Collection Names

In [4]:
# connect to database
db = client[db_name]
# list_collection_names() is the supported replacement for collection_names(),
# which PyMongo deprecated and later removed.
collections_twitter = db.list_collection_names()

In [5]:
# Map every collection whose name contains contain_string to its document
# count. The count is stored as a string, as before.
# count_documents({}) replaces cursor.count(), which PyMongo deprecated and
# later removed.
dic_collection = {
    name: str(db[name].count_documents({}))
    for name in collections_twitter
    if contain_string in name
}

## Pipeline

In [6]:
# pipeline for aggregation
pipeline = [
    # keep documents whose entities.hashtags field exists and is not the empty array
    {"$match": {"entities.hashtags": {"$exists": True, "$ne": []}}},
    # keep documents whose lang field is "en"
    {"$match": {"lang": "en"}},
    # one group per distinct (hashtags, date, geoname) combination; count its documents
    {"$group": {
        "_id": {
            "hashtags": "$entities.hashtags",
            # 6 characters of created_at starting at offset 4
            # (presumably the "Mon DD" part of the tweet timestamp - TODO confirm)
            "date": {"$substr": ["$created_at", 4, 6]},
            "geoname": "$geoname",
        },
        "count": {"$sum": 1},
    }},
]

## Supporting Functions

In [7]:
# check if string is English (approximated as: contains only ASCII characters)
def isEnglish(s):
    """Return True when ``s`` can be encoded as ASCII, False otherwise.

    Only ``UnicodeEncodeError`` is handled; any other error raised by
    ``s.encode`` propagates to the caller.
    """
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        # at least one character falls outside the ASCII range
        return False
    return True

In [8]:
# create folder if it does not exist
def create_folder(output_path):
    """Create ``output_path`` (including missing parent folders) if needed.

    ``exist_ok=True`` makes the call a no-op when the folder is already there,
    without the separate exists-check that could race with another process
    creating the same folder.

    Raises:
        FileExistsError: if ``output_path`` exists but is not a directory.
    """
    os.makedirs(output_path, exist_ok=True)

In [9]:
# remove collections that already have an output CSV from dic_collection
def delete_collection(output_path,dic_collection):
    """Drop every collection that already has a ``<name>.csv`` in ``output_path``.

    Args:
        output_path: folder that is searched (non-recursively) for ``*.csv`` files.
        dic_collection: dict keyed by collection name; it is modified in place.

    Returns:
        The same ``dic_collection`` object, without the collections whose CSV
        file already exists.
    """
    for input_file in glob.glob(os.path.join(output_path,'*.csv')):
        # Take the name straight from the file name. Building a regex from
        # output_path breaks on paths containing regex metacharacters, on a
        # missing trailing separator, and on names that contain "csv".
        collection_name = os.path.splitext(os.path.basename(input_file))[0]
        if collection_name in dic_collection:
            print("Existed collection: " + collection_name)
            del dic_collection[collection_name]
    return dic_collection

In [10]:
def check_exist(data_format,h,date_year,city,state,country,exist,dic,count):
    """Add ``count`` to the matching record in ``data_format`` or append ``dic``.

    A record matches only when its hashtag, date, city, state and country are
    all *equal* to the given values. Substring tests would merge "cat" into
    "category", and an empty city/state/country would match every record.

    Args:
        data_format: list of record dicts; modified in place.
        h, date_year, city, state, country: key values identifying the record.
        exist: initial "already found" flag; ``dic`` is appended only if this
            is still 0 after the search.
        dic: record appended when no match is found.
        count: amount added to the matching record's ``"count"``.

    Returns:
        The same ``data_format`` list.
    """
    for d in data_format:
        if (d["hashtag"] == h and d["date"] == date_year and d["city"] == city
                and d["state"] == state and d["country"] == country):
            d["count"] += count
            exist = 1
            # stop at the first match so the count is added exactly once
            break
    if exist == 0:
        data_format.append(dic)
    return data_format

In [11]:
# count hashtag daily for each collection
def count_hashtag(length,hashtags,date_year,city,state,country,count,data_format,num_delete):
    """Fold the first ``length`` entries of ``hashtags`` into ``data_format``.

    Each hashtag text is lower-cased. A text accepted by ``isEnglish`` becomes
    a record that is appended directly when ``data_format`` is empty, and is
    otherwise handed to ``check_exist``. Every rejected hashtag increments
    ``num_delete``.

    Returns:
        Tuple ``(data_format, num_delete)``.
    """
    for position in range(length):
        tag = hashtags[position]["text"].lower()

        if not isEnglish(tag):
            # rejected hashtags are only counted, never stored
            num_delete += 1
            continue

        record = {"hashtag": tag, "date":date_year, "count":count, "city":city, "state":state, "country":country}
        if data_format:
            # merge into an existing record or append a new one
            data_format = check_exist(data_format,tag,date_year,city,state,country,0,record,count)
        else:
            data_format.append(record)
    return data_format, num_delete

In [12]:
# create data list
def create_list(data_list,data_format,year):
    """Turn aggregation results into per-hashtag records.

    For every item of ``data_list`` the group key under ``"_id"`` supplies the
    hashtags, the date (with ``year`` appended after a space) and, when a
    ``"geoname"`` entry is present, city/state/country; otherwise those three
    are empty strings. The item's ``"count"`` is then distributed over its
    hashtags by ``count_hashtag``.

    Returns:
        Tuple ``(data_format, num_delete)`` where ``num_delete`` is the running
        total reported by ``count_hashtag``.
    """
    num_delete = 0
    for item in data_list:
        key = item["_id"]
        hashtags = key["hashtags"]
        date_year = key["date"] + " " + year

        if "geoname" in key:
            geoname = key["geoname"]
            city = geoname["city"]
            state = geoname["state"]
            country = geoname["country"]
        else:
            # no location attached to this group
            city = state = country = ''

        count = item["count"]
        data_format,num_delete = count_hashtag(
            len(hashtags),hashtags,date_year,city,state,country,count,data_format,num_delete
        )
    return data_format,num_delete

In [13]:
# write csv file
def write_csv(collection,data_format,output_path,num_delete,total_tweet_count):
    """Write the hashtag records of one collection to ``<output_path>/<collection>.csv``.

    Args:
        collection: collection name, used as the file name (without extension).
        data_format: list of dicts with the keys ``hashtag``, ``date``,
            ``count``, ``city``, ``state`` and ``country``.
        output_path: destination folder, with or without a trailing separator.
        num_delete: value repeated in the ``non_english_hashtag_count`` column.
        total_tweet_count: value repeated in the ``total_tweet_count`` column.

    Notes:
        Rows are written through ``csv.writer`` so that a field containing a
        comma, quote or newline is quoted instead of silently shifting the
        columns. ``None`` values are written as empty fields.
    """
    # os.path.join keeps this consistent with the folder that is created and
    # globbed elsewhere, even when output_path has no trailing separator.
    file_name = os.path.join(output_path, collection + ".csv")
    # newline='' lets the csv module control line endings; utf-8 keeps
    # non-ASCII place names writable regardless of the platform default.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        # header
        writer.writerow(['hashtag', 'date', 'hashtag_count', 'city', 'state', 'country',
                         'non_english_hashtag_count', 'total_tweet_count'])
        for data in data_format:
            writer.writerow([data['hashtag'], data['date'], data["count"], data["city"],
                             data["state"], data["country"], num_delete, total_tweet_count])
## Count the Number of Hashtags per Day

In [14]:
# create folder if it does not exist
create_folder(output_path)

# skip collections that already have a CSV file in the output folder
dic_collection = delete_collection(output_path,dic_collection)

for collection in sorted(dic_collection):
    print("-----------------------\n")
    print("Processing on collection: " + collection)

    # Use the document count gathered for this collection when dic_collection
    # was built, so the total_tweet_count column holds the real value rather
    # than a fixed placeholder number.
    total_tweet_count = dic_collection[collection]
    data_list = list(db[collection].aggregate(pipeline,allowDiskUse=True))
    # the first four characters of the collection name are used as the year,
    # e.g. "2018" for "2018_W52_Twitter_Australia"
    year = collection[:4]

    # an empty data_list simply yields an empty record list and a zero count
    data_format, num_delete = create_list(data_list,[],year)

    print("hashtag and date list is finished")
    print(str(num_delete) + " non-English hashtags have been deleted.")

    write_csv(collection,data_format,output_path,num_delete,total_tweet_count)

    print ("hashtag csv file for collection " + collection + ' is finished.')
    print("-----------------------\n")

-----------------------

Processing on collection: 2018_W52_Twitter_Australia
hashtag and date list is finished
194 non-English hashtags have been deleted.
hashtag csv file for collection 2018_W52_Twitter_Australia is finished.
-----------------------

