In [None]:
import pandas as pd
import numpy as np
import glob
import os
import re
import configparser 

# Load the run settings from config.ini, looked up relative to the current
# working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini is missing
# or unreadable. Fail here with a clear message instead of an opaque KeyError
# on the first option lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found in " + os.getcwd())
# Folder that is scanned for the *.csv files to process.
input_folder = config['DEFAULT']['Input-Folder']
# Folder that receives the *_grouped.csv results.
out_folder = config['DEFAULT']['output-Folder']
# Language filter used by filter_row_lan: "1" English only, "2" no filter,
# "3" non-English only.
lan = config['DEFAULT']['Language']
# Switch used by filter_row_none: "1" delete rows holding "None", "2" keep them.
delete_none = config['DEFAULT']['Delete-None']


In [None]:
def create_folder(folder):
    """Make sure ``folder`` exists as a directory and return its path.

    Missing parent directories are created as well.

    Args:
        folder: Path of the directory to create.

    Returns:
        The ``folder`` argument, unchanged, so the call can be used inline.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate existence
    # check, which could race with another process creating the directory and
    # which silently accepted a regular file sitting at the same path.
    os.makedirs(folder, exist_ok=True)
    return folder

In [None]:
def get_collection(folder):
    """List the collections that already have a grouped CSV file in ``folder``.

    The collection name is the part of a CSV file name that precedes the
    first ``_grouped``. CSV files whose name does not contain ``_grouped``
    after at least one character are ignored.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list with one collection name per matching ``*.csv`` file, in the
        order returned by ``glob``.
    """
    list_collection = []
    for input_file in glob.glob(os.path.join(folder, '*.csv')):
        # Match against the file name only: the previous pattern required the
        # literal text "output/" in the path, so any other folder name or a
        # Windows path separator produced no match and crashed on .group(1).
        match = re.match(r'(.+?)_grouped', os.path.basename(input_file))
        if match:
            list_collection.append(match.group(1))
    return list_collection

In [None]:
def filter_row_lan(df, language=None):
    """Filter rows by whether ``hashtag`` and ``user_location`` are ASCII alphanumeric.

    A value counts as "English" when it consists only of the characters
    ``a-z``, ``A-Z`` and ``0-9`` (the empty string also matches). Missing
    values never count as English.

    Args:
        df: DataFrame with string columns ``hashtag`` and ``user_location``.
        language: Filter mode. ``"1"`` keeps rows where both columns are
            English, ``"3"`` keeps rows where neither column is English
            (rows with missing values are kept in this mode), any other value
            returns ``df`` unfiltered. Defaults to the module-level ``lan``
            setting when omitted.

    Returns:
        The filtered DataFrame (``df`` itself when no filter applies).
    """
    # Fall back to the configured setting only when the caller passed nothing,
    # so existing one-argument calls behave exactly as before.
    mode = lan if language is None else language
    if mode not in ("1", "3"):
        return df

    pattern = "^[a-zA-Z0-9]*$"
    # The columns are filtered one after the other; the second mask is
    # computed on the rows that survived the first.
    for column in ("hashtag", "user_location"):
        is_english = df[column].str.contains(pattern, na=False)
        df = df[is_english if mode == "1" else ~is_english]
    return df

In [None]:
def filter_row_none(df, delete_mode=None):
    """Optionally drop rows that have no ``hashtag`` or no ``user_location``.

    Args:
        df: DataFrame with columns ``hashtag`` and ``user_location``.
        delete_mode: ``"1"`` drops every row where either column is the
            string ``"None"`` or a missing value; any other value (``"2"`` in
            the config) returns ``df`` unchanged. Defaults to the module-level
            ``delete_none`` setting when omitted.

    Returns:
        The filtered DataFrame (``df`` itself when nothing is deleted).
    """
    # Fall back to the configured setting only when the caller passed nothing,
    # so existing one-argument calls keep working.
    mode = delete_none if delete_mode is None else delete_mode
    if mode != "1":
        return df

    for column in ("hashtag", "user_location"):
        values = df[column]
        # A missing value compares unequal to "None" and used to slip through.
        # pandas documents "None" among read_csv's default NA markers, so the
        # literal may already have been parsed as missing; treat both forms
        # as "none".
        df = df[values.notna() & (values != "None")]
    return df

In [None]:
# Resolve the input and output folders, creating them if they do not exist.
input_path = create_folder(input_folder)
out_path = create_folder(out_folder)

# Collections that already have a grouped file in the output folder.
list_collection = get_collection(out_path)

# Process every CSV file found in the input directory.
for input_file in glob.glob(os.path.join(input_path, '*.csv')):

    print("-----------------")

    # Take the collection name from the file name alone. Matching on the full
    # path with a literal "output/" prefix crashed on .group(1) for any other
    # folder name, a Windows separator, or a file without "_hashtag".
    name_match = re.match(r'(.+?)_hashtag', os.path.basename(input_file))
    collection_name = name_match.group(1) if name_match else None

    if collection_name is None:
        print("skipping file with unexpected name: " + input_file)

    elif collection_name in list_collection:
        print("existing file: " + input_file)

    else:
        print("processing on the file: " + input_file)

        # read csv file (sep must be passed by keyword: pandas 2.x no longer
        # accepts it positionally)
        df = pd.read_csv(input_file, sep=',', lineterminator='\n')
        # len() counts every row; count() would skip rows whose hashtag is
        # missing and under-report the total.
        total_row = len(df)

        # keep only the rows that pass the configured language filter
        df = filter_row_lan(df)
        row_fil_eng = len(df)

        # delete rows with "None" value, if configured
        df = filter_row_none(df)
        row_fil_none = len(df)

        del_row = total_row - row_fil_none

        print("Total number of rows: " + str(total_row))
        print("The number of rows deleted: " + str(del_row))

        # Convert the key columns to strings so that rows with missing values
        # are not lost when grouping (groupby drops NaN keys by default).
        # assign() builds a new frame, which avoids pandas'
        # SettingWithCopyWarning on the filtered slice.
        df = df.assign(
            hashtag=df['hashtag'].astype(str),
            user_location=df['user_location'].astype(str),
        )

        group_df = df.groupby(['hashtag', 'user_location']).size().reset_index(name='count')

        # os.path.join works whether or not the configured output folder ends
        # with a path separator; plain concatenation did not.
        file_name = os.path.join(out_path, collection_name + '_grouped.csv')
        group_df.to_csv(file_name, sep=',', index=False)

        print(file_name + " is done.")
    print("-----------------")