In [1]:
import os
import glob
import subprocess
import pandas as pd

In [2]:
# Directory the per-category "time_per_<category>_stats.csv" inputs are read from.
DATA_ROOT = "../data/"
# Directory holding the category list file and receiving the combined output CSV.
SAVE_ROOT = "../processed_data/"

In [3]:
def get_raw_list():
    """Return the raw lines of the category list file.

    The file ``list_category_8.txt`` is looked up inside ``SAVE_ROOT``.

    Returns
    -------
    list of str
        One entry per line of the file, in file order. Each entry keeps its
        trailing newline, so callers have to strip it themselves.

    Raises
    ------
    OSError
        If the list file cannot be opened.
    """
    # os.path.join works whether or not SAVE_ROOT ends with a path separator;
    # plain string concatenation silently produced a wrong path without one.
    file_path = os.path.join(SAVE_ROOT, "list_category_8.txt")
    with open(file_path, 'r') as f:
        return f.readlines()

In [4]:
# Raw lines of the category list file; each entry still carries its newline.
list_category = get_raw_list()

In [5]:
import pycountry_convert as pc

# Continent code -> label written to the output table. Note that 'OC' is
# labelled 'Australia'. Codes without an entry here end up with the
# "unknown" label returned by get_continent.
_CONTINENT_LABELS = {
    'NA': 'North America',
    'EU': 'Europe',
    'SA': 'South America',
    'AS': 'Asia',
    'OC': 'Australia',
    'AF': 'Africa',
}


def get_continent(country):
    """Map a country name to a continent label.

    Parameters
    ----------
    country : str
        Country name, resolved through ``pycountry_convert`` (``pc``).
        ``'Taiwan'`` is special-cased to ``'Asia'`` without a lookup.

    Returns
    -------
    str
        One of the labels in ``_CONTINENT_LABELS``, or a single space
        (``' '``) when the name cannot be resolved to a known continent.
    """
    if country == 'Taiwan':
        return 'Asia'

    try:
        country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
        return _CONTINENT_LABELS[pc.country_alpha2_to_continent_code(country_code)]
    except Exception:
        # Best-effort lookup: any lookup failure means "unknown continent".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long .apply() can still be interrupted.
        return ' '

In [6]:
def read_file(filename):
    """Summarise one per-drawing stats CSV into one row per country.

    Parameters
    ----------
    filename : str
        Path of the input CSV. It must provide the columns ``word``,
        ``countrycode``, ``stroke_count`` and ``drawing_time_total``
        (the latter is divided by 1000, i.e. treated as milliseconds).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``country``, ``stroke_count`` (mean, 1 decimal),
        ``drawing_time`` (mean of the converted times, 2 decimals), ``count``
        (rows per country), ``word`` (the ``word`` value of the first input
        row) and ``continent`` (via ``get_continent``). Rows whose continent
        could not be resolved are kept.
    """
    raw = pd.read_csv(filename)

    # Keep the three measures of interest, rename them to the output names,
    # convert the time and add a unit counter for the per-country tally.
    per_drawing = (
        raw[['stroke_count', 'countrycode', 'drawing_time_total']]
        .rename(columns={'countrycode': 'country',
                         'drawing_time_total': 'drawing_time'})
        .assign(drawing_time=lambda d: d['drawing_time'] / 1000, count=1)
    )

    how = {'stroke_count': 'mean', 'drawing_time': 'mean', 'count': 'sum'}
    decimals = {'stroke_count': 1, 'drawing_time': 2}
    # reset_index turns the 'country' group key back into a regular column.
    summary = per_drawing.groupby('country').agg(how).round(decimals).reset_index()

    # Every row of one input file is tagged with the word of its first row.
    summary['word'] = raw['word'][0]
    summary['continent'] = summary['country'].apply(get_continent)
    return summary

In [7]:
# Build one per-country summary frame for every category in list_category.
frames = []
for path in list_category:
    # Entries are raw file lines: the category name is the last
    # '/'-separated piece, minus surrounding whitespace/newline.
    category = path.split('/')[-1].strip()
    if not category:
        # A blank line would resolve to 'time_per__stats.csv' and abort the
        # whole run with a file-not-found error, so skip it.
        continue
    raw_id = 'time_per_' + category.replace(' ', '_') + '_stats.csv'
    local_path = os.path.join(DATA_ROOT, raw_id)
    df = read_file(local_path)
    frames.append(df)

In [8]:
# Stack the per-category frames row-wise under a fresh 0..n-1 index.
df_concat = pd.concat(frames, ignore_index=True)

In [9]:
# Preview only the first rows; rendering the whole combined table floods the notebook.
df_concat.head(10)

Unnamed: 0,country,stroke_count,drawing_time,count,word,continent
0,AN,1.0,1.26,1,circle,
1,Albania,1.4,2.48,34,circle,Europe
2,Algeria,1.5,2.42,165,circle,Africa
3,Angola,1.0,0.96,1,circle,Africa
4,Anguilla,1.0,1.65,1,circle,North America
5,Antigua and Barbuda,1.0,0.74,1,circle,North America
6,Argentina,1.2,2.36,278,circle,South America
7,Armenia,1.8,3.03,39,circle,Asia
8,Aruba,1.2,1.86,4,circle,North America
9,Australia,1.3,1.93,2752,circle,Australia


In [10]:
# Persist the combined table. os.path.join keeps the path valid whether or not
# SAVE_ROOT ends with a path separator.
df_concat.to_csv(os.path.join(SAVE_ROOT, 'time_and_stroke_continent.csv'), index=False)