In [1]:
import os
import sys
import gc
import warnings
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from multiprocessing import Pool as ProcessPool

warnings.filterwarnings('ignore')

In [103]:
def weekList(record, mode='%A'): # %w
    """Format the date of every visit entry and join the results with spaces.

    Parameters
    ----------
    record : iterable of str
        Visit entries whose first eight characters are a ``YYYYMMDD`` date.
    mode : str
        ``strftime`` format applied to each date. The default ``'%A'`` gives
        the full weekday name; ``'%w'`` gives the weekday as a number.

    Returns
    -------
    str
        One formatted token per entry, separated by single spaces (an empty
        string when ``record`` is empty).
    """
    labels = []
    for stamp in record:
        year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:8])
        labels.append(datetime(year, month, day).strftime(mode))
    return ' '.join(labels)

def monthList(record):
    """Return the two-character month field of every visit entry, space-joined.

    Each entry is expected to start with a ``YYYYMMDD`` date; characters 4-5
    (the month) are taken verbatim, so leading zeros are preserved.
    """
    month_tokens = (stamp[4:6] for stamp in record)
    return ' '.join(month_tokens)

def hourList(record):
    """Return the hour fields of every visit entry as one space-separated string.

    Everything from index 9 onward in each entry is treated as a
    ``'|'``-separated list of hours; the separators are turned into spaces and
    the per-entry strings are then joined with spaces as well.
    """
    per_entry = []
    for entry in record:
        hour_part = entry[9:]
        per_entry.append(' '.join(hour_part.split('|')))
    return ' '.join(per_entry)

def load_test(file_name):
    """Load one test-set visit file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``./test_visit/test_visit/``. The file is read
        as tab-separated text without a header row.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``record`` as read from the file, plus
        ``record_split``: the ``record`` string split on commas into a list.
    """
    # `sep` must be passed by keyword: pandas >= 2.0 rejects positional
    # arguments after the file path with a TypeError.
    df = pd.read_csv('./test_visit/test_visit/' + file_name, sep='\t',
                     names=['user_id', 'record'])
    df['record_split'] = df['record'].apply(lambda visits: visits.split(','))
    return df
def load_train(file_name):
    """Load one training-set visit file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``./train_visit/``. The file is read as
        tab-separated text without a header row.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``record`` as read from the file, plus
        ``record_split``: the ``record`` string split on commas into a list.
    """
    # `sep` must be passed by keyword: pandas >= 2.0 rejects positional
    # arguments after the file path with a TypeError.
    df = pd.read_csv('./train_visit/' + file_name, sep='\t',
                     names=['user_id', 'record'])
    df['record_split'] = df['record'].apply(lambda visits: visits.split(','))
    return df

In [104]:
def get_wmh(dataframe):
    """Collapse a visit DataFrame into weekday, month and hour token strings.

    ``dataframe.record_split`` must hold one list of visit entries per row.
    Each row is converted with ``weekList``, ``monthList`` and ``hourList``
    respectively, and the per-row strings are joined with single spaces.

    Returns
    -------
    tuple of (str, str, str)
        ``(weeks, months, hours)`` for the whole DataFrame.
    """
    visits = dataframe.record_split

    def _flatten(extractor):
        # One string per row, then one string for the whole frame.
        return ' '.join(visits.apply(extractor).tolist())

    week_tokens = _flatten(weekList)
    month_tokens = _flatten(monthList)
    hour_tokens = _flatten(hourList)
    return week_tokens, month_tokens, hour_tokens

In [105]:
# Load a single test sample by hand so get_wmh can be timed on one file.
# `sep` is passed by keyword because pandas >= 2.0 rejects positional
# arguments after the file path.
temp = pd.read_csv('./test_visit/test_visit/000000.txt', sep='\t', names=['user_id', 'record'])
temp['record_split'] = temp.record.apply(lambda x: x.split(','))

In [106]:
# Time get_wmh on the single sample frame `temp`; a, b, c receive the
# weekday, month and hour strings it returns.
%time a,b,c =get_wmh(temp)

CPU times: user 8.49 ms, sys: 0 ns, total: 8.49 ms
Wall time: 8.27 ms


In [39]:
#%time train_visit = visit_preprocess('./train_visit/', 'train', 20)  # 
#%time test_visit = visit_preprocess('./test_visit/test_visit/', 'test', 40)

In [110]:
def visit_preprocess(path, trainortest='train', processes=None):
    """Build one feature row per visit file found in ``path``.

    Parameters
    ----------
    path : str
        Directory whose file names define the samples. The files themselves
        are read by ``load_train`` / ``load_test``, which build paths from
        their own fixed directories, so ``path`` must name that same directory.
    trainortest : {'train', 'test'}
        ``'train'`` expects file names of the form ``<AreaID>_<label>.<ext>``
        and parses the integer label; ``'test'`` uses the file stem as the
        AreaID and sets the label to -1.
    processes : int, optional
        Number of worker processes used to load the files. Defaults to 20
        for ``'train'`` and 40 for ``'test'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``AreaID``, ``passenger_flow`` (row count of the sample's
        file), ``weeks``, ``months``, ``hours`` (space-separated token
        strings from ``get_wmh``) and ``CategoryID``.

    Raises
    ------
    ValueError
        If ``trainortest`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail early with a clear message; previously an unknown mode left `ids`
    # unbound and surfaced later as an UnboundLocalError.
    if trainortest not in ('train', 'test'):
        raise ValueError(
            "trainortest must be 'train' or 'test', got %r" % (trainortest,))
    is_train = trainortest == 'train'

    sample_names = sorted(os.listdir(path))
    stems = [name.split('.')[0] for name in sample_names]
    if is_train:
        ids = [stem.split('_')[0] for stem in stems]
        labels = [int(stem.split('_')[1]) for stem in stems]
    else:
        ids = stems
    df = pd.DataFrame({'AreaID': ids})

    # Load every sample file in parallel worker processes.
    if processes is None:
        processes = 20 if is_train else 40
    loader = load_train if is_train else load_test
    pool = ProcessPool(processes)
    try:
        records = pool.map(loader, sample_names)
    finally:
        # Always release the workers, even when a loader raises.
        pool.close()
        pool.join()
    del pool
    gc.collect()

    df['passenger_flow'] = [record.shape[0] for record in records]

    # Feature extraction runs sequentially in this process.
    weeks, months, hours = [], [], []
    for record in tqdm(records):
        ws, ms, hs = get_wmh(record)
        weeks.append(ws)
        months.append(ms)
        hours.append(hs)
    df['weeks'] = weeks
    df['months'] = months
    df['hours'] = hours

    # Test samples have no label; -1 marks them as unlabeled.
    df['CategoryID'] = labels if is_train else -1
    return df

In [113]:
# Build the training feature table from every file under ./train_visit/
# (long-running: the recorded run took about 39 minutes of wall time).
%time train_visit = visit_preprocess('./train_visit/', 'train')

100%|██████████| 40000/40000 [31:33<00:00, 21.13it/s]  


CPU times: user 35min 47s, sys: 2min 50s, total: 38min 37s
Wall time: 39min 1s


In [111]:
# Build the test feature table from every file under ./test_visit/test_visit/
# (the recorded run took about 9.5 minutes of wall time).
%time test_visit = visit_preprocess('./test_visit/test_visit/', trainortest='test')

100%|██████████| 10000/10000 [07:46<00:00, 21.46it/s]


CPU times: user 8min 44s, sys: 39.4 s, total: 9min 23s
Wall time: 9min 24s


In [115]:
# Persist both feature tables as CSV files in the working directory,
# leaving out the DataFrame index column.
train_visit.to_csv('./train_visit.csv', index=False)
test_visit.to_csv('./test_visit.csv', index=False)