In [1]:
import time
import sys
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split, KFold
import multiprocessing
import os
from tqdm import tqdm



In [4]:
def is_number(s):
    """Return True if ``s`` can be interpreted as a number, otherwise False.

    Two checks are tried in order:

    1. ``float(s)`` succeeds (ints, floats, numeric strings such as "1.5").
    2. ``unicodedata.numeric(s)`` succeeds (a single Unicode character that
       carries a numeric value, e.g. a vulgar fraction).

    Values that neither check can handle -- including ``None`` and other
    non-string, non-numeric objects -- yield False instead of raising.

    NOTE: a True result from the second check does not guarantee that
    ``float(s)`` will succeed on the same value.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: e.g. None or a list.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character; ValueError: no numeric value.
        pass

    return False

In [5]:
# Exploratory version of the weather clean-up: load the raw weather table,
# replace every non-numeric cell of the measurement columns with a neighbouring
# value, convert the cells to float and display the resulting frame.
weather_file = pd.read_csv("Data/weather.csv", sep=',')
weather_data = weather_file.drop('codesum', axis = 1) # drop the codesum column
    
data = weather_data.values[:,2:] # everything after the first two columns (station_nbr and date in the displayed frame), as an array
# Cells that is_number() accepts are converted to float as they are. A cell it
# rejects is first overwritten with the value from the previous row of the same
# column; for the first row there is no previous row, so the first numeric
# value further down the column is used instead.
for col in range(data.shape[1]):
    for row in range(data.shape[0]):
        if not is_number(data[row, col]):
            if row == 0:
                # Scan downwards for the first numeric cell in this column.
                # NOTE(review): the scan has no upper bound, so a column without
                # any numeric cell would index past the end of the array.
                i = 1
                while not is_number(data[row+i, col]):
                    i += 1
                data[row, col] = data[row+i, col]
            else:
                # The previous row has already been processed, so it holds a float.
                data[row, col] = data[row-1, col]
        data[row, col] = float(data[row, col]) # Convert string to float value

# Put the first two (untouched) columns back in front of the cleaned values.
weather_processed = np.append(weather_file.values[:,0:2], data, axis = 1)
df = pd.DataFrame(weather_processed, index=None, columns=weather_data.columns.values)
    
# Last expression of the cell: shows the cleaned frame as the cell output.
df

Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,2012-01-01,52,31,42,16,36,40,23,0,716,1626,0,0.05,29.78,29.92,3.6,20,4.6
1,2,2012-01-01,48,33,41,16,37,39,24,0,716,1626,0,0.07,28.82,29.91,9.1,23,11.3
2,3,2012-01-01,55,34,45,9,24,36,20,0,735,1720,0,0,29.77,30.47,9.9,31,10
3,4,2012-01-01,63,47,55,4,28,43,10,0,728,1742,0,0,29.79,30.48,8,35,8.2
4,6,2012-01-01,63,34,49,0,31,43,16,0,727,1742,0,0,29.95,30.47,14,36,13.8
5,7,2012-01-01,50,33,42,0,26,35,23,0,727,1742,0,0,29.15,30.54,10.3,32,10.2
6,8,2012-01-01,66,45,42,0,34,46,23,0,727,1742,0,0,30.05,30.54,11,36,10.9
7,9,2012-01-01,34,19,27,0,17,23,38,0,727,1742,0,0,29.34,30.09,22.8,30,22.5
8,10,2012-01-01,73,53,63,0,55,58,2,0,723,1738,0,0,30.16,30.19,5.1,24,5.5
9,11,2012-01-01,72,48,60,7,54,56,5,0,724,1737,0,0,30.15,30.18,4.6,23,4.8


In [6]:
def process_weather():
    """Clean ``Data/weather.csv`` and write ``Data/weather_processed.csv``.

    Steps:
      * read the raw weather table and drop the ``codesum`` column;
      * in every column after the first two, overwrite each cell that
        ``is_number`` rejects with the value of the previous row (for the
        first row: with the first numeric value further down the column);
      * convert every cell of those columns to ``float``;
      * write the first two columns plus the cleaned columns back out under
        the original column labels.

    Returns:
        None. The result is only written to disk.

    Raises:
        ValueError: if the first row of a column is non-numeric and no later
            row of that column holds a numeric value to copy from (the
            unbounded scan previously ran off the end of the array with an
            IndexError).
    """
    weather_file = pd.read_csv("Data/weather.csv", sep=',')
    weather_data = weather_file.drop('codesum', axis = 1)  # drop the codesum column

    # Everything after the first two columns, as an array that is edited in place.
    data = weather_data.values[:,2:]
    n_rows, n_cols = data.shape

    for col in range(n_cols):
        for row in range(n_rows):
            if not is_number(data[row, col]):
                if row == 0:
                    # No previous row to copy from: take the first numeric
                    # value found further down the same column.
                    donor = next(
                        (r for r in range(1, n_rows) if is_number(data[r, col])),
                        None,
                    )
                    if donor is None:
                        raise ValueError(
                            "column %r contains no numeric value to fill from"
                            % (weather_data.columns.values[col + 2],)
                        )
                    data[row, col] = data[donor, col]
                else:
                    # The previous row was already cleaned, so it holds a float.
                    data[row, col] = data[row-1, col]
            data[row, col] = float(data[row, col])  # Convert string to float value

    # Re-attach the two leading columns that were left untouched.
    weather_processed = np.append(weather_file.values[:,0:2], data, axis = 1)
    df = pd.DataFrame(weather_processed, index=None, columns=weather_data.columns.values)
    df.to_csv("Data/weather_processed.csv", sep=',', index=None)

In [7]:
# Load the key table as a lookup frame: first CSV column -> second CSV column
def match_store_station():
    """Read ``Data/key.csv`` and return it as a one-column lookup DataFrame.

    The first CSV column becomes the index and the second CSV column becomes
    the only data column. ``get_weather_data`` looks a store number up in this
    frame to obtain the matching station number.
    """
    key_table = pd.read_csv("Data/key.csv", sep=',').values
    lookup_keys = key_table[:, 0]
    lookup_values = key_table[:, 1]
    return pd.DataFrame(lookup_values, index=lookup_keys)

In [8]:
def read_processed_weather():
    """Load ``Data/weather_processed.csv`` indexed by its first two columns.

    The returned DataFrame has a two-level index built from the second CSV
    column (outer level) and the first CSV column (inner level); all remaining
    CSV columns are kept as data columns under their original labels.
    ``get_weather_data`` addresses rows of this frame as ``(date, station_nbr)``.
    """
    stored = pd.read_csv("Data/weather_processed.csv", sep = ',')
    table = stored.values

    # Outer level: second column, inner level: first column.
    outer_level = table[:, 1]
    inner_level = table[:, 0]

    feature_labels = stored.columns.values[2:]
    return pd.DataFrame(table[:, 2:], index=[outer_level, inner_level], columns=feature_labels)

In [9]:
def get_weather_data(date, store_nbr, store_station_pairs, weather_file):
    """Return the weather values recorded for a store on a given date.

    Args:
        date: outer index label of ``weather_file``.
        store_nbr: index label of ``store_station_pairs``.
        store_station_pairs: lookup frame whose first column holds the station
            number for each store (see ``match_store_station``).
        weather_file: frame addressed as ``(date, station_nbr)``
            (see ``read_processed_weather``).

    Returns:
        The ``.values`` array of the matching ``weather_file`` entry.
    """
    # Translate the store into its weather station first ...
    station_entry = store_station_pairs.loc[store_nbr]
    station_nbr = station_entry.values[0]

    # ... then fetch that station's record for the requested date.
    return weather_file.loc[date, station_nbr].values

In [10]:
def get_item_list():
    """Return the distinct values of the third column of ``Data/train.csv``.

    The column is de-duplicated through a ``set`` and handed back as a list,
    so every value appears once. Callers treat the entries as item numbers.
    """
    train_table = pd.read_csv("Data/train.csv", sep=',').values
    distinct_items = set(train_table[:, 2])  # set() removes the duplicates
    return list(distinct_items)

In [11]:
def get_item_data(item_nbr, percent, store_station_pairs, weather_file, train_file):
    """Build the (weather features, units) matrix for one item.

    Only the trailing part of the item's rows is used: with ``n`` matching
    rows, the rows from position ``int(n * (1 - percent))`` up to the end are
    selected, in their order of appearance in ``train_file``.

    Args:
        item_nbr: value matched against the third column of ``train_file``.
        percent: fraction of the item's rows to keep, counted from the end.
        store_station_pairs: lookup frame passed on to ``get_weather_data``.
        weather_file: weather frame passed on to ``get_weather_data``.
        train_file: frame whose columns are read positionally as
            date (0), store_nbr (1), item_nbr (2) and units (3).

    Returns:
        A float array with one row per selected training row: the weather
        values returned by ``get_weather_data`` followed by ``units`` as the
        last column.
    """
    # Take the array once; the loop used to re-evaluate ``train_file.values``
    # three times per row, which pandas may have to rebuild on every access.
    train_values = train_file.values

    item_index = np.where(train_values[:,2] == item_nbr)[0]
    total = item_index.shape[0]
    num = int(total*(1-percent)) # Select according to the given percentage, need to be improved

    selected = range(num, total)
    item_dataset = np.zeros((len(selected), weather_file.values.shape[1]+1))

    for count, i in enumerate(selected):
        index = item_index[i]
        date = train_values[index, 0]
        store_nbr = train_values[index, 1]
        units = train_values[index, 3]

        weather_data = get_weather_data(date, store_nbr, store_station_pairs, weather_file)

        # Weather features first, the sold units as the final column.
        item_dataset[count,:] = np.append(weather_data, [units], axis=0)

    return item_dataset

In [12]:
def write_item_data(tasks):
    """Worker body: write one CSV per item for a chunk taken from ``tasks``.

    Takes a single list of item numbers from the queue, loads the lookup
    tables and the training data, and stores the matrix produced by
    ``get_item_data`` (keeping the last 10% of each item's rows) as
    ``Item_data/item_<item_nbr>.csv``.

    Args:
        tasks: queue-like object whose ``get()`` returns a list of item numbers.
    """
    assigned_items = tasks.get()

    # Inputs shared by every item in this chunk.
    station_lookup = match_store_station()
    weather = read_processed_weather()
    train = pd.read_csv("Data/train.csv", sep=',')

    for item in assigned_items:
        rows = get_item_data(item, 0.1, station_lookup, weather, train) # Give percentage
        item_frame = pd.DataFrame(rows, index=None, columns=None)
        out_path = "Item_data/item_%d.csv" %(int(item))
        item_frame.to_csv(out_path, sep=',', index=None, columns=None)

In [13]:
def multi_process(traget_func):
    """Split the item list into chunks and start one process per chunk.

    The items returned by ``get_item_list`` are cut into ``cpu_count() - 1``
    equally sized chunks (at least one) plus, if the division leaves a
    remainder, one extra chunk holding the leftover items. Every non-empty
    chunk is put on a queue and exactly one worker process is started per
    queued chunk, so no worker is left blocking on an empty queue.

    Args:
        traget_func: callable run in each worker as ``traget_func(queue)``.
            It is expected to take exactly one chunk from the queue, as
            ``write_item_data`` does.

    Returns:
        None. The workers are started but not joined.
    """
    processors = multiprocessing.cpu_count()
    item_list = get_item_list()

    # At least one chunk, so a single-CPU machine does not divide by zero.
    n_chunks = max(processors - 1, 1)
    # Integer division: a float chunk size cannot be used as a slice bound.
    div, rem = divmod(len(item_list), n_chunks)

    temp_part = []
    if div:
        temp_part = [item_list[ind:ind+div] for ind in range(0, div*n_chunks, div)]
    if rem:
        temp_part.append(item_list[len(item_list)-rem:])

    myTasks = multiprocessing.Queue()
    for each in temp_part:
        myTasks.put(each)

    # One worker per queued chunk; a surplus worker would wait on get() forever.
    Workers = [multiprocessing.Process(target = traget_func, args =(myTasks,)) for _ in temp_part]

    for each in Workers:
        each.start()

In [14]:
def read_item_data(item_nbr):
    """Load ``Item_data/item_<item_nbr>.csv`` and split features from labels.

    Args:
        item_nbr: item number; converted with ``int()`` to build the file name.

    Returns:
        ``(item_data, item_label)``: every column except the last one, and the
        last column on its own.
    """
    path = "Item_data/item_%d.csv" %(int(item_nbr))
    table = pd.read_csv(path, sep=',').values

    features = table[:, :-1]   # all columns but the last
    labels = table[:, -1]      # last column
    return features, labels

In [15]:
def split_item_data(item_nbr, type):
    """Return the train or the test part of one item's data set.

    The item's data is read with ``read_item_data`` and split with
    ``train_test_split`` using ``test_size=0.25`` and ``random_state=42``, so
    repeated calls produce the same partition.

    Args:
        item_nbr: item number passed on to ``read_item_data``.
        type: ``'train'`` or ``'test'``, selecting which part is returned.

    Returns:
        ``(X, y)`` for the requested part.

    Raises:
        ValueError: if ``type`` is neither ``'train'`` nor ``'test'``. This is
            checked before any file is read; previously such a call did all
            the work and then silently returned None.
    """
    if type not in ('train', 'test'):
        raise ValueError("type must be 'train' or 'test', got %r" % (type,))

    item_data, item_label = read_item_data(item_nbr)
    X_train, X_test, y_train, y_test = train_test_split(item_data, item_label, test_size=0.25, random_state=42)

    if type == 'train':
        return X_train, y_train
    return X_test, y_test

In [24]:
def get_full_data():
    """Join every sold training row with its weather record and save the result.

    Reads ``Data/train.csv``, keeps only the rows whose ``units`` is greater
    than zero, and for each remaining row appends the weather values returned
    by ``get_weather_data`` to the row's trailing columns (everything after
    the first two). The combined table is written to
    ``test_train_feature_matrix.csv`` and returned.

    Returns:
        DataFrame whose columns are ``item_nbr``, ``units`` and then the
        columns of the processed weather frame.
    """
    train_file = pd.read_csv("Data/train.csv", sep=',')
    train_file = train_file[train_file['units'] > 0]

    store_station_pairs = match_store_station()
    weather_file = read_processed_weather()

    # Take the array once; the loop used to re-evaluate ``train_file.values``
    # three times for every row, which pandas may have to rebuild on each access.
    train_values = train_file.values

    item_dataset = []
    for train_row in tqdm(train_values):
        date = train_row[0]
        store_nbr = train_row[1]
        weather_data = get_weather_data(date, store_nbr, store_station_pairs, weather_file)
        # Row layout: [item_nbr, units, weather features...]
        item_dataset.append(np.concatenate((train_row[2:], weather_data), axis = 0))

    item_dataset = np.array(item_dataset)
    train_columns = np.concatenate((np.array(['item_nbr','units']), weather_file.columns.values), axis=0)
    train_df = pd.DataFrame(item_dataset, index=None, columns=train_columns)
    # NOTE(review): read_full_data() reads "train_feature_matrix.csv", not this
    # "test_"-prefixed file -- confirm which file name is intended.
    train_df.to_csv("test_train_feature_matrix.csv", sep=',', index=None)
    return train_df

In [None]:
# Build the full feature matrix; get_full_data() also writes it to
# "test_train_feature_matrix.csv". As the last expression of the cell, the
# returned DataFrame becomes the cell output. The progress log captured with
# this cell shows roughly 62 rows/s over 118696 rows, so expect a long run.
get_full_data()


  0%|          | 0/118696 [00:00<?, ?it/s][A
  0%|          | 7/118696 [00:00<32:24, 61.03it/s][A
  0%|          | 14/118696 [00:00<31:36, 62.58it/s][A
  0%|          | 21/118696 [00:00<31:35, 62.61it/s][A
  0%|          | 28/118696 [00:00<31:23, 63.00it/s][A
  0%|          | 35/118696 [00:00<31:26, 62.91it/s][A
  0%|          | 42/118696 [00:00<31:35, 62.61it/s][A
  0%|          | 49/118696 [00:00<31:27, 62.84it/s][A
  0%|          | 56/118696 [00:00<31:28, 62.82it/s][A
  0%|          | 62/118696 [00:00<31:37, 62.52it/s][A
  0%|          | 69/118696 [00:01<31:34, 62.61it/s][A
  0%|          | 76/118696 [00:01<31:38, 62.47it/s][A
  0%|          | 83/118696 [00:01<31:37, 62.50it/s][A
  0%|          | 90/118696 [00:01<31:36, 62.55it/s]Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/woongkook/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/woongkook/anaconda3/lib/python3.6/site-packages/tqdm/_

In [16]:
def read_full_data():
    """Load ``train_feature_matrix.csv`` indexed by its first column.

    The first CSV column is used as the index of the returned DataFrame; the
    remaining columns are kept as data under their original labels.
    """
    stored = pd.read_csv("train_feature_matrix.csv", sep=',')
    matrix = stored.values

    row_labels = matrix[:, 0]
    feature_labels = stored.columns.values[1:]
    return pd.DataFrame(matrix[:, 1:], index=row_labels, columns=feature_labels)