Exploration of the data from the [Di-Tech Challenge](http://research.xiaojukeji.com/competition), organized by Didi Chuxing, a ride-hailing company in China. The data is described [here](http://research.xiaojukeji.com/competition/detail.action?competitionId=DiTech2016).

In [None]:
try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; perf_counter is the documented replacement.
    from time import perf_counter as clock
import numpy as np
import pandas as pd

# Warn about chained assignment in pandas?
# pd.options.mode.chained_assignment = None

# Order Info Table

<table>
        <tr>
            <th>Field</th>
            <th>Type</th>
            <th>Meaning</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>order_id</td>
            <td>string</td>
            <td>order ID</td>
            <td>70fc7c2bd2caf386bb50f8fd5dfef0cf</td>
        </tr>
        <tr>
            <td>driver_id</td>
            <td>string</td>
            <td>driver ID</td>
            <td>56018323b921dd2c5444f98fb45509de</td>
        </tr>
        <tr>
            <td>passenger_id</td>
            <td>string</td>
            <td>user ID</td>
            <td>238de35f44bbe8a67bdea86a5b0f4719</td>
        </tr>
        <tr>
            <td>start_district_hash</td>
            <td>string</td>
            <td>departure</td>
            <td>d4ec2125aff74eded207d2d915ef682f</td>
        </tr>
        <tr>
            <td>dest_district_hash</td>
            <td>string</td>
            <td>destination</td>
            <td>929ec6c160e6f52c20a4217c7978f681</td>
        </tr>
        <tr>
            <td>Price</td>
            <td>double</td>
            <td>Price</td>
            <td>37.5</td>
        </tr>
        <tr>
            <td>Time</td>
            <td>string</td>
            <td>Timestamp of the order</td>
            <td>2016-01-15 00:35:11</td>
        </tr>
</table>

The Order Info Table shows the basic information of an order: the passenger and the driver (if driver_id = NULL, the order was not answered by any driver), the place of origin, the destination, the price, and the time. The fields order_id, driver_id, passenger_id, start_district_hash, and dest_district_hash have been anonymized.

In [None]:
# Header-less order files: column names in file order
columns = ['order_id', 'driver_id', 'passenger_id', 'start_district_hash', 'dest_district_hash', 'price', 'time']

# Alternative: open a single file
# order_file_1 = "data/season_1/training_data/order_data/order_data_2016-01-01"
# df = df_1 = pd.read_csv(order_file_1, sep = "\t", names = columns, parse_dates = 'time')

# Alternative: one file per date, all of them
# n_files = 22
# order_files = ["data/season_1/training_data/order_data/order_data_2016-01-{:02d}".format(i) 
#                for i in range(1, n_files)]

# Currently only the daily files for 2016-01-10 .. 2016-01-15 are used
order_files = [
    "data/season_1/training_data/order_data/order_data_2016-01-{:02d}".format(i)
    for i in range(10, 16)
]

# Read each daily file (tab-separated) and stack them into one frame
order_dfs = [pd.read_csv(order_file, sep = "\t", names = columns) for order_file in order_files]
df = pd.concat(order_dfs)

# Parse the time column into datetimes
df['time'] = pd.to_datetime(df['time'])

In [None]:
# Attach time-related info 

def compute_timeinfo(df, origin = '2016-01-01'):
    """Add time-slot and weekday columns to ``df`` in place.

    Input: data frame with a datetime ``time`` column.
    ``origin`` is the instant at which absolute slot 0 starts (anything
    ``pd.to_datetime`` accepts).

    Columns written:
      timeslot_absolute -- number of whole 10-minute slots since ``origin``
      timeslot_day      -- slot within the day, 0..143
      dow               -- day of week, Monday = 0
      weekend           -- True when dow >= 5

    Returns None; ``df`` is modified.
    """
    elapsed = df['time'] - pd.to_datetime(origin)

    # Floor-dividing by a Timedelta counts whole slots directly. The previous
    # cast to 'timedelta64[m]' is not a supported resolution in pandas 2.x.
    df['timeslot_absolute'] = elapsed // pd.Timedelta(minutes = 10)
    df['timeslot_day'] = df['timeslot_absolute'] % (24 * 6)

    # Determine day of week, weekend.
    df['dow'] = df['time'].dt.dayofweek
    df['weekend'] = df['dow'] >= 5
    
# Annotate the order frame in place (adds timeslot_absolute, timeslot_day, dow, weekend).
compute_timeinfo(df)

# Set for training, validating, testing

In [None]:
# Orders before 2016-01-15 are used for training, the rest for validation.
# (With only the 2016-01-10 .. 2016-01-15 files loaded, that is five days
# of training data and one day of validation data.)
ind = df['time'] < pd.to_datetime('2016-01-15')

# Take explicit copies: both frames get new columns assigned in place later,
# which on a bare slice triggers pandas' SettingWithCopyWarning.
df_train = df[ind].copy()
df_valid = df[~ind].copy()

# Avoid looking at validation set during the exploration.
# df and df_train are the same object from here on, so in-place changes to one show up in the other.
df = df_train

In [None]:
# Read the list of slots to predict (one 'YYYY-MM-DD-slot' string per line, first line skipped)
order_file = 'data/season_1/test_set_1/read_me_1.txt'
df_test = pd.read_csv(order_file, sep = "\t", names = ['datetimeslot'], skiprows = 1)

# First 10 characters are the date; everything after the separating dash is the slot number
# NOTE(review): the original comment gave the slot range as [0,143] -- confirm against the file.
df_test['date'] = pd.to_datetime(df_test['datetimeslot'].str[:10])
df_test['timeslot'] = df_test['datetimeslot'].str[11:].astype(int)

# Day of week (Monday = 0) and weekend flag
df_test['dow'] = df_test['date'].dt.dayofweek
df_test['weekend'] = df_test['dow'] >= 5

# Gap

In [None]:
def compute_gap(df):
    """Flag unanswered orders and count them per district and time slot.

    Input: data frame with ['driver_id', 'timeslot_absolute', 'start_district_hash'].
    Writes two columns in place and returns None:
      is_gap -- True where driver_id is null
      gap    -- number of is_gap rows sharing the row's (district, slot) pair
    """
    df['is_gap'] = df['driver_id'].isnull()

    keys = ['start_district_hash', 'timeslot_absolute']
    df['gap'] = df.groupby(keys)['is_gap'].transform('sum')
    
# Apply to training and validation set.
# compute_gap adds the 'is_gap' and 'gap' columns to each frame in place.
compute_gap(df_train)
compute_gap(df_valid)

In [None]:
def rolling_thing_by_basis(thing, basis, window, func, *args, **kwargs):
    """Apply ``func`` over a value-based window of ``thing`` keyed by ``basis``.

    For every value v in ``basis``, the entries of ``thing`` whose basis value
    lies in [v + window[0], v + window[1]] are passed to
    ``func(chunk, *args, **kwargs)``. The result has one entry per entry of
    ``basis``.

    ``thing`` is re-indexed by ``basis`` so the window can be cut with an
    index slice; this requires ``basis`` to be sorted.

    http://stackoverflow.com/questions/14300768/
    pandas-rolling-computation-with-window-based-on-values-instead-of-counts
    """
    lookup = pd.Series(thing.values, index = basis.values)

    def apply_window(val):
        # slice_indexer tolerates window limits that are not labels of the index,
        # unlike a plain .loc[val + a : val + b] lookup.
        span = lookup.index.slice_indexer(val + window[0], val + window[1], 1)
        return func(lookup[span], *args, **kwargs)

    return basis.apply(apply_window)

# Toy frame, sorted by the basis column as the indexed implementation requires.
d = pd.DataFrame({'gap': [10,11,12,13], 'timeslot': [1,2,3,4]}).sort_values(by = 'timeslot')

# Sum the gaps whose timeslot lies 1 to 3 slots before each row's own slot
# (window [-3, -1]); for this frame the sums are 0, 10, 21, 33.
d['applied'] = rolling_thing_by_basis(d.gap, d.timeslot, [-3, -1], np.sum)
d

In [None]:
def rolling_thing_by_basis(thing, basis, window, func, *args, **kwargs):
    """Apply ``func`` over a value-based window of ``thing`` keyed by ``basis``.

    Boolean-mask variant: for every value v in ``basis``, the entries of
    ``thing`` whose basis value lies in [v + window[0], v + window[1]]
    (both ends inclusive) are passed to ``func(chunk, *args, **kwargs)``.
    ``basis`` does not need to be sorted, but every call scans the whole
    column.

    This definition shadows the indexed one when the cells are run in order,
    so it accepts the same extra ``*args`` / ``**kwargs`` and forwards them
    to ``func``.

    http://stackoverflow.com/questions/14300768/
    pandas-rolling-computation-with-window-based-on-values-instead-of-counts
    """
    def apply_window(val):
        in_window = (basis >= val + window[0]) & (basis <= val + window[1])
        return func(thing[in_window], *args, **kwargs)

    return basis.apply(apply_window)

# Same toy frame as for the indexed variant; no sorting is applied here.
d = pd.DataFrame({'gap': [10,11,12,13], 'timeslot': [1,2,3,4]})

# Sum the gaps whose timeslot lies 1 to 3 slots before each row's own slot
# (window [-3, -1]); for this frame the sums are 0, 10, 21, 33.
d['applied'] = rolling_thing_by_basis(d.gap, d.timeslot, [-3, -1], np.sum)
d

In [None]:
# Compute rolling average

def compute_prior_mean(df, window = (-3, -1)):
    """Add ``gap_mean_prior``: mean gap of the preceding slots in the same district.

    Input: data frame with ['start_district_hash', 'timeslot_absolute', 'gap'].
    ``window`` is an inclusive range of slot offsets relative to each row's
    own slot; the default (-3, -1) covers the three slots before it.

    For every row, the per-(district, slot) gap is looked up at each offset
    and averaged. Slots with no rows in ``df`` are skipped; the result is NaN
    when none of the slots in the window are present.

    The column is written to ``df`` in place; returns None.

    The previous body called ``get_group`` with a sub-frame instead of a group
    key and never stored a result, so it could not produce the column.
    """
    keys = ['start_district_hash', 'timeslot_absolute']

    # One gap value per (district, slot); rows of a slot may repeat the value.
    slot_gap = df.groupby(keys)['gap'].mean()

    districts = df['start_district_hash']
    slots = df['timeslot_absolute']

    # One column per offset: the gap of the same district at slot + offset,
    # NaN where that (district, slot) pair has no rows.
    prior = pd.DataFrame(
        {offset: slot_gap.reindex(pd.MultiIndex.from_arrays([districts, slots + offset])).values
         for offset in range(window[0], window[1] + 1)},
        index = df.index)

    # Row-wise mean skips NaN. Assign raw values so a non-unique index on df
    # (e.g. after concatenating daily files) cannot interfere with alignment.
    df['gap_mean_prior'] = prior.mean(axis = 1).values

# Peek at the training frame, then run the prior-mean computation on it.
df_train.head(10)
compute_prior_mean(df_train)
# df_train.head(10)

In [None]:
cols = ['start_district_hash', 'timeslot_absolute']

# One gap value per (district, slot). groupby sorts by its keys, so within a
# district the rows come out in slot order.
df_sorted = df.sort_values(by = 'timeslot_absolute')
df_rolling = df_sorted.groupby(cols)['gap'].mean().reset_index()
df_grouped = df_rolling.groupby('start_district_hash')

# Mean gap over up to three preceding observed slots of the same district.
# shift(1) keeps the current slot out of its own "prior" mean. Note that this
# counts rows, so slots without any orders are skipped rather than treated as
# gaps in time. (rolling(...).mean() already is a mean -- no extra division by 3.)
df_rolling['gap_mean_prior'] = df_grouped['gap'].transform(
    lambda gap: gap.shift(1).rolling(window = 3, min_periods = 1).mean())

# The first slot of each district has no history; back-fill it from the next
# slot of the same district, as the original cell intended.
df_rolling['gap_mean_prior'] = df_rolling.groupby('start_district_hash')['gap_mean_prior'].bfill()

# Attach the per-slot value to every order. The key columns must stay in df
# for the merge (and for later cells), so nothing is dropped except a stale
# copy of the target column, if one exists.
df = df.drop(columns = 'gap_mean_prior', errors = 'ignore')
df = df.merge(df_rolling[cols + ['gap_mean_prior']], on = cols, how = 'left')

# Weather

<table>
        <tr>
            <th>Field</th>
            <th>Type</th>
            <th>Meaning</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>Time</td>
            <td>string</td>
            <td>Timestamp</td>
            <td>2016-01-15 00:35:11</td>
        </tr>
        <tr>
            <td>Weather</td>
            <td>int</td>
            <td>Weather</td>
            <td>7</td>
        </tr>
        <tr>
            <td>temperature</td>
            <td>double</td>
            <td>Temperature</td>
            <td>-9</td>
        </tr>
        <tr>
            <td>PM2.5</td>
            <td>double</td>
            <td>pm25</td>
            <td>66</td>
        </tr>
</table>

The Weather Info Table shows the weather every 10 minutes for each city. The weather field encodes the weather conditions (sunny, rainy, snowy, etc.); all sensitive information has been removed. The temperature is given in degrees Celsius, and PM2.5 is the level of air pollution.

In [None]:
# Files are organized by dates: one file per day, numbered 01 .. n_files
n_files = 21
weather_files = ["data/season_1/training_data/weather_data/weather_data_2016-01-{:02d}".format(i)
                 for i in range(1, n_files + 1)]  # + 1: range() excludes its end, day 21 was being skipped

# Open all of them
columns = ['time', 'weather', 'temperature', 'pm25']
weather_dfs = []
for f in weather_files:
    weather_dfs.append(pd.read_csv(f, sep = "\t", names = columns))
dfw = pd.concat(weather_dfs)

# Extract date and timeslot
dfw['time'] = pd.to_datetime(dfw.time)
compute_timeinfo(dfw)

# Merge into main data frame, and fill missing values
# http://pandas.pydata.org/pandas-docs/stable/missing_data.html
# Only the weather measurements are merged, one row per slot. Merging all of dfw
# would rename the shared columns (time, timeslot_day, dow, weekend) to *_x / *_y
# and duplicate every order whose slot has more than one weather reading.
weather_cols = ['timeslot_absolute', 'weather', 'temperature', 'pm25']
dfw_slots = dfw[weather_cols].drop_duplicates(subset = ['timeslot_absolute'], keep = 'first')
df = df.merge(dfw_slots, on = ['timeslot_absolute'], how = 'left')
df.temperature = df.temperature.ffill() # forward fill slots without a reading
# df.temperature = df.temperature.interpolate(method = 'time') # time-based interpolation

# Categorical Variables

In [None]:
# How many of the most popular districts to keep
k = 10

# Per-row order count of the row's start district, then a dense rank of that
# count (rank 1 = largest count; districts with equal counts share a rank)
df['start_district_count'] = df.groupby('start_district_hash')['start_district_hash'].transform('count')
df['start_district_rank'] = df['start_district_count'].rank(method = 'dense', ascending = False)

# Rows belonging to districts ranked within the top k
num_entries = len(df)
df_filtered = df[df['start_district_rank'] <= k]
num_top = df_filtered.groupby('start_district_hash')['start_district_count'].mean().sum()
districts_top = df_filtered['start_district_hash']

# Preparation for Validation

In [None]:
# Compute time slot
compute_timeinfo(dfw)

# Compute gap per time slot per district
compute_gap(df_valid)

# Merge temperature
# Only the weather measurements, one row per slot: merging all of dfw would
# rename the shared columns (time, timeslot_day, dow, weekend) to *_x / *_y and
# duplicate orders whose slot has several weather readings.
weather_cols = ['timeslot_absolute', 'weather', 'temperature', 'pm25']
df_valid = df_valid.merge(dfw[weather_cols].drop_duplicates(subset = ['timeslot_absolute'], keep = 'first'),
                          on = ['timeslot_absolute'], how = 'left')
df_valid.temperature = df_valid.temperature.ffill()

# One-hot encoding of districts
# df_valid['district'] = np.nan
# df_valid.loc[df_valid.start_district_hash.isin(districts), 'district'] = \
#     df_valid.loc[df_valid.start_district_hash.isin(districts), 'start_district_hash']
# dummies = pd.get_dummies(df_valid['district'], dummy_na = False)
# df_valid = pd.concat((df_valid.drop('district', axis = 1), dummies.astype(int)), axis = 1)

# Replace district by popularity
# The rank column on the training frame is 'start_district_rank' (there is no 'district_rank').
districts_rank = df[['start_district_hash', 'start_district_rank']]
districts_rank = districts_rank.drop_duplicates(subset = ['start_district_hash'], keep = 'first')
districts_rank = districts_rank.set_index('start_district_hash')['start_district_rank']
# map() yields NaN for a district that never occurs in the training frame,
# where a label lookup would raise a KeyError.
df_valid['start_district_rank'] = df_valid['start_district_hash'].map(districts_rank)

# Predictions by Clusters

In [None]:
# Make first prediction by simply taking the prior mean per start_district_hash
# NOTE(review): df_cluster and df_valid_cluster are not defined in the cells
# shown here -- presumably per-(district, slot) aggregates of the training and
# validation frames; confirm where they are built before running this cell.
train_outcome = df_cluster['gap']
train_predict = df_cluster['gap_mean_prior']

# Validation
valid_outcome = df_valid_cluster['gap']
valid_predict = df_valid_cluster['gap_mean_prior']

# Prediction with sklearn

In [None]:
# Select features
# The per-day slot column produced by compute_timeinfo is 'timeslot_day';
# the training frame has no 'timeslot' column.
cols = ['start_district_rank', 'dow', 'timeslot_day', 'temperature', 'gap_mean_prior']
# cols = ['start_district_rank', 'weekend', 'timeslot_day', 'temperature', 'gap_mean_prior']
train = df[cols]

# Reference outcome. Defined before fitting so the regressor is trained on
# the target of this frame, not on a leftover from an earlier cell.
train_outcome = df['gap']

# Select regressor
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(n_estimators = 10)
# from sklearn.tree import DecisionTreeRegressor
# reg = DecisionTreeRegressor(max_depth = 3)

# time.clock was removed in Python 3.8; timeit.default_timer is available everywhere.
from timeit import default_timer

# Fit training data
start = default_timer()
reg.fit(train, train_outcome)
print("Fit in {:.0f} seconds.".format(default_timer() - start))

# Predict back on the training data (in-sample fit)
start = default_timer()
train_predict = reg.predict(train)
print("Extrapolate in {:.0f} seconds.".format(default_timer() - start))

# Validation
# NOTE(review): df_valid needs every column in cols (including
# 'gap_mean_prior') before this runs -- confirm it is computed for the
# validation frame.
valid_outcome = df_valid['gap']
valid_predict = reg.predict(df_valid[cols])

# Evaluation
Given districts $d_i$ and time slots $t_j$, let $gap_{ij}$ be the supply-demand gap and $s_{ij}$ your prediction. The evaluation metric is:
![MAPE](figures/mape.jpg)
The lowest MAPE will be the best.

In [None]:
def mape(outcome, predict):
    """Compute the MAPE score of ``predict`` against ``outcome``. Lower is better.

    ``outcome`` is a pandas Series of observed gaps; ``predict`` is a Series
    or array of the same length. The score is the sum of
    |outcome - predict| / outcome over the entries with non-zero outcome,
    divided by the number of such entries.

    Entries whose relative error is NaN (zero outcome, or a missing
    prediction) contribute zero error. Returns NaN when no outcome is
    non-zero, since the average is then undefined.
    """
    import numpy as np

    # Number of summands with a non-zero denominator.
    # (Series.nonzero() no longer exists in pandas >= 1.0.)
    nq = int((outcome != 0).sum())
    if nq == 0:
        return float('nan')

    # Relative error; a zero outcome becomes NaN so it drops out below
    # instead of producing a division by zero.
    diff = (outcome - predict) / outcome.replace({0: np.nan})
    diff = diff.fillna(0).abs()

    return diff.sum() / nq

# Reference points from the online leaderboard, as of June 7th...
# lowest score online is 0.224257,
# 100th is 0.27747,
# 500th is 0.360159.

# Score the most recently computed predictions on the training data ...
score = mape(train_outcome, train_predict)
print("Training MAPE: {:.6f}".format(score))

# ... and on the validation data.
score = mape(valid_outcome, valid_predict)
print("Validation MAPE: {:.6f}".format(score))

# District Info Table

<table>
        <tr>
            <th>Field</th>
            <th>Type</th>
            <th>Meaning</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>district_hash</td>
            <td>string</td>
            <td>District hash</td>
            <td>90c5a34f06ac86aee0fd70e2adce7d8a</td>
        </tr>
        <tr>
            <td>district_id</td>
            <td>string</td>
            <td>District ID</td>
            <td>1</td>
        </tr>
</table>

The District Info Table shows the information about the districts to be evaluated in the contest. You need to do the prediction given the districts from the District Definition Table. In the submission of the results, you need to map the district hash value to district mapped ID.

In [None]:
# The start district of an order is the district the order is attributed to
district_col = 'start_district_hash'

# Read the hash -> id conversion table (tab-separated, no header row)
district_file = 'data/season_1/training_data/cluster_map/cluster_map'
district = pd.read_csv(district_file, sep = '\t', names = [district_col, 'district_id'])

# Report the number of districts in the table
print("There are {} districts in the district file.".format(len(district)))

# Add the district_id column to the data frame by matching on the start district hash
df = df.merge(district, how = 'left', on = district_col)

df.head(2)

# Output
<table class="table table-2">
        <tr>
            <th>Data name</th>
            <th>Data type</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>District ID</td>
            <td>string</td>
            <td>1,2,3,4 (the same as district mapping ID)</td>
        </tr>
        <tr>
            <td>Time slot</td>
            <td>string</td>
            <td>2016-01-23-1 (The first time slot on Jan. 23rd, 2016; one day is uniformly divided into 144 ten minute time slots)</td>
        </tr>
        <tr>
            <td>Prediction value</td>
            <td>double</td>
            <td>6.0</td>
        </tr>
</table>

In [None]:
# Select prediction
col_predict = 'gap_mean_prior'
# NOTE: this rebinds df_test to the order frame (same object as df), so the
# columns added below are added to df as well.
df_test = df

# Compute time slot
# One day is uniformly divided into 144 ten minute time slots. Indexed from 1 to 144.
df_test['timeslot_output'] = df_test['timeslot_day'] + 1

# Make the date - timeslot column, e.g. '2016-01-23-1'.
# The order frame has no 'date' column; take the calendar date from 'time'.
df_test['datetimeslot'] = (df_test['time'].dt.strftime('%Y-%m-%d')
                           + '-' + df_test.timeslot_output.astype(int).map(str))

# Prepare output file: one row per (district, slot) with the mean prediction
cols = ['district_id', 'datetimeslot']
final = df_test[cols + [col_predict]].groupby(cols).mean().reset_index()
final.to_csv("predict.csv", index = False, header = False)