Exploration of the data from the [Di-Tech Challenge](http://research.xiaojukeji.com/competition), organized by Didi Chuxing, a ride-hailing company in China. The data is described [here](http://research.xiaojukeji.com/competition/detail.action?competitionId=DiTech2016).

In [None]:
import pandas as pd

# Warn about chained assignment?
# pd.options.mode.chained_assignment = None

# Order Info Table

<table>
        <tr>
            <th>Field</th>
            <th>Type</th>
            <th>Meaning</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>order_id</td>
            <td>string</td>
            <td>order ID</td>
            <td>70fc7c2bd2caf386bb50f8fd5dfef0cf</td>
        </tr>
        <tr>
            <td>driver_id</td>
            <td>string</td>
            <td>driver ID</td>
            <td>56018323b921dd2c5444f98fb45509de</td>
        </tr>
        <tr>
            <td>passenger_id</td>
            <td>string</td>
            <td>user ID</td>
            <td>238de35f44bbe8a67bdea86a5b0f4719</td>
        </tr>
        <tr>
            <td>start_district_hash</td>
            <td>string</td>
            <td>departure</td>
            <td>d4ec2125aff74eded207d2d915ef682f</td>
        </tr>
        <tr>
            <td>dest_district_hash</td>
            <td>string</td>
            <td>destination</td>
            <td>929ec6c160e6f52c20a4217c7978f681</td>
        </tr>
        <tr>
            <td>Price</td>
            <td>double</td>
            <td>Price</td>
            <td>37.5</td>
        </tr>
        <tr>
            <td>Time</td>
            <td>string</td>
            <td>Timestamp of the order</td>
            <td>2016-01-15 00:35:11</td>
        </tr>
</table>

The Order Info Table shows the basic information of an order, including the passenger and the driver (if driver_id = NULL, the order was not answered by any driver), place of origin, destination, price and time. The fields order_id, driver_id, passenger_id, start_district_hash, and dest_district_hash have been anonymized (hashed) so that they contain no sensitive information.

In [None]:
# Names given to the fields of the order files, in file order.
# They are passed to pd.read_csv through its `names` argument.
columns = [
    'order_id',
    'driver_id',
    'passenger_id',
    'start_district_hash',
    'dest_district_hash',
    'price',
    'time',
]

# pd.read_csv?

# Example: open a single file only
# order_file_1 = "data/season_1/training_data/order_data/order_data_2016-01-01"
# df = df_1 = pd.read_csv(order_file_1, sep = "\t", names = columns, parse_dates = 'time')

# print(df.head(2))

In [None]:
# Files are organized by dates, one file per day.
# order_files = ["data/season_1/training_data/order_data/order_data_2016-01-{:02d}".format(i) for i in range(1, 22)]
order_files = ["data/season_1/training_data/order_data/order_data_2016-01-{:02d}".format(i) for i in range(1, 3)]

# Open all of them.
order_dfs = [pd.read_csv(order_file, sep="\t", names=columns) for order_file in order_files]

# Every per-file frame carries its own 0..n-1 index. Re-index the concatenated
# frame so each row has a unique label: the train/validation split below tells
# rows apart by index label, and repeated labels would wrongly remove rows
# from the validation set.
df = pd.concat(order_dfs, ignore_index=True)

# Recognize time column as time
df['time'] = pd.to_datetime(df['time'])

# Randomly keep 70% of the rows for training (fixed seed, so the split is
# reproducible); the remaining rows form the validation set.
df_train = df.sample(frac=0.70, random_state=111)
df_valid = df.loc[~df.index.isin(df_train.index)]
df = df_train # avoid looking at validation set during the exploration

In [None]:
# Peek at the data frame: first rows, summary statistics, covered time span.

first_rows = df.head(2)
print(first_rows)

summary = df.describe()
print(summary)

earliest, latest = df['time'].min(), df['time'].max()
print("\nDates from {} to {}.".format(earliest, latest))

In [None]:
# For every order_id, count the rows that have a (non-null) driver_id.
count = df.groupby('order_id')['driver_id'].count()

# Orders picked up by more than one driver?
print((count > 1).sum())
# Yes..? Surprising.

In [None]:
# The data contains duplicate and almost-duplicate entries.
# For now, rows sharing the same order, driver, passenger and timestamp are
# treated as duplicates and only the last occurrence is kept.
key_columns = ['order_id', 'driver_id', 'passenger_id', 'time']
dup = df.duplicated(subset=key_columns, keep='last')
df = df.loc[~dup]
# Depending on the test data, it might be a better idea to leave them in.

In [None]:
# Count, for every order_id, the rows that have a (non-null) driver_id.
count = df.groupby('order_id')['driver_id'].count()

# Orders picked up by more than one driver?
print(sum(count > 1))
# No more.

# Create gap column: 1 when the order was not answered by any driver (null
# driver_id), 0 otherwise. The flag is computed row by row from df itself so
# that it stays aligned with df's rows; `count` is ordered by order_id, not by
# df's row order, so its values cannot be assigned positionally.
# `assign` returns a new frame, which also avoids writing into the filtered
# view produced by the duplicate removal.
df = df.assign(gap=df['driver_id'].isnull().astype('int'))

print(df.describe())

In [None]:
# Share of orders that no driver picked up.
unanswered = count == 0
s = unanswered.sum()
l = count.size

message = "There are {} orders-without-drivers out of {} orders: {:.1%}."
print(message.format(s, l, s/l))
# It appears the gap is simply the number of orders not picked up.

In [None]:
# Compute time slot

# One day is uniformly divided into 144 ten-minute time slots. The slot is
# derived with integer arithmetic on the hour and minute (6 slots per hour),
# which avoids converting a timedelta to minute resolution (not supported by
# recent pandas versions). Slots are numbered from 1: the prediction format
# defines "2016-01-23-1" as the first time slot of the day.
slot_in_day = df['time'].dt.hour * 6 + df['time'].dt.minute // 10

df = df.assign(
    # The order's date, with the time of day set to midnight.
    date=df['time'].dt.normalize(),
    # Is it a weekend (dayofweek 5 = Saturday, 6 = Sunday) or a weekday?
    weekend=df['time'].dt.dayofweek >= 5,
    timeslot=slot_in_day + 1,
)

# Drop the time column
# df = df.drop('time', axis = 1)

print(df.head(2))

In [None]:
# Compute gap per time slot per district: the number of unanswered orders
# in each (start district, date, time slot) group.
cols = ['start_district_hash', 'date', 'timeslot']
df_select = df[cols + ['gap']]
df_gap = df_select.groupby(cols).sum()

# Flatten data frame after the group by
df_gap = df_gap.reset_index()
print(df_gap.head(2))

# Sanity check: do the numbers add up?
print(sum(df_gap.gap))
# Yup.

# Merge back into main data frame.
# The join keys are the grouping columns only. 'gap' must not be one of the
# keys, otherwise the aggregated value is never attached to the rows. The
# per-order 0/1 flag is kept under the name 'unanswered', and 'gap' becomes
# the per-district, per-time-slot total.
df = df.rename(columns={'gap': 'unanswered'}).merge(df_gap, on=cols, how='left')

# District Info Table

<table>
        <tr>
            <th>Field</th>
            <th>Type</th>
            <th>Meaning</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>district_hash</td>
            <td>string</td>
            <td>District hash</td>
            <td>90c5a34f06ac86aee0fd70e2adce7d8a</td>
        </tr>
        <tr>
            <td>district_id</td>
            <td>string</td>
            <td>District ID</td>
            <td>1</td>
        </tr>
</table>

The District Info Table shows the information about the districts to be evaluated in the contest. Predictions must be made for the districts listed in this table. In the submitted results, each district hash value must be mapped to its district ID.

In [None]:
# The district associated with an order is its starting district.
df = df.rename(columns={'start_district_hash': 'district_hash'})

# Load the hash -> id conversion table for districts.
district_file = 'data/season_1/training_data/cluster_map/cluster_map'
district_columns = ['district_hash', 'district_id']
district = pd.read_csv(district_file, sep='\t', names=district_columns)

# Attach district_id to every row, then discard the hash it replaces.
df = (
    df.merge(district, on='district_hash', how='left')
      .drop('district_hash', axis=1)
)

print(df.head(2))

# Predictions
<table>
        <tr>
            <th>Data name</th>
            <th>Data type</th>
            <th>Example</th>
        </tr>
        <tr>
            <td>District ID</td>
            <td>string</td>
            <td>1,2,3,4 (the same as district mapping ID)</td>
        </tr>
        <tr>
            <td>Time slot</td>
            <td>string</td>
            <td>2016-01-23-1 (The first time slot on Jan. 23rd, 2016; one day is uniformly divided into 144 ten minute time slots)</td>
        </tr>
        <tr>
            <td>Prediction value</td>
            <td>double</td>
            <td>6.0</td>
        </tr>
</table>

In [None]:
# Baseline prediction: the mean gap of each (district_id, weekend, timeslot) group.
cols = ['district_id', 'weekend', 'timeslot']

df_select = df[cols + ['gap']]
grouped_mean = df_select.groupby(cols).mean()
gap_mean = grouped_mean.reset_index().rename(columns={'gap': 'gap_mean'})

# Attach the group mean to every row of the main data frame.
df = df.merge(gap_mean, on=cols, how='left')

print(df.head(2))

In [None]:
# Setup training set, labels, and predictions.
# train_data is an explicit copy: a column is added to it afterwards, and
# writing into a slice of df would trigger pandas' SettingWithCopyWarning.
train_data = df[['district_id', 'date', 'timeslot']].copy()
train_outcome = df['gap']
train_predict = df['gap_mean']

In [None]:
# Build the "<date>-<timeslot>" label used to identify a prediction.
date_part = df['date'].dt.date.map(str)
slot_part = df['timeslot'].astype(int).map(str)
train_data['datetimeslot'] = date_part + '-' + slot_part

Consider districts $d_i$ and time slots $t_j$, with supply-demand gap $gap_{ij}$ and your prediction $s_{ij}$; the evaluation metric is the mean absolute percentage error (MAPE):
![MAPE](figures/mape.jpg)
The lowest MAPE will be the best.

In [None]:
def MAPE(outcome, predict):
    """Compute the mean absolute percentage error (MAPE). Lower is better.

    Args:
        outcome: pandas Series of observed values.
        predict: pandas Series of predicted values, aligned with ``outcome``.

    Returns:
        The mean over all entries of ``|outcome - predict| / outcome``.
        Entries whose ratio is undefined (outcome equal to 0, or a missing
        value on either side) contribute 0 to the sum but still count in
        the denominator of the mean.
    """
    import numpy as np
    # Mask zero outcomes so the division yields NaN rather than +/-inf.
    denominator = outcome.replace({0: np.nan})
    # The absolute value keeps over- and under-predictions from cancelling.
    ratio = ((outcome - predict) / denominator).abs()
    # Undefined terms add nothing to the sum.
    ratio = ratio.fillna(0)
    return ratio.mean() # Compute the average over all district and timeslots

# Score the baseline predictions against the observed gaps.
mape = MAPE(outcome=train_outcome, predict=train_predict)
print(mape)