In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing
import matplotlib.pyplot as plt
import lightgbm as lgb
import datetime
from datetime import timedelta

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Preprocess

In [156]:
# Load the training trips and derive day-of-month / hour-of-day features
# from the 'starttime' strings.
data = pd.read_csv('/Users/leo/Desktop/DSGA_1001/Project/data/data_train.csv')


def getDay(x):
    """Return the day of month of timestamp string ``x``.

    The last five characters of ``x`` are cut off before parsing.
    """
    return dt.datetime.strptime(x[:-5], '%Y-%m-%d %H:%M:%S').day


def getHour(x):
    """Return the hour of timestamp string ``x``.

    The last five characters of ``x`` are cut off before parsing.
    """
    return dt.datetime.strptime(x[:-5], '%Y-%m-%d %H:%M:%S').hour


data['Start Day'] = [getDay(stamp) for stamp in data['starttime']]
data['Start Hour'] = [getHour(stamp) for stamp in data['starttime']]

In [157]:
# Load the validation trips and derive the same day/hour features as for
# the training frame.
data_val = pd.read_csv('/Users/leo/Desktop/DSGA_1001/Project/data/data_val.csv')


def getDay(x):
    """Day of month of timestamp string ``x`` (last five characters ignored)."""
    return dt.datetime.strptime(x[:-5], '%Y-%m-%d %H:%M:%S').day


def getHour(x):
    """Hour of timestamp string ``x`` (last five characters ignored)."""
    return dt.datetime.strptime(x[:-5], '%Y-%m-%d %H:%M:%S').hour


data_val['Start Day'] = [getDay(stamp) for stamp in data_val['starttime']]
data_val['Start Hour'] = [getHour(stamp) for stamp in data_val['starttime']]

In [5]:
# Columns fed to the model, in the order used to build X_train / X_val.
features = [
    'start region',
    'start station latitude',
    'start station longitude',
    'usertype',
    'birth year',
    'gender',
    'Start Day',
    'Start Hour',
]

In [158]:
# Remove, in place, every row that has a missing value in any column,
# from both the training and the validation frame.
data.dropna(inplace=True)
data_val.dropna(inplace=True)

In [7]:
# Integer-encode the 'usertype' and 'gender' columns of the training frame.
# The fitted encoders stay available as le_user / le_gender.
le_user = preprocessing.LabelEncoder()
data['usertype'] = le_user.fit_transform(data['usertype'])

le_gender = preprocessing.LabelEncoder()
data['gender'] = le_gender.fit_transform(data['gender'])

# Region encoding is currently disabled:
# le_start_station = preprocessing.LabelEncoder()
# le_start_station.fit(data['start region'])
# data['start region'] = le_start_station.transform(data['start region'])

# le_end_station = preprocessing.LabelEncoder()
# le_end_station.fit(data['end region'])
# data['end region'] = le_end_station.transform(data['end region'])

In [176]:
# Encode the validation columns with the encoders that were fitted on the
# training frame (le_user / le_gender). Fitting fresh encoders on the
# validation data would assign different integer codes whenever a category
# is absent from one of the two splits, silently misaligning the features
# the model was trained on. A label unseen during training now raises a
# ValueError instead of being mapped to an arbitrary code.
# le_user1 / le_gender1 are kept as aliases so the names remain defined.
le_user1 = le_user
data_val['usertype'] = le_user1.transform(data_val['usertype'])

le_gender1 = le_gender
data_val['gender'] = le_gender1.transform(data_val['gender'])

# le_start_station1 = preprocessing.LabelEncoder()
# le_start_station1.fit(data1['start region'])
# data1['start region'] = le_start_station1.transform(data1['start region'])

# le_end_station1 = preprocessing.LabelEncoder()
# le_end_station1.fit(data1['end region'])
# data1['end region'] = le_end_station1.transform(data1['end region'])

In [177]:
# Feature matrices and targets for the end-region classifier.
# The target is the trip's end region for BOTH splits. Training on
# 'start region' (which is also one of the input features) would make the
# model learn the identity mapping and would not match the validation target.
X_train = data[features]
X_val = data_val[features]
y_train = data['end region']
y_val = data_val['end region']

In [68]:
# Copy of the training features with the raw 'starttime' strings appended
# as the last column.
X_train_withtime = X_train.assign(starttime=data['starttime'])

In [161]:
# Copy of the validation features with the raw 'starttime' strings appended
# as the last column.
X_val_withtime = X_val.assign(starttime=data_val['starttime'])

In [11]:
# Map each value of the first X_train column to the [second, third] column
# values of the last row in which it appears (later rows overwrite earlier
# ones, as in a row-by-row fill).
region_dict = {}
for region, lat, lon in zip(X_train.iloc[:, 0].to_numpy(),
                            X_train.iloc[:, 1].to_numpy(),
                            X_train.iloc[:, 2].to_numpy()):
    region_dict[region] = [lat, lon]

## Stacked Probabilistic Model

In [178]:
# LightGBM datasets. Each categorical column is named exactly once
# ('start region' was previously listed twice).
# The validation set is bound to the training set via `reference`.
# free_raw_data=False keeps the raw frames attached to the Dataset objects.
lgb_train = lgb.Dataset(
    X_train,
    y_train,
    categorical_feature=['start region', 'usertype', 'gender', 'Start Hour'],
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)

In [179]:
# LightGBM training configuration for the multiclass model.
params = dict(
    task='train',
    objective='multiclass',
    metric='multi_logloss',
    num_class=32,
    verbose=1,
    max_bin=128,          # larger gives more accurate results but slower training
    learning_rate=0.01,   # learning rate
    num_leaves=64,        # larger is more accurate but may overfit
    num_iterations=1000,
    early_stopping_round=2,
)

In [None]:
# Train the booster, evaluating on the validation dataset as it goes.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
    verbose_eval=True,
)

In [None]:
# Assumption functions

def get_duration(station1, station2, region_dict):
    """Return 1000 times the straight-line distance between two stations.

    ``region_dict`` maps a station key to a sequence whose first two entries
    are its coordinates; the distance is the Euclidean norm of the
    coordinate differences.
    """
    origin = region_dict[station1]
    target = region_dict[station2]
    d_lat = origin[0] - target[0]
    d_long = origin[1] - target[1]
    squared = d_lat**2 + d_long**2
    return np.sqrt(squared) * 1000

def user_end_distribution(model, data):
    """Return ``model.predict(data)`` unchanged.

    Per the original note, the result is meant to describe station ids and
    their outflow percentages.
    """
    prediction = model.predict(data)
    return prediction

# get data from data frame
# return the matching rows as a data frame (without the 'starttime' column)
# start_time_min/max must be datetime objects
def get_unlock_data(start_time_min, start_time_max, start_station, full_data):
    """Select the trips that left ``start_station`` inside a time window.

    Parameters
    ----------
    start_time_min, start_time_max : datetime.datetime
        Exclusive lower and upper bounds of the window. Subclasses of
        ``datetime`` (e.g. ``pandas.Timestamp``) are accepted.
    start_station
        Value matched against the 'start region' column.
    full_data : pandas.DataFrame
        Frame with a 'start region' column and a 'starttime' column holding
        strings whose last five characters are stripped before parsing with
        '%Y-%m-%d %H:%M:%S'.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the 'starttime' column removed. ``full_data``
        itself is not modified.

    Raises
    ------
    TypeError
        If either bound is not a datetime object.
    """
    if not isinstance(start_time_min, dt.datetime) or not isinstance(start_time_max, dt.datetime):
        raise TypeError('Not datetime object.')
    station_rows = full_data[full_data['start region'] == start_station]
    # Parse into a separate Series instead of assigning onto the filtered
    # slice, which triggered pandas' SettingWithCopyWarning.
    start_times = pd.Series(
        [dt.datetime.strptime(raw[:-5], '%Y-%m-%d %H:%M:%S') for raw in station_rows['starttime']],
        index=station_rows.index,
        dtype='datetime64[ns]',
    )
    in_window = (start_times > start_time_min) & (start_times < start_time_max)
    # Drop the timestamp by name rather than by position so the result does
    # not depend on 'starttime' being the last column.
    return station_rows.loc[in_window.to_numpy()].drop(columns='starttime')

# subtract duration (minutes) from time correctly
def subtract_time(time, input_minutes):
    """Return ``time`` moved back by ``input_minutes`` minutes.

    ``time`` must be a ``datetime.datetime`` or a subclass of it (such as
    ``pandas.Timestamp``); anything else raises ``TypeError``.
    ``input_minutes`` may be fractional.
    """
    # isinstance (rather than an exact type comparison) so that datetime
    # subclasses are accepted too.
    if not isinstance(time, dt.datetime):
        raise TypeError('Not datetime object.')
    return time - timedelta(minutes = input_minutes)

# add duration (minutes) to time correctly
def add_time(time, input_minutes):
    """Return ``time`` moved forward by ``input_minutes`` minutes.

    ``time`` must be a ``datetime.datetime`` or a subclass of it (such as
    ``pandas.Timestamp``); anything else raises ``TypeError``.
    ``input_minutes`` may be fractional.
    """
    # isinstance (rather than an exact type comparison) so that datetime
    # subclasses are accepted too.
    if not isinstance(time, dt.datetime):
        raise TypeError('Not datetime object.')
    return time + timedelta(minutes = input_minutes)

# Compare time
def smaller_than_time(time1, time2):
    """Return True when ``time1`` is strictly earlier than ``time2``.

    Both arguments must be ``datetime.datetime`` objects or subclasses of it
    (such as ``pandas.Timestamp``); otherwise ``TypeError`` is raised.
    """
    # isinstance (rather than an exact type comparison) so that datetime
    # subclasses are accepted too.
    if not (isinstance(time1, dt.datetime) and isinstance(time2, dt.datetime)):
        raise TypeError('Not datetime object.')
    return time1 < time2

def get_historical_inflow(start_station, end_station, end_time):
    """Placeholder for a historical-inflow lookup.

    All three arguments are currently ignored; the constant adjustment
    term 5 is returned.
    """
    return 5

In [193]:
# all_stations: a list of all stations
# interval: mins (+- 10 mins)
def calculate_inflow(predTime, curTime, end_station, all_stations, interval):
    """Estimate the inflow into ``end_station`` at ``predTime``.

    For every start station the required departure time is
    ``predTime - get_duration(start_station, end_station, region_dict)``.

    * If that departure time is still in the future relative to ``curTime``
      no unlock can have been observed yet, so
      ``get_historical_inflow`` is used instead.
    * Otherwise the unlocks recorded in ``X_val_withtime`` within
      ``interval`` minutes centred on the departure time are scored by the
      global model ``gbm`` and the predicted probabilities of ending at
      ``end_station`` are summed.

    Parameters
    ----------
    predTime : datetime.datetime
        Time for which the inflow is predicted.
    curTime : datetime.datetime
        The "now" of the prediction; decides which start stations can use
        observed unlock data.
    end_station : int
        Station whose inflow is estimated; also used as the column index
        into the predicted probability rows.
    all_stations : iterable
        Candidate start stations (keys of ``region_dict``).
    interval : number
        Width, in minutes, of the observation window around the departure
        time.

    Returns
    -------
    number
        Accumulated inflow multiplied by ``30 / interval`` when at least one
        station used observed data, otherwise by 1.
    """
    inflow = 0
    ratio = 1
    half_window = interval / 2
    for start_station in all_stations:
        trip_duration = get_duration(start_station, end_station, region_dict)
        start_time = subtract_time(predTime, trip_duration)
        # Use historical data if "now" is earlier than the departure time.
        # The comparison must use curTime: predTime can never be earlier
        # than predTime minus a non-negative duration, so comparing
        # predTime made this branch unreachable.
        if smaller_than_time(curTime, start_time):
            inflow += get_historical_inflow(start_station, end_station, predTime)
            continue
        # NOTE(review): the scale factor is applied to the whole sum,
        # including any historical contributions - confirm this is intended.
        ratio = 30 / interval
        unlock_list = get_unlock_data(subtract_time(start_time, half_window),
                                      add_time(start_time, half_window),
                                      start_station,
                                      X_val_withtime)
        if unlock_list.shape[0] == 0:
            # nothing unlocked in the window; skip the model call entirely
            continue
        # One batched prediction for all users unlocked at this station
        # instead of one model call per row.
        proba = user_end_distribution(gbm, unlock_list)
        inflow += sum(row[end_station] for row in proba)
    return inflow * ratio

## Evaluation & Calculation

#### `calculate_inflow` returns the predicted inflow at "predTime" for station "station_idx", assuming the current time is "curTime". The width (in minutes) of the time window used for collecting real data is set by "window".

In [196]:
# Single-prediction setup: station 11, a 10-minute window, all stations
# 1..31, and a "current time" one hour before the prediction time.
station_idx = 11
window = 10
all_stations = [*range(1, 32)]
predTime = dt.datetime.strptime('2018-07-1 20:30:00.0700'[:-5], '%Y-%m-%d %H:%M:%S')
curTime = predTime - timedelta(hours = 1)

In [None]:
# Hourly inflow predictions for station 11 on 2018-07-15; each prediction
# assumes the current time is one hour before the predicted hour.
# The growing result list is printed after every hour.
station_idx = 11
window = 10
all_stations = list(range(1,32))
predflowlst = []
for hour in range(24):
    predTime = dt.datetime.strptime('2018-07-15 {}:00:00.0700'.format(hour)[:-5],
                                    '%Y-%m-%d %H:%M:%S')
    curTime = predTime - timedelta(minutes = 60)
    predflow = calculate_inflow(predTime, curTime, station_idx, all_stations, window)
    predflowlst.append(predflow)
    print(predflowlst)

In [None]:
# Hourly inflow predictions for station 11 on 2018-07-20; each prediction
# assumes the current time is one hour before the predicted hour.
station_idx = 11
window = 10
all_stations = list(range(1,32))
predflowlst2 = []
for hour in range(24):
    predTime = dt.datetime.strptime('2018-07-20 {}:00:00.0700'.format(hour)[:-5],
                                    '%Y-%m-%d %H:%M:%S')
    curTime = predTime - timedelta(minutes = 60)
    predflow = calculate_inflow(predTime, curTime, station_idx, all_stations, window)
    predflowlst2.append(predflow)