# Step1: Data preprocessing

(Using Python 3.12.11 )

## 1.1. Combine Dataset

From the cyclehire-cleandata files (saved in the 'raw_data' folder), check the total number of files and combine them into a single file ('merged_2018_2021.csv').

In [None]:
import pandas as pd
import glob

# Glob patterns for the per-period trip files (2018-2019 and 2020-2021).
# NOTE(review): the two patterns look in different directories
# ('raw_data/' vs 'data_propress/raw_data/') - confirm both locations are
# intended; if one is wrong, that range of years is silently left out.
source_patterns = (
    'raw_data/[2][0][1][8-9]_??.csv',
    'data_propress/raw_data/[2][0][2][0-1]_??.csv',
)
file_list = sorted(
    path for pattern in source_patterns for path in glob.glob(pattern)
)
print("Total number：", len(file_list))

# Stack every input table row-wise into one frame with a fresh 0..n-1 index
monthly_frames = [pd.read_csv(path) for path in file_list]
df_all = pd.concat(monthly_frames, ignore_index=True)

# Persist the merged table for the preprocessing step
df_all.to_csv('merged_2018_2021.csv', index=False)

print("output：merged_2018_2021.csv")


## 1.2. Data Preprocessing

(1) Clean trip data, correct timestamps, and remove abnormal records. 

(2) Extract date and hour features for each trip.

(3) Identify weekend and weekday trips. 

(4) Construct hourly OD (Origin–Destination) flow between stations.  

(5) Export processed OD data for further modeling.

In [None]:
import pandas as pd

# Duration bounds (in seconds) for a trip to count as genuine demand:
#   <= 60 s   -> likely a malfunction
#   >= 12 h   -> likely forgotten to return, not valid for demand estimation
MIN_DURATION_S = 60
MAX_DURATION_S = 12 * 3600

# read data
df = pd.read_csv("merged_2018_2021.csv")

# Convert time fields to timezone-aware (UTC) datetimes; values that cannot
# be parsed become NaT and are filtered out below.
# NOTE(review): 'hour' / 'end_hour' are therefore UTC clock hours - confirm
# that UTC (rather than local time) is what the demand model expects.
for ts_col in ('started_at', 'ended_at'):
    df[ts_col] = pd.to_datetime(df[ts_col], utc=True, errors='coerce')

# One combined filter:
#   - both timestamps valid
#   - trip does not start and end at the same station
#   - duration strictly inside (MIN_DURATION_S, MAX_DURATION_S)
keep = (
    df['started_at'].notna()
    & df['ended_at'].notna()
    & (df['start_station_id'] != df['end_station_id'])
    & (df['duration'] > MIN_DURATION_S)
    & (df['duration'] < MAX_DURATION_S)
)
# Take an explicit copy so the column assignments below write to an
# independent frame instead of a filtered view of the original
# (this is what avoids pandas' SettingWithCopyWarning).
df = df.loc[keep].copy()

# Extract date and hour features.
# 'date' is the trip's start day as a timezone-naive midnight timestamp.
df['date'] = df['started_at'].dt.tz_localize(None).dt.normalize()
df['hour'] = df['started_at'].dt.hour
# The end time is rounded UP to the next full hour before taking the hour,
# so a trip ending at 10:20 gets end_hour 11 and one ending after 23:00
# wraps around to end_hour 0.
df['end_hour'] = df['ended_at'].dt.ceil('h').dt.hour

# Weekend flag: weekday is 0 = Monday ... 6 = Sunday, so >= 5 is Sat/Sun
df['is_weekend'] = df['date'].dt.weekday >= 5

# Categorical string label: '1' = weekday, '0' = weekend
df['day_type'] = df['is_weekend'].map({False: '1', True: '0'})

# Construct hourly OD flow: number of trips per
# (date, start hour, end hour, origin, destination, day type)
od_keys = [
    'date', 'hour', 'end_hour',
    'start_station_id', 'end_station_id', 'day_type',
]
od = df.groupby(od_keys).size().reset_index(name='trips')

# Save the result
od.to_csv("od.csv", index=False)

od.head()

## 1.3. Zero-Inflated Poisson Prediction

Estimate hourly OD demand using a zero-inflated Poisson model, fitted separately for each day type.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

# Load the hourly OD table from 'od.csv'
df = pd.read_csv("od.csv")

# Dates come back from CSV as text, so parse them into datetimes again
df['date'] = pd.to_datetime(df['date'])

# Treat station identifiers as string labels rather than numbers
df = df.astype({'start_station_id': str, 'end_station_id': str})

#Zero-inflated Poisson prediction function
def zip_predict(row, rng=None):
    """Draw one random demand value from a zero-inflated Poisson model.

    With probability ``zero_rate`` the result is a structural zero;
    otherwise it is a Poisson draw with rate ``count_process_rate``
    (negative rates are clamped to 0).

    Parameters
    ----------
    row : mapping
        Anything indexable by key (e.g. a DataFrame row or a dict) that
        provides ``row['zero_rate']`` and ``row['count_process_rate']``.
    rng : numpy.random.Generator, optional
        Random source to draw from. Pass a seeded generator for
        reproducible results. When omitted, the global ``np.random``
        state is used, exactly as before, so ``np.random.seed`` still
        applies.

    Returns
    -------
    int
        The sampled non-negative trip count.
    """
    pi = row['zero_rate']                 # Zero-inflation rate
    lambda_c = row['count_process_rate']  # Poisson rate for the count process

    # Pick the uniform / Poisson samplers from the requested random source
    if rng is None:
        draw_uniform, draw_poisson = np.random.rand, np.random.poisson
    else:
        draw_uniform, draw_poisson = rng.random, rng.poisson

    # Structural zero
    if draw_uniform() < pi:
        return 0

    # Poisson count process (a rate below 0 is invalid, so clamp it)
    lam = max(lambda_c, 0)
    # int() gives a plain Python int on both return paths
    return int(draw_poisson(lam))


# Repeat the ZIP process for each day_type separately.
# For every day type this estimates, per (hour, origin, destination):
#   zero_rate           - share of days with no trip at all (pi)
#   overall_avg_trips   - mean trips per day over all days (lambda)
#   count_process_rate  - mean trips per day over days WITH trips (lambda_c)
# and writes one CSV per day type.
for dt in sorted(df['day_type'].unique()):
    print(f"\nday_type = {dt} ")

    df_dt = df[df['day_type'] == dt].copy()

    # Number of distinct dates of this day type that appear in the data;
    # this is the denominator for every per-day rate below.
    n_days = df_dt['date'].nunique()
    print("days count=", n_days)

    # Sum over any remaining columns (such as end_hour) so that each
    # (date, hour, origin, destination) has exactly one trip count.
    daily_trips = df_dt.groupby(
        ['date', 'hour', 'start_station_id', 'end_station_id']
    )['trips'].sum().reset_index()

    # Aggregate statistics across days
    agg = daily_trips.groupby(['hour', 'start_station_id', 'end_station_id']).agg(
        total_trips=('trips', 'sum'),
        days_with_trips=('trips', lambda x: (x > 0).sum())
    ).reset_index()

    # Compute zero-related statistics. Days without a row for an OD/hour
    # combination are the zero-trip days.
    agg['days_with_zero_trips'] = n_days - agg['days_with_trips']
    agg['zero_rate'] = agg['days_with_zero_trips'] / n_days      # π
    agg['overall_avg_trips'] = agg['total_trips'] / n_days       # λ

    # λ_c: Poisson rate for the count process, chosen so that
    # (1 - π) * λ_c equals the overall average; 0 when every day is zero
    # (guards the division by 1 - π).
    agg['count_process_rate'] = np.where(
        agg['zero_rate'] < 1.0,
        agg['overall_avg_trips'] / (1 - agg['zero_rate']),
        0.0
    )

    # ZIP-based demand prediction: one random draw per row
    agg['demand_avg'] = agg['overall_avg_trips']
    agg['demand_zip_poisson'] = agg.apply(zip_predict, axis=1)

    # Integer demand from the average.
    # demand_up: round up to the next whole trip.
    agg['demand_up'] = np.ceil(agg['overall_avg_trips']).astype(int)
    # demand_0.5: round half UP (0.5 -> 1, 2.5 -> 3). Series.round() is not
    # used here because it rounds exact halves to the nearest even number
    # (0.5 -> 0, 2.5 -> 2), which contradicts the 0.5 threshold.
    agg['demand_0.5'] = np.floor(agg['overall_avg_trips'] + 0.5).astype(int)

    # Export results
    out_name = f"od_hourly_zip_poisson_daytype_{dt}.csv"
    agg.to_csv(out_name, index=False)

    print(f"day_type={dt} output to: {out_name}")
    preview_cols = [
        'hour',
        'start_station_id',
        'end_station_id',
        'demand_avg',
        'zero_rate',
        'demand_zip_poisson',
    ]
    print(agg[preview_cols].head(10))
