目標: 
一個src，
- 可以前處理資料:
1. 補缺少的日期、hr (不含weekday)

- 可以在特定weekday底下，將所有站別分群，分群的邏輯如下:
1. 計算特定站別在每個weekday、每個Hr底下的出租量的Pr50（中位數），然後計算其三小時的移動平均
2. 以第1點的移動平均建立一個線性回歸，觀察其租借量的趨勢，得到slope與intersect（截距）
3. 以上述的slope與intersect作為分群的依據


In [57]:
import pandas as pd
import numpy as np
from dataclasses import asdict, dataclass, field
from typing import Optional, List, Union
import tqdm
# plot
import matplotlib.pyplot as plt
import seaborn as sns
# model
from scipy import stats
from sklearn import cluster


In [None]:
# Columns read from the raw rental CSV.
RENT_COLS = [
    'station', 'Generation', 'sno', 'time',
    'date', 'Hr', 'rent_count', 'weekday',
]
# Calendar order of the weekday labels.
WEEKDAYS = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Load the raw records and parse the 'date' column into datetimes.
data = pd.read_csv('./data/df_Final.csv', usecols=RENT_COLS).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# One row per (station, sno, date, weekday); one column per value of 'Hr'
# holding the summed rent_count.
data_rentCnt = pd.pivot_table(
    data,
    values='rent_count',
    index=['station', 'sno', 'date', 'weekday'],
    columns='Hr',
    aggfunc='sum',
).reset_index()

In [26]:
class StationHrDes:
    """Per-weekday hourly rental profile of a single station.

    For every weekday the median (Pr50) rent count of each hour is smoothed
    with a circular moving average, and a straight line is fitted through the
    24 smoothed values.

    Attributes set by ``__init__``:
        MA_hr_pr50: DataFrame, one row per weekday (calendar order), one
            column per hour, holding the smoothed medians.
        lr_des: DataFrame indexed by weekday with columns ``slope``,
            ``intersect`` and ``sno``.
        sno: the station number passed in.
    """

    HOURS = list(range(24))
    WEEKDAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    COLS_REQ = ['station', 'sno', 'date', 'weekday'] + HOURS

    def __init__(self, df_station: pd.DataFrame, sno: Optional[int] = None, hr_ma_windows=3):
        """Build the weekday/hour profile and its linear-regression summary.

        Args:
            df_station: rows of one station; must contain every column in
                ``COLS_REQ`` (hour columns are the integers 0..23).
            sno: station number, stored and attached to ``lr_des``.
            hr_ma_windows: moving-average window in hours.

        Raises:
            AssertionError: when required columns are missing.
        """
        col_miss = set(self.COLS_REQ) - set(df_station.columns)
        if col_miss:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            raise AssertionError(
                f"columns requirements: 'station', 'sno', 'date', 'weekday', 0,1,2,...,23 (hr); Missing cols: {col_miss}"
            )
        self.sno = sno

        # Moving average of the hourly medians, computed per weekday.
        ma = (
            self.fill_miss_days(df_station)
            .groupby('weekday')[self.HOURS]
            .apply(lambda x: self.weekday_hr_ma(x, windows=hr_ma_windows))
        )
        # Order rows by calendar weekday; labels outside WEEKDAYS are dropped.
        self.MA_hr_pr50 = ma.loc[[w for w in self.WEEKDAYS if w in ma.index]]

        self.lr_des = self.lr_des_perttyFormat(
            self.MA_hr_pr50.apply(self.scipy_lr, axis=1)
        ).assign(sno=sno)

    def fill_miss_days(self, df):
        """Insert a row for every calendar day missing between min and max 'date'.

        The result is indexed by 'date'. Missing rent counts (hour columns)
        are filled with 0, both on inserted days and where existing rows had
        NaN. The other columns of inserted rows (e.g. 'weekday') are left as
        NaN, so those rows do not join any weekday group.
        """
        indexed = df.set_index('date')
        if indexed.empty:
            return indexed
        every_day = pd.date_range(indexed.index.min(), indexed.index.max(), freq='D')
        # The union keeps every original row even if it is off the daily grid.
        ans = indexed.reindex(indexed.index.union(every_day))
        hour_cols = [h for h in self.HOURS if h in ans.columns]
        if hour_cols:
            ans[hour_cols] = ans[hour_cols].fillna(0)
        ans.index.name = 'date'
        return ans

    def weekday_hr_ma(self, df, windows=3, enough_samples=5):
        """Return the circular moving average of the hourly median rent count.

        Args:
            df: subset holding a single weekday, with hour columns 0..23.
            windows: trailing window length in hours (1..24). The window wraps
                around midnight, e.g. with 3 the value for hour 0 averages
                hours 22, 23 and 0.
            enough_samples: hours with fewer non-NaN samples than this are
                set to 0.

        Returns:
            Series of 24 values indexed by hour.

        Raises:
            ValueError: when ``windows`` is outside 1..24.
        """
        if not 1 <= windows <= len(self.HOURS):
            raise ValueError(f"windows must be between 1 and {len(self.HOURS)}, got {windows}")
        hourly = df[self.HOURS]
        hr_Pr50 = hourly.quantile(.5)
        hr_cnt = hourly.notna().sum()

        # Prepend the last (windows - 1) hours so the first hours of the day
        # get a full window, then drop the prepended part again.
        pad = windows - 1
        wrapped = pd.concat([hr_Pr50.iloc[-pad:], hr_Pr50]) if pad else hr_Pr50
        ans = wrapped.rolling(windows).mean().iloc[pad:].copy()

        ans.loc[hr_cnt < enough_samples] = 0
        ans.index.name = 'weekday'
        return ans

    def scipy_lr(self, ser, return_line=False):
        """Fit ``ser`` against its positions 0..len(ser)-1 with a straight line.

        Returns ``(slope, intersect)``, or the fitted line values when
        ``return_line`` is true.
        """
        x = np.arange(len(ser))
        fit = stats.linregress(x, ser)
        if return_line:
            return x * fit.slope + fit.intercept
        return fit.slope, fit.intercept

    def lr_des_perttyFormat(self, lr_des):
        """Turn a Series of ``(slope, intersect)`` pairs into a two-column DataFrame."""
        return pd.DataFrame(
            {
                'slope': [pair[0] for pair in lr_des],
                'intersect': [pair[1] for pair in lr_des],
            },
            index=lr_des.index,
        )


In [61]:
@dataclass
class StationsLrHolder:
    """Collects StationHrDes objects and stacks their regression tables."""

    # Every station description added so far.
    stations_info: List[StationHrDes] = field(default_factory=list)
    # Concatenation of each station's lr_des; None until update_LR_data() runs.
    LR_data: Optional[pd.DataFrame] = None

    def add_one_station(self, des: Union[pd.DataFrame, StationHrDes], sno: Optional[int] = None) -> None:
        """Append one station.

        A DataFrame is first converted with df_2_StationHrDes(des, sno); a
        StationHrDes is stored as-is (``sno`` is not used in that case).

        Raises:
            TypeError: if ``des`` is neither of the two accepted types.
        """
        if isinstance(des, StationHrDes):
            station = des
        elif isinstance(des, pd.DataFrame):
            station = self.df_2_StationHrDes(des, sno)
        else:
            raise TypeError("wrong type of des, it should be either pd.DataFrame or StationHrDes.")
        self.stations_info.append(station)

    def df_2_StationHrDes(self, df_station: pd.DataFrame, sno: Optional[int] = None) -> StationHrDes:
        """Wrap a station DataFrame in a StationHrDes."""
        return StationHrDes(df_station, sno)

    def update_LR_data(self) -> None:
        """Rebuild LR_data from the lr_des table of every stored station."""
        tables = [station.lr_des for station in self.stations_info]
        self.LR_data = pd.concat(tables).reset_index()


def weekday_classification(LR_data: pd.DataFrame, weekday: str, n_cluster=5, return_model=False, random_state=None):
    """Cluster stations by their regression slope and intersect on one weekday.

    The two features are min-max scaled to [0, 1] and fed to KMeans.

    Args:
        LR_data: table with columns 'slope', 'intersect', 'weekday', 'sno'.
        weekday: weekday label whose rows are clustered.
        n_cluster: number of KMeans clusters.
        return_model: when true, also return the fitted KMeans model.
        random_state: forwarded to KMeans; pass an int for reproducible labels.

    Returns:
        Series of cluster labels indexed by 'sno', or ``(labels, model)``
        when ``return_model`` is true.

    Raises:
        KeyError: when a required column is missing.
        ValueError: when LR_data has no rows for ``weekday``.
    """
    required = ('slope', 'intersect', 'weekday', 'sno')
    missing = [col for col in required if col not in LR_data.columns]
    if missing:
        raise KeyError(f"'slope', 'intersect', 'weekday', 'sno' should be in LR_data; missing: {missing}")

    sub = LR_data[LR_data['weekday'] == weekday]
    if sub.empty:
        raise ValueError(f"no rows in LR_data for weekday {weekday!r}")

    features = sub[['slope', 'intersect']]
    lowest = features.min()
    span = features.max() - lowest
    # A constant feature would give 0/0 = NaN; divide by 1 so it scales to 0.
    span = span.where(span != 0, 1)
    sub_std = (features - lowest) / span

    m = cluster.KMeans(n_clusters=n_cluster, random_state=random_state)
    labels = m.fit_predict(sub_std.values)
    ans: pd.Series = pd.Series(labels, index=sub['sno'])
    if return_model:
        return ans, m
    return ans

In [52]:
# Build one StationHrDes per station number, with a progress bar.
stations = StationsLrHolder()
station_groups = data_rentCnt.groupby('sno')
n_stations = data_rentCnt['sno'].nunique()
for sno, station_df in tqdm.tqdm(station_groups, total=n_stations):
    stations.add_one_station(station_df, sno=sno)
# Stack every station's regression table into stations.LR_data.
stations.update_LR_data()
stations.LR_data.query(" weekday=='Monday' ")

100%|██████████| 1418/1418 [00:26<00:00, 52.83it/s]


In [None]:
# Display the module-level weekday label list.
WEEKDAYS

In [65]:
# Cluster the Monday rows of LR_data with the default n_cluster=5;
# the result is a Series of cluster labels indexed by sno.
weekday_classification(stations.LR_data, weekday='Monday')


sno
1            4
2            2
4            0
5            2
6            3
            ..
500119086    0
500119087    0
500119088    0
500119089    0
500119090    3
Length: 1410, dtype: int32

In [66]:
# Display the combined regression table built by update_LR_data().
stations.LR_data

Unnamed: 0,weekday,slope,intersect,sno
0,Monday,2.626304,1.985000,1
1,Tuesday,2.769565,1.358333,1
2,Wednesday,2.692174,2.248333,1
3,Thursday,2.520870,2.801667,1
4,Friday,2.768913,2.553333,1
...,...,...,...,...
9894,Wednesday,0.546522,-0.160000,500119090
9895,Thursday,0.489348,-0.148333,500119090
9896,Friday,0.425217,0.026667,500119090
9897,Saturday,0.293478,-0.208333,500119090
