In [146]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# read all csv files in the folder
folder_path = 'trip_production'
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

df_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)

time_lag = 5

In [140]:
# change Unnamed:0 to node_id

combined_df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)

# reindex
combined_df['id'] = combined_df.index

combined_df.head()

Unnamed: 0,id,admin1Pcod_1,admin1Name_1,local_date_1,trip_num,tour_num,id_num,tour_traj_distance,trip_traj_distance,time_range,...,trips/person,date_1,0_14_pct,15_64_pct,65p_pct,M_pct,Population,rwi,rwi_pctl,new cases
0,0,NG009,Cross River,2020-12-05,278,134,98,4666843.0,1595685.0,5104739,...,2.836735,20201205,0.383198,0.583476,0.033325,0.509235,3958265.0,-0.187778,0.486486,0.0
1,1,NG025,Lagos,2020-12-05,27973,13760,8621,456823800.0,123158100.0,396240370,...,3.244751,20201205,0.330201,0.644711,0.025088,0.521204,14879754.0,1.118715,1.0,86.0
2,2,NG033,Rivers,2020-12-05,1841,931,697,37825140.0,12748990.0,37763147,...,2.64132,20201205,0.360718,0.606499,0.032783,0.515567,7840763.0,0.161458,0.864865,19.0
3,3,NG007,Benue,2020-12-05,202,96,74,4650858.0,1804633.0,3927735,...,2.72973,20201205,0.461757,0.507574,0.030668,0.504664,5814928.0,-0.361182,0.243243,5.0
4,4,NG014,Enugu,2020-12-05,990,454,320,32312910.0,5705347.0,15872092,...,3.09375,20201205,0.354833,0.597963,0.047204,0.48607,4415727.0,0.112638,0.783784,0.0


In [141]:
# change local_date_1 to datetime
combined_df['local_date_1'] = pd.to_datetime(combined_df['local_date_1'])


In [142]:
state_list =[]
for state, group_df in combined_df.groupby('admin1Name_1'):
    # Sort by date
    group_df = group_df.sort_values('local_date_1').copy()

    # Step 1: Clean and fill 'trip_num','trips/person', 'trip_miles/person'
    group_df['trip_num'] = group_df['trip_num'].replace([np.inf, -np.inf, ''], np.nan)
    group_df['trip_num'] = group_df['trip_num'].ffill()  # forward fill
    group_df['trips/person'] = group_df['trips/person'].replace([np.inf, -np.inf, ''], np.nan)
    group_df['trips/person'] = group_df['trips/person'].ffill()  # forward fill
    group_df['trip_miles/person'] = group_df['trip_miles/person'].replace([np.inf, -np.inf, ''], np.nan)
    group_df['trip_miles/person'] = group_df['trip_miles/person'].ffill()  # forward fill


    # Step 2: Create 'trip_num_one_day_before', 'trips_miles/person_one_day_before
    group_df['trip_num_one_day_before'] = group_df['trip_num'].shift(1)
    group_df['trips/person_one_day_before'] = group_df['trips/person'].shift(1)
    group_df['trip_miles/person_one_day_before'] = group_df['trip_miles/person'].shift(1)

    # Step 3: # Replace inf and NaN in the new column with current day's trip_num
    group_df['trip_num_one_day_before'] = (
        group_df['trip_num_one_day_before']
        .replace([float('inf'), float('-inf')], pd.NA)
        .fillna(group_df['trip_num'])
    )
    group_df['trips/person_one_day_before'] = (
        group_df['trips/person_one_day_before']
        .replace([float('inf'), float('-inf')], pd.NA)
        .fillna(group_df['trips/person'])
    )
    group_df['trip_miles/person_one_day_before'] = (
        group_df['trip_miles/person_one_day_before']
        .replace([float('inf'), float('-inf')], pd.NA)
        .fillna(group_df['trip_miles/person'])
    )

    # Step 4: Clean and fill 'new_cases'
    group_df['new cases'] = group_df['new cases'].replace([np.inf, -np.inf, ''], np.nan)
    group_df['new cases'] = group_df['new cases'].fillna(0)

    # Step 5: add new cases 5 days ago
    group_df['new cases days ago'] = group_df['new cases'].shift(time_lag)

    # Step 6: Replace inf and NaN in the new column with current day's covid cases
    group_df['new cases days ago'] = (
        group_df['new cases days ago']
        .replace([float('inf'), float('-inf')], pd.NA)
        .fillna(group_df['new cases'])
    )
    # Step 7: Put each state data in the list
    state_list.append(group_df)
    print(f"State {state}", "has been processed...")

    # Plotting the curves
    # plt.figure(figsize=(10, 4))
    # plt.plot(group_df['local_date_1'], group_df['trips/person'], label='trips/person')
    # plt.plot(group_df['local_date_1'], group_df['trips/person_one_day_before'], label='trips/person_one_day_before', linestyle='--')
    # plt.title(f"{state}: Trips/Person and One Day Before")
    # plt.xlabel("Date")
    # plt.ylabel("Trips per Person")
    # plt.legend()
    # plt.tight_layout()
    # plt.show()
new_combined_df = pd.concat(state_list, ignore_index=True)


State Abia has been processed...
State Adamawa has been processed...
State Akwa Ibom has been processed...
State Anambra has been processed...
State Bauchi has been processed...
State Bayelsa has been processed...
State Benue has been processed...
State Borno has been processed...
State Cross River has been processed...
State Delta has been processed...
State Ebonyi has been processed...
State Edo has been processed...
State Ekiti has been processed...
State Enugu has been processed...
State Federal Capital Territory has been processed...
State Gombe has been processed...
State Imo has been processed...
State Jigawa has been processed...
State Kaduna has been processed...
State Kano has been processed...
State Katsina has been processed...
State Kebbi has been processed...
State Kogi has been processed...
State Kwara has been processed...
State Lagos has been processed...
State Nasarawa has been processed...
State Niger has been processed...
State Ogun has been processed...
State Ondo 

In [143]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
reg_df = new_combined_df.copy()

reg_df = reg_df.dropna(subset=['trip_num', 'trip_num_one_day_before',
                               'new cases','new cases days ago',
                               'trips/person', 'trips/person_one_day_before',
                               'trip_miles/person', 'trip_miles/person_one_day_before',
                               'Population','0_14_pct', '15_64_pct', '65p_pct',
                               'rwi','rwi_pctl'])

# Define independent and dependent variables
X = reg_df[['trip_num_one_day_before','new cases days ago']]
y = reg_df['trip_num']

# scaler = StandardScaler()
# X = scaler.fit_transform(X_raw)
y = reg_df['trip_num']

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())


                                 OLS Regression Results                                
Dep. Variable:               trip_num   R-squared (uncentered):                   0.976
Model:                            OLS   Adj. R-squared (uncentered):              0.976
Method:                 Least Squares   F-statistic:                          2.699e+05
Date:                Sat, 31 May 2025   Prob (F-statistic):                        0.00
Time:                        22:50:27   Log-Likelihood:                     -1.1525e+05
No. Observations:               13542   AIC:                                  2.305e+05
Df Residuals:                   13540   BIC:                                  2.305e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

In [144]:

# Define independent and dependent variables
X = reg_df[['trips/person_one_day_before','new cases days ago']]
y = reg_df['trips/person']

# scaler = StandardScaler()
# X = scaler.fit_transform(X_raw)
y = reg_df['trips/person']

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:           trips/person   R-squared (uncentered):                   0.983
Model:                            OLS   Adj. R-squared (uncentered):              0.983
Method:                 Least Squares   F-statistic:                          3.982e+05
Date:                Sat, 31 May 2025   Prob (F-statistic):                        0.00
Time:                        22:50:30   Log-Likelihood:                         -6495.6
No. Observations:               13542   AIC:                                  1.300e+04
Df Residuals:                   13540   BIC:                                  1.301e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------

In [145]:

# Define independent and dependent variables
X = reg_df[['trip_miles/person_one_day_before','new cases days ago']]
y = reg_df['trip_miles/person']

# scaler = StandardScaler()
# X = scaler.fit_transform(X_raw)
y = reg_df['trip_miles/person']

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:      trip_miles/person   R-squared (uncentered):                   0.914
Model:                            OLS   Adj. R-squared (uncentered):              0.914
Method:                 Least Squares   F-statistic:                          7.231e+04
Date:                Sat, 31 May 2025   Prob (F-statistic):                        0.00
Time:                        23:01:15   Log-Likelihood:                         -59695.
No. Observations:               13542   AIC:                                  1.194e+05
Df Residuals:                   13540   BIC:                                  1.194e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                                       coef    std err          t      P>|t|      [0.025      0.975]
-------------------