In [1]:
# Classification model code
import pandas as pd
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
#from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the combined ridership / weather table that the rest of the notebook works on.
df = pd.read_parquet(path='model_data_1.parquet')

In [3]:

df.head()


Unnamed: 0,locationid,ridership,time,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°)
0,2.0,1.0,2022-02-06 15:00:00,-5.6,0.0,0.0,11.0,0.0,0.0,35.0,6.6,331.0
1,2.0,1.0,2022-02-16 23:00:00,5.4,0.0,0.0,29.0,0.0,0.0,97.0,12.1,280.0
2,2.0,1.0,2022-02-23 08:00:00,13.5,0.0,0.0,44.0,8.0,11.0,100.0,16.299999,227.0
3,2.0,4.0,2022-02-23 19:00:00,17.5,0.0,0.0,22.0,5.0,2.0,54.0,21.6,184.0
4,2.0,1.0,2022-03-22 10:00:00,7.4,0.0,0.0,20.0,0.0,30.0,8.0,10.8,127.0


In [4]:

# locationid and ridership are loaded as floats (2.0, 1.0, ...); cast both to
# plain integers so the trailing decimal place disappears.
categorical_columns = df[['locationid']].columns
integer_columns = df[['ridership']].columns

whole_number_columns = [*categorical_columns, *integer_columns]
df = df.astype({name: 'int' for name in whole_number_columns})
    

In [5]:

# locationid is an identifier rather than a quantity, so store it as a pandas categorical.
categorical_columns = df[['locationid']].columns
df = df.astype({name: 'category' for name in categorical_columns})
    

In [6]:

# Check the data types in each column as assigned by default.
df.dtypes


locationid                     category
ridership                         int64
time                     datetime64[ns]
temperature_2m (°C)             float32
precipitation (mm)              float32
rain (mm)                       float32
cloudcover (%)                  float32
cloudcover_low (%)              float32
cloudcover_mid (%)              float32
cloudcover_high (%)             float32
windspeed_10m (km/h)            float32
winddirection_10m (°)           float32
dtype: object

In [7]:

# Check just how many rows:
df.shape


(2143060, 12)

In [8]:

# Discard the weather measurements that are not carried forward; temperature,
# rain and total cloud cover stay alongside location, ridership and time.
columns_to_discard = [
    'precipitation (mm)',
    'cloudcover_low (%)',
    'cloudcover_mid (%)',
    'cloudcover_high (%)',
    'windspeed_10m (km/h)',
    'winddirection_10m (°)',
]
df = df.drop(columns=columns_to_discard)

df.head()


Unnamed: 0,locationid,ridership,time,temperature_2m (°C),rain (mm),cloudcover (%)
0,2,1,2022-02-06 15:00:00,-5.6,0.0,11.0
1,2,1,2022-02-16 23:00:00,5.4,0.0,29.0
2,2,1,2022-02-23 08:00:00,13.5,0.0,44.0
3,2,4,2022-02-23 19:00:00,17.5,0.0,22.0
4,2,1,2022-03-22 10:00:00,7.4,0.0,20.0


In [9]:

df.isnull().sum()


locationid                  0
ridership                   0
time                   634050
temperature_2m (°C)    634050
rain (mm)              634050
cloudcover (%)         634050
dtype: int64

In [10]:

# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')


In [11]:

df.isnull().sum()


locationid             0
ridership              0
time                   0
temperature_2m (°C)    0
rain (mm)              0
cloudcover (%)         0
dtype: int64

In [12]:

df.shape


(1509010, 6)

In [13]:

# Keep night-time observations only: hours 18-23 and 0-5
# (6pm up to, but not including, 6am).
hour_of_day = df['time'].dt.hour
is_night = (hour_of_day >= 18) | (hour_of_day < 6)
df = df[is_night]


In [14]:

df.shape


(755673, 6)

In [15]:
df.head()

Unnamed: 0,locationid,ridership,time,temperature_2m (°C),rain (mm),cloudcover (%)
1,2,1,2022-02-16 23:00:00,5.4,0.0,29.0
3,2,4,2022-02-23 19:00:00,17.5,0.0,22.0
5,2,1,2022-03-28 23:00:00,-2.0,0.0,5.0
12,2,1,2022-05-27 01:00:00,19.1,0.0,100.0
14,2,1,2022-05-29 21:00:00,24.6,0.0,100.0


In [16]:

# Checkpoint the night-time subset to disk.
df.to_parquet(path='night_data.parquet')


In [17]:

# Continue from the saved night-time checkpoint.
df = pd.read_parquet(path='night_data.parquet')


In [18]:

df.dtypes


locationid                      int64
ridership                       int64
time                   datetime64[ns]
temperature_2m (°C)           float32
rain (mm)                     float32
cloudcover (%)                float32
dtype: object

In [19]:

# Report the ridership extremes; the maximum is kept in `max_ridership`
# because the busyness bins use it as their top edge.
min_ridership_seen = df['ridership'].min()
max_ridership = df['ridership'].max()

print("Minimum ridership: ", min_ridership_seen)
print("Maximum ridership: ", max_ridership)


Minimum ridership:  0
Maximum ridership:  45738


In [20]:

# Build the busyness bands and count how many rows fall into each one.
# The bands come from ridership quantiles so the classes are more balanced
# (an earlier version used fixed, hand-picked ranges instead).

# Band edges: the 0/20/40/60/80/99th percentiles, followed by the maximum
# ridership. The maximum is an outlier, so it is appended as the last edge to
# keep it from being the only value in 'Extremely Busy'.
# int() truncates each edge to a whole number.
quantile_levels = [0, 0.2, 0.4, 0.6, 0.8, 0.99]
quantiles = [
    int(edge)
    for edge in [*df['ridership'].quantile(quantile_levels).values, max_ridership]
]

# Each band runs from its own edge up to one below the next edge, so adjacent
# bands never share a value; only the last band includes its upper edge.
band_names = ['Quiet', 'Not Too Busy', 'A Little Busy', 'Busy', 'Very Busy', 'Extremely Busy']
final_band = len(band_names) - 1
ranges = {}
for position, band in enumerate(band_names):
    band_lower = quantiles[position]
    band_upper = quantiles[position + 1]
    if position != final_band:
        band_upper -= 1
    ranges[band] = (band_lower, band_upper)

# Row count per band (both bounds inclusive)
range_counts = {
    band: df['ridership'].between(band_lower, band_upper).sum()
    for band, (band_lower, band_upper) in ranges.items()
}

print("Number of entries in each ridership range:")
for band, rows_in_band in range_counts.items():
    print(f"{band}: {rows_in_band}")

print("\n")
print("Ranges used for each category:")
for band, (band_lower, band_upper) in ranges.items():
    print(f"{band}: {band_lower} - {band_upper}")
    

Number of entries in each ridership range:
Quiet: 147924
Not Too Busy: 152281
A Little Busy: 153020
Busy: 151146
Very Busy: 143743
Extremely Busy: 7559


Ranges used for each category:
Quiet: 0 - 9
Not Too Busy: 10 - 51
A Little Busy: 52 - 153
Busy: 154 - 517
Very Busy: 518 - 8843
Extremely Busy: 8844 - 45738


In [21]:

# Assign busyness label based on ridership value
def assign_label(ridership):
    """Return the busyness label whose band contains ``ridership``.

    The bands are fixed, inclusive on both ends, and checked from the lowest
    to the highest.

    Parameters
    ----------
    ridership : number
        Ridership count to classify.

    Returns
    -------
    str
        One of 'Quiet', 'Not Too Busy', 'A Little Busy', 'Busy', 'Very Busy'
        or 'Extremely Busy'.

    Raises
    ------
    ValueError
        If ``ridership`` does not fall inside any band (e.g. negative or
        above 45738).
    """
    bands = (
        (0, 9, 'Quiet'),
        (10, 51, 'Not Too Busy'),
        (52, 153, 'A Little Busy'),
        (154, 517, 'Busy'),
        (518, 8843, 'Very Busy'),
        (8844, 45738, 'Extremely Busy'),
    )
    for lower, upper, label in bands:
        if lower <= ridership <= upper:
            return label
    raise ValueError(f"Ridership value {ridership} cannot be assigned a category.")

# Label every row with its busyness band. A ridership value outside all bands
# raises ValueError in assign_label; that is reported here rather than
# stopping the cell.
try:
    df['busyness'] = df['ridership'].apply(assign_label)
except ValueError as label_error:
    print(f"Error: {str(label_error)}")
else:
    print("Label assignment successful.")

# Show the last 30 rows of the frame
print(df.tail(30))


Label assignment successful.
         locationid  ridership                time  temperature_2m (°C)  \
2139383         263       1535 2022-12-29 18:00:00                  3.4   
2139384         263       1028 2022-12-29 19:00:00                  3.4   
2139385         263        833 2022-12-29 20:00:00                  3.5   
2139386         263        690 2022-12-29 21:00:00                  3.2   
2139387         263        603 2022-12-29 22:00:00                  1.6   
2139388         263        420 2022-12-29 23:00:00                  1.2   
2139389         263        306 2022-12-30 00:00:00                  0.8   
2139390         263        144 2022-12-30 01:00:00                  0.2   
2139391         263        124 2022-12-30 02:00:00                 -0.2   
2139392         263         60 2022-12-30 03:00:00                 -1.1   
2139393         263         71 2022-12-30 04:00:00                 -1.5   
2139394         263         53 2022-12-30 05:00:00                 -1.8

In [22]:

# Checkpoint the labelled night-time data to disk.
df.to_parquet(path='night_data1.parquet')


In [23]:

# Reload the labelled checkpoint and preview the first ten rows.
df = pd.read_parquet(path='night_data1.parquet')

df.head(n=10)


Unnamed: 0,locationid,ridership,time,temperature_2m (°C),rain (mm),cloudcover (%),busyness
1,2,1,2022-02-16 23:00:00,5.4,0.0,29.0,Quiet
3,2,4,2022-02-23 19:00:00,17.5,0.0,22.0,Quiet
5,2,1,2022-03-28 23:00:00,-2.0,0.0,5.0,Quiet
12,2,1,2022-05-27 01:00:00,19.1,0.0,100.0,Quiet
14,2,1,2022-05-29 21:00:00,24.6,0.0,100.0,Quiet
17,2,1,2022-06-13 19:00:00,26.799999,0.1,28.0,Quiet
18,2,1,2022-06-16 22:00:00,21.6,0.0,99.0,Quiet
19,2,1,2022-06-18 23:00:00,17.700001,0.0,56.0,Quiet
22,2,1,2022-08-16 00:00:00,23.6,0.0,15.0,Quiet
23,2,1,2022-08-17 23:00:00,25.299999,0.0,49.0,Quiet


In [24]:
df.shape

(755673, 7)

In [25]:

df.dtypes


locationid                      int64
ridership                       int64
time                   datetime64[ns]
temperature_2m (°C)           float32
rain (mm)                     float32
cloudcover (%)                float32
busyness                       object
dtype: object

In [26]:

# Store the location id and the busyness label as pandas categoricals.
categorical_columns = df[['locationid', 'busyness']].columns
df = df.astype({name: 'category' for name in categorical_columns})
    

In [27]:

df.dtypes



locationid                   category
ridership                       int64
time                   datetime64[ns]
temperature_2m (°C)           float32
rain (mm)                     float32
cloudcover (%)                float32
busyness                     category
dtype: object

# Models

- First, extract the features needed from the datetime column, i.e. hour, day, month and weekday.
- Then encode busyness column as numeric

In [28]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [29]:

# Break the timestamp into calendar parts that can be used as model features.
timestamp_parts = df['time'].dt
df = df.assign(
    hour=timestamp_parts.hour,
    day=timestamp_parts.day,          # day of the month
    month=timestamp_parts.month,
    weekday=timestamp_parts.weekday,  # Monday=0 ... Sunday=6
)


In [30]:

df.head()


Unnamed: 0,locationid,ridership,time,temperature_2m (°C),rain (mm),cloudcover (%),busyness,hour,day,month,weekday
1,2,1,2022-02-16 23:00:00,5.4,0.0,29.0,Quiet,23,16,2,2
3,2,4,2022-02-23 19:00:00,17.5,0.0,22.0,Quiet,19,23,2,2
5,2,1,2022-03-28 23:00:00,-2.0,0.0,5.0,Quiet,23,28,3,0
12,2,1,2022-05-27 01:00:00,19.1,0.0,100.0,Quiet,1,27,5,4
14,2,1,2022-05-29 21:00:00,24.6,0.0,100.0,Quiet,21,29,5,6


In [31]:

# Remove ridership (the busyness label is derived from it) together with the
# weather readings, so none of them remains in the frame used for modelling.
non_feature_columns = ['ridership', 'temperature_2m (°C)', 'rain (mm)', 'cloudcover (%)']
df = df.drop(columns=non_feature_columns)


In [32]:

# The raw timestamp is no longer needed once hour/day/month/weekday exist.
df = df.drop(columns='time')


In [33]:

# Replace the busyness text labels with integer codes.
# LabelEncoder numbers the classes in sorted (alphabetical) order, not in order
# of increasing busyness; `le.classes_` holds the code -> label lookup.
le = LabelEncoder()
le.fit(df['busyness'])
df['busyness'] = le.transform(df['busyness'])


In [34]:

# Target: the encoded busyness class
y = df.loc[:, 'busyness']
y.head()


1     4
3     4
5     4
12    4
14    4
Name: busyness, dtype: int64

In [35]:

# Features: every remaining column except the target
feature_columns = [name for name in df.columns if name != 'busyness']
X = df[feature_columns]
X.head()


Unnamed: 0,locationid,hour,day,month,weekday
1,2,23,16,2,2
3,2,19,23,2,2
5,2,23,28,3,0
12,2,1,27,5,4
14,2,21,29,5,6


In [36]:

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.3, 'random_state': 2}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [37]:

# Fit a random forest with default hyperparameters (seeded for repeatability).
model = RandomForestClassifier(random_state=2)
model.fit(X=X_train, y=y_train)


In [38]:

# Predicted class codes for the held-out rows
y_pred = model.predict(X=X_test)


In [39]:

from sklearn.metrics import accuracy_score, classification_report

# Share of held-out rows whose busyness class was predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall and F1; classes are listed by their integer codes
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


Accuracy: 0.5843089165512435
              precision    recall  f1-score   support

           0       0.52      0.52      0.52     45994
           1       0.57      0.57      0.57     45412
           2       0.69      0.59      0.63      2251
           3       0.56      0.57      0.57     45578
           4       0.60      0.57      0.58     44309
           5       0.66      0.70      0.68     43158

    accuracy                           0.58    226702
   macro avg       0.60      0.59      0.59    226702
weighted avg       0.58      0.58      0.58    226702



In [40]:

from datetime import datetime, timedelta
import pytz


In [41]:

# Time zone used to obtain the current local time in New York
NEW_YORK_ZONE_NAME = 'America/New_York'
ny_tz = pytz.timezone(NEW_YORK_ZONE_NAME)


In [42]:

# Lookup from the integer class codes produced by the LabelEncoder back to the
# busyness labels.
#
# LabelEncoder assigns codes in sorted (alphabetical) order of the class names,
# NOT in order of increasing busyness. The encoded target confirms this: rows
# labelled 'Quiet' are encoded as 4, and the rare 'Extremely Busy' class is 2.
# The label text also has to match the names used when the 'busyness' column
# was created ('Not Too Busy', not 'Not Busy').
# Keep this identical to dict(enumerate(le.classes_)).
label_mapping = {
    0: 'A Little Busy',
    1: 'Busy',
    2: 'Extremely Busy',
    3: 'Not Too Busy',
    4: 'Quiet',
    5: 'Very Busy'
}

In [43]:

def predict_busyness(locationid, model):
    """Predict the busyness label for a location one hour from now (New York time).

    A single-row feature frame is built from the target moment and handed to
    ``model.predict``; the resulting class code is translated with the
    module-level ``label_mapping``.

    Parameters
    ----------
    locationid
        Value placed in the 'locationid' feature.
    model
        Fitted estimator exposing ``predict``. It receives a one-row DataFrame
        with the columns locationid, hour, day, month, weekday (in that order).

    Returns
    -------
    The ``label_mapping`` entry for the predicted class code.

    Raises
    ------
    KeyError
        If the predicted code is not a key of ``label_mapping``.

    Notes
    -----
    NOTE(review): the training data in this notebook is filtered to hours
    18-05, so a target hour outside that window was never seen in training.
    """
    # Target moment: the current New York time plus one hour
    target_time = datetime.now(ny_tz) + timedelta(hours=1)

    # One row holding the same feature columns the model was trained on
    feature_row = {
        'locationid': locationid,
        'hour': target_time.hour,
        'day': target_time.day,
        'month': target_time.month,
        'weekday': target_time.weekday(),
    }
    input_data = pd.DataFrame([feature_row])

    # Every column except 'hour' is cast to a categorical
    input_data = input_data.astype(
        {name: 'category' for name in ('locationid', 'day', 'month', 'weekday')}
    )

    # Translate the numeric class code into its busyness label
    predicted_code = model.predict(input_data)[0]
    return label_mapping[predicted_code]



In [44]:

# Example: forecast for a single location
location_id = 6

predicted_busyness = predict_busyness(locationid=location_id, model=model)
print(f"Predicted busyness at location {location_id} 1 hour from now in New York: {predicted_busyness}")


Predicted busyness at location 6 1 hour from now in New York: Very Busy


In [48]:

# Import pickle module
import pickle

# Persist the trained model so it can be loaded outside this notebook
model_path = 'RF_m1.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model, model_file)