In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics


# SPLIT THE DATA TO GET 4 PARTS OF CSV

In [4]:
dtype={'RatecodeID': 'float64',
       'VendorID': 'float64',
       'passenger_count': 'float64',
       'payment_type': 'float64',
       'tolls_amount': 'float64',
       'total_amount': 'object',
       'fare_amount': 'object',
       'tip_amount': 'object',
       'trip_distance': 'object',
       'extra': 'object',
       'mta_tax': 'object'
      }

df = dd.read_csv('2020_Yellow_Taxi_Trip_Data.csv', low_memory=False, dtype=dtype)

# Parse 'tpep_pickup_datetime' into a datetime object
df['tpep_pickup_datetime'] = dd.to_datetime(df['tpep_pickup_datetime'], format="%m/%d/%Y %I:%M:%S %p")
df['tpep_dropoff_datetime'] = dd.to_datetime(df['tpep_dropoff_datetime'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

# Create a new column for the quarter based on 'tpep_pickup_datetime'
df['quarter'] = df['tpep_pickup_datetime'].dt.quarter

# Define the quarters
quarters = [1, 2, 3, 4]

# Loop over each quarter
for i, quarter in enumerate(quarters, 1):
    # Filter the dataframe by the quarter
    df_quarter = df[df['quarter'] == quarter]
    
    # Compute the quarter dataframe and write it to a CSV file
    df_quarter.compute().to_csv(f'yellow_taxi_data_Q{i}.csv', index=False)

FileNotFoundError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: [WinError 2] The system cannot find the file specified: 'C:/Users/tfiro/Desktop/Computer Science/smart_city_explorer/data_models/Zone_busyness_model/2020_Yellow_Taxi_Trip_Data.csv'

# Data Cleaning
## RUN FOR EACH SPLIT OF DATA

In [73]:
df = pd.read_csv('yellow_taxi_data_Q4.csv')

  df = pd.read_csv('yellow_taxi_data_Q4.csv')


In [74]:
num_rows, num_columns = df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 4652240
Number of columns: 19


### Filter data of Year 2020

In [75]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

# Filter the rows where the year is 2020
df = df[df['tpep_pickup_datetime'].dt.year == 2020]

df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Filter the rows where the year is 2020
df = df[df['tpep_dropoff_datetime'].dt.year == 2020]

### Only keep values for zone in Manhattan

In [76]:
df_zone_data = pd.read_csv('manhattan_zones.csv')
df_zone_data.head()

df_valid_zone = df_zone_data['LocationID'].unique()


In [77]:
df['PULocationID'] = df['PULocationID'].astype(int)
df['DOLocationID'] = df['DOLocationID'].astype(int)

# Filter the rows where "PULocationID" or "DOLocationID" is not in the zone_array
df = df[df['PULocationID'].isin(df_valid_zone) & df['DOLocationID'].isin(df_valid_zone)]

empty_pu_location_ids = df['PULocationID'].isnull()
# Get rows with empty PULocationID values
rows_with_empty_pu_location = df[empty_pu_location_ids]
rows_with_empty_pu_location

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,quarter


### Look for Duplicates

In [78]:
# look for duplicates
print('Number of duplicate (excluding first) rows in the table is: ', df.duplicated().sum())

# use "keep=False" to mark all duplicates as true, including the original rows that were duplicated
print('Number of duplicate rows (including first) in the table is:', df[df.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


In [79]:
df = df.drop_duplicates()

In [80]:
df.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                    object
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int32
DOLocationID                      int32
payment_type                    float64
fare_amount                      object
extra                           float64
mta_tax                         float64
tip_amount                       object
tolls_amount                    float64
improvement_surcharge           float64
total_amount                     object
congestion_surcharge            float64
quarter                           int64
dtype: object

### Check if Pickup time is after Drop off time and filter out such data

In [81]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Filter rows where pickup time is before or equal to drop time
df = df[df['tpep_pickup_datetime'] <= df['tpep_dropoff_datetime']]

### Filter Passenger grester than 0

In [82]:
df = df.dropna(subset=['passenger_count'])

# Drop rows with 0 in the 'passengers' column
df = df[df['passenger_count'] > 0]

### Filter Trip distance greater than 0

In [83]:
df = df.dropna(subset=['trip_distance'])

df = df[df['trip_distance'] !=0]

### Convert All negative values to positive values and convert string to float

In [84]:
columns_to_process = ['trip_distance', 'fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge']

for col in columns_to_process:
    # Check if the value is in string format
    is_string = df[col].apply(lambda x: isinstance(x, str))

    # Convert string values to float and take the absolute value
    df[col] = df[col].where(~is_string, other=pd.to_numeric(df[col], errors='coerce')).abs()

In [85]:
df.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                    object
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int32
DOLocationID                      int32
payment_type                    float64
fare_amount                      object
extra                           float64
mta_tax                         float64
tip_amount                       object
tolls_amount                    float64
improvement_surcharge           float64
total_amount                     object
congestion_surcharge            float64
quarter                           int64
dtype: object

In [86]:
df['total_amount'] = df['total_amount'].astype('float64')
df['tip_amount'] = df['tip_amount'].astype('float64')
df['mta_tax'] = df['mta_tax'].astype('float64')
df['extra'] = df['extra'].astype('float64')
df['fare_amount'] = df['fare_amount'].astype('float64')
df['trip_distance'] = df['trip_distance'].astype('float64')

In [87]:
df.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int32
DOLocationID                      int32
payment_type                    float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
quarter                           int64
dtype: object

In [88]:
num_rows, num_columns = df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 3810504
Number of columns: 19


In [89]:
df.to_csv('yellow_taxi_data_Q4_Cleaned.csv', index=False)

# Data Processing

In [90]:
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])


df['drop_time'] = df['tpep_dropoff_datetime'].dt.strftime('%Y-%m-%d %H')

df['pick_time'] = df['tpep_pickup_datetime'].dt.strftime('%Y-%m-%d %H')


In [91]:
df['drop_time'] = pd.to_datetime(df['drop_time'])

df['hour'] = df['drop_time'].dt.hour

# Create the 'day_of_week' column with the day of the week (Monday is 0, Tuesday is 1, etc.)
df['day_of_week'] = df['drop_time'].dt.dayofweek

df['drop_time'] = df['drop_time'].dt.strftime('%Y-%m-%d %H')

In [92]:
# Group the data by 'DOLocationID', 'dropoff_hour', and 'dropoff_day_of_week'
# Count the number of drop-offs for each group
popular_locations_drop = df.groupby(['DOLocationID', 'hour', 'day_of_week', 'drop_time']).size().reset_index(name='dropoff_count')

# Display the DataFrame sorted by 'dropoff_count' in descending order
popular_locations_drop = popular_locations_drop.sort_values(by='dropoff_count', ascending=False)

popular_locations_drop.head(50)

Unnamed: 0,DOLocationID,hour,day_of_week,drop_time,dropoff_count
103468,236,14,2,2020-11-11 14,401
103561,236,15,2,2020-11-18 15,393
103575,236,15,3,2020-11-19 15,377
103552,236,15,1,2020-12-15 15,371
103456,236,14,1,2020-11-17 14,359
103547,236,15,1,2020-11-10 15,358
103574,236,15,3,2020-11-12 15,356
103571,236,15,3,2020-10-22 15,349
103572,236,15,3,2020-10-29 15,349
103551,236,15,1,2020-12-08 15,347


In [93]:
# Group the data by 'PULocationID', 'pickup_hour', and 'pickup_day_of_week'
# Count the number of pickups for each group
popular_locations_pick = df.groupby(['PULocationID', 'hour', 'day_of_week', 'pick_time']).size().reset_index(name='pickup_count')

# Display the DataFrame sorted by 'pickup_count' in descending order
popular_locations_pick = popular_locations_pick.sort_values(by='pickup_count', ascending=False)

popular_locations_pick.head(50)



Unnamed: 0,PULocationID,hour,day_of_week,pick_time,pickup_count
174663,237,14,2,2020-11-11 14,332
175070,237,16,1,2020-11-17 16,316
175085,237,16,2,2020-10-07 16,311
170487,236,14,2,2020-11-11 14,310
170744,236,15,3,2020-11-19 15,306
174640,237,14,1,2020-12-08 14,301
170896,236,16,1,2020-11-17 16,300
170688,236,15,1,2020-12-15 15,299
174642,237,14,1,2020-12-15 14,298
170710,236,15,2,2020-11-18 15,297


In [94]:
# Assuming you have two DataFrames named 'pickup_table' and 'dropoff_table'

# Perform a merge (join) on the common columns
merged_df = pd.merge(popular_locations_pick, popular_locations_drop, 
                     left_on=['PULocationID', 'hour', 'day_of_week', 'pick_time'], 
                     right_on=['DOLocationID', 'hour', 'day_of_week','drop_time'], 
                     suffixes=('_pickup', '_dropoff'))

# Add the 'pickup_count' and 'dropoff_count' columns to get the total count
merged_df['total_count'] = merged_df['pickup_count'] + merged_df['dropoff_count']

# Drop the duplicate columns (if needed)

# Rename the columns (if needed)
# merged_df.rename(columns={'PULocationID': 'LocationID', 'pickup_hour': 'hour', 'pickup_day_of_week': 'day_of_week'}, inplace=True)
merged_df = merged_df.sort_values(by='total_count', ascending=False)
# Display the final merged DataFrame
print(merged_df.head(20))

     PULocationID  hour  day_of_week      pick_time  pickup_count  \
3             236    14            2  2020-11-11 14           310   
9             236    15            2  2020-11-18 15           297   
4             236    15            3  2020-11-19 15           306   
7             236    15            1  2020-12-15 15           299   
0             237    14            2  2020-11-11 14           332   
10            236    15            3  2020-11-12 15           296   
14            236    15            1  2020-12-08 15           288   
22            236    15            3  2020-10-29 15           281   
19            236    15            2  2020-11-11 15           282   
13            236    15            3  2020-12-10 15           292   
21            236    16            2  2020-10-07 16           281   
8             237    14            1  2020-12-15 14           298   
61            236    14            1  2020-11-17 14           260   
6             236    16           

In [95]:
# Rename columns 'PULocationID' to 'LocationID' and 'pick_time' to 'time'
merged_df = merged_df.rename(columns={'PULocationID': 'LocationID', 'pick_time': 'time'})

# Drop columns 'DOLocationID' and 'drop_time'
merged_df = merged_df.drop(columns=['DOLocationID', 'drop_time'])


In [98]:
merged_df = merged_df.dropna(subset=['total_count'])

In [100]:
# Step 1: Calculate the maximum values for each 'LocationID' group
max_values = merged_df.groupby('LocationID')['total_count'].max()

# Step 2: Merge the maximum values back to the original DataFrame using 'LocationID' as the key
merged_df = merged_df.merge(max_values.rename('max_count'), left_on='LocationID', right_index=True)

# Step 3: Calculate the percentage relative to the maximum value for each zone and add it as a new column 'busyness'
merged_df['busyness'] = (merged_df['total_count'] / merged_df['max_count']) * 100

# Step 4: Sort the DataFrame if needed (optional)
merged_df = merged_df.sort_values(['LocationID', 'total_count'], ascending=[True, False])
merged_df = merged_df.drop(columns=['max_count_x', 'max_count_y'])
# Step 5: Print the resulting DataFrame
print(merged_df.head(50))

       LocationID  hour  day_of_week           time  pickup_count  \
68915           4    16            3  2020-12-24 16             8   
84376           4    18            5  2020-11-07 18             4   
95531           4    17            4  2020-10-30 17             2   
95009           4    19            4  2020-10-02 19             2   
95004           4    19            4  2020-10-30 19             2   
94909           4    18            1  2020-11-03 18             2   
95536           4    17            3  2020-12-24 17             2   
76475           4    15            2  2020-11-18 15             5   
89791           4    17            4  2020-12-04 17             3   
95539           4    17            3  2020-11-12 17             2   
64500           4     1            6  2020-11-01 01             9   
83391           4    19            5  2020-10-17 19             4   
89808           4    18            3  2020-11-19 18             3   
78675           4    19           

# Importing Weather Data

In [101]:
df_weather = pd.read_csv('weather2020.csv')
df_weather.head()

Unnamed: 0,time,temperature_2m (°C),apparent_temperature (°C),precipitation (mm),weathercode (wmo code),windspeed_10m (km/h)
0,2020-01-01T00:00,2.8,-0.5,0.0,2,9.2
1,2020-01-01T01:00,2.6,-0.9,0.0,3,9.7
2,2020-01-01T02:00,3.3,-0.4,0.0,3,11.4
3,2020-01-01T03:00,3.5,-0.5,0.0,2,12.3
4,2020-01-01T04:00,2.9,-1.5,0.0,1,13.4


In [102]:
# look for duplicates
print('Number of duplicate (excluding first) rows in the table is: ', df_weather.duplicated().sum())

Number of duplicate (excluding first) rows in the table is:  0


In [103]:
# Convert the 'tpep_dropoff_datetime' column to datetime format with a specific format
df_weather['time'] = pd.to_datetime(df_weather['time'], format='%Y-%m-%dT%H:%M')
# Extract the hour of the day and day of the week
df_weather['time'] = df_weather['time'].dt.strftime('%Y-%m-%d %H')

df_weather.head()

Unnamed: 0,time,temperature_2m (°C),apparent_temperature (°C),precipitation (mm),weathercode (wmo code),windspeed_10m (km/h)
0,2020-01-01 00,2.8,-0.5,0.0,2,9.2
1,2020-01-01 01,2.6,-0.9,0.0,3,9.7
2,2020-01-01 02,3.3,-0.4,0.0,3,11.4
3,2020-01-01 03,3.5,-0.5,0.0,2,12.3
4,2020-01-01 04,2.9,-1.5,0.0,1,13.4


In [104]:
df_manhattan_2020 = pd.merge(merged_df, df_weather, on='time', suffixes=('_busyness', '_weather'))

df_manhattan_2020.head(50)

Unnamed: 0,LocationID,hour,day_of_week,time,pickup_count,dropoff_count,total_count,busyness,max_count,temperature_2m (°C),apparent_temperature (°C),precipitation (mm),weathercode (wmo code),windspeed_10m (km/h)
0,4,16,3,2020-12-24 16,8,31,39,100.0,39,11.0,6.8,0.0,3,23.8
1,13,16,3,2020-12-24 16,8,19,27,47.368421,57,11.0,6.8,0.0,3,23.8
2,24,16,3,2020-12-24 16,18,36,54,93.103448,58,11.0,6.8,0.0,3,23.8
3,41,16,3,2020-12-24 16,32,53,85,62.5,136,11.0,6.8,0.0,3,23.8
4,42,16,3,2020-12-24 16,12,42,54,59.340659,91,11.0,6.8,0.0,3,23.8
5,43,16,3,2020-12-24 16,114,49,163,72.123894,226,11.0,6.8,0.0,3,23.8
6,45,16,3,2020-12-24 16,1,5,6,30.0,20,11.0,6.8,0.0,3,23.8
7,48,16,3,2020-12-24 16,81,114,195,84.051724,232,11.0,6.8,0.0,3,23.8
8,50,16,3,2020-12-24 16,22,53,75,78.125,96,11.0,6.8,0.0,3,23.8
9,68,16,3,2020-12-24 16,70,68,138,62.443439,221,11.0,6.8,0.0,3,23.8


In [105]:
df_manhattan_2020.dtypes

LocationID                     int64
hour                           int64
day_of_week                    int64
time                          object
pickup_count                   int64
dropoff_count                  int64
total_count                    int64
busyness                     float64
max_count                      int64
temperature_2m (°C)          float64
apparent_temperature (°C)    float64
precipitation (mm)           float64
weathercode (wmo code)         int64
windspeed_10m (km/h)         float64
dtype: object

In [106]:
df_manhattan_2020.rename(columns={'temperature_2m (°C)': 'temperature', 'apparent_temperature (°C)': 'apparent_temperature', 'precipitation (mm)': 'precipitation', 'weathercode (wmo code)': 'weathercode', 'windspeed_10m (km/h)': 'windspeed_10m'}, inplace=True)

In [107]:
df_manhattan_2020.to_csv('yellow_taxi_data_Q4_training_data_Processed.csv', index=False)


(108431, 14)

# Analysis Of Data

In [5]:
# Merge all the CSV

csv_files = ['yellow_taxi_data_Q1_training_data_Processed.csv', 'yellow_taxi_data_Q2_training_data_Processed.csv', 'yellow_taxi_data_Q3_training_data_Processed.csv', 'yellow_taxi_data_Q4_training_data_Processed.csv']

dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate the DataFrames
df_manhattan_2020 = pd.concat(dataframes, ignore_index=True)

In [6]:
# Merge all the CSV

csv_files = ['yellow_taxi_data_Q1_training_data_Processed.csv', 'yellow_taxi_data_Q2_training_data_Processed.csv', 'yellow_taxi_data_Q3_training_data_Processed.csv', 'yellow_taxi_data_Q4_training_data_Processed.csv']

dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate the DataFrames
df_train = pd.concat(dataframes, ignore_index=True)
df_train.sort_values(['busyness','time']).head(20)
df_train.shape

(416225, 14)

In [7]:
df_manhattan_2020.corr()

  df_manhattan_2020.corr()


Unnamed: 0,LocationID,hour,day_of_week,pickup_count,dropoff_count,total_count,busyness,max_count,temperature,apparent_temperature,precipitation,weathercode,windspeed_10m
LocationID,1.0,0.001244,-0.000455,0.125649,0.117466,0.124678,0.003267,0.171732,0.004085,0.004124,0.000189,6.3e-05,-0.000212
hour,0.001244,1.0,-0.020539,0.120539,0.133319,0.130936,0.31709,-0.071205,0.223904,0.181777,0.042557,0.043696,0.105176
day_of_week,-0.000455,-0.020539,1.0,-0.02398,-0.02629,-0.025921,-0.080157,-0.001184,-0.006957,-0.007846,0.001242,0.002584,0.019783
pickup_count,0.125649,0.120539,-0.02398,1.0,0.891901,0.968673,0.500068,0.658864,-0.267082,-0.266741,-0.005753,0.002851,0.03336
dropoff_count,0.117466,0.133319,-0.02629,0.891901,1.0,0.976267,0.520957,0.654381,-0.28718,-0.28703,-0.006212,0.003361,0.035919
total_count,0.124678,0.130936,-0.025921,0.968673,0.976267,1.0,0.525558,0.674874,-0.285607,-0.285361,-0.006166,0.003211,0.0357
busyness,0.003267,0.31709,-0.080157,0.500068,0.520957,0.525558,1.0,0.026728,-0.000439,-0.008098,0.000808,-0.012511,0.009358
max_count,0.171732,-0.071205,-0.001184,0.658864,0.654381,0.674874,0.026728,1.0,-0.399806,-0.393623,-0.020021,0.006364,0.021437
temperature,0.004085,0.223904,-0.006957,-0.267082,-0.28718,-0.285607,-0.000439,-0.399806,1.0,0.992057,0.04364,0.030268,-0.147341
apparent_temperature,0.004124,0.181777,-0.007846,-0.266741,-0.28703,-0.285361,-0.008098,-0.393623,0.992057,1.0,0.048948,0.045264,-0.236848


In [8]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [9]:
def predict_busyness(zone, input_features):
    # Step 3: Use the zone number to select the corresponding trained model
    model = model_sets.get(zone)
    if model is None:
        raise ValueError("Model for zone {} not found.".format(zone))

    # Step 4: Use the selected model to make predictions based on the input features4
    
    input_data = pd.DataFrame([input_features], columns=X_train.columns)  # Convert to DataFrame
    busyness_prediction = model.predict(input_data)  # Make prediction

    # Step 5: Return the busyness prediction as the output
    return busyness_prediction


## Model for Each Zone

### Linear Regression

In [10]:
model_sets = {}
zones = df_train['LocationID'].unique()
zones

array([  4,  13,  24,  41,  42,  43,  45,  48,  50,  68,  74,  75,  79,
        87,  88,  90, 100, 107, 113, 114, 116, 125, 137, 140, 141, 142,
       143, 144, 148, 151, 152, 158, 161, 162, 163, 164, 166, 170, 186,
       209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239,
       244, 246, 249, 261, 262, 263, 120, 127, 243,  12, 202, 194, 153,
       128], dtype=int64)

In [115]:
for zone in zones:
    
    # select one station
    zone_data = df_train[df_train['LocationID'] == zone]
    
    
  # linear regression feature
    X = zone_data[['hour', 'day_of_week','temperature','apparent_temperature','precipitation','weathercode','windspeed_10m']]
    
    y = zone_data['busyness']
    
    # split into train and test data
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    # apply LinearRegression
    model = LinearRegression()
    
    # fit model
    model.fit(X_train, y_train)
    # put into dictionary
    model_sets[zone] = model
    
    
    test_prediction = model.predict(X_test)
    print('\n==============================================================================')
    print("zone: ",zone," LinearRegression ")
    printMetrics(y_test, test_prediction)


zone:  4  LinearRegression 
MAE:  12.616806981581888
RMSE:  16.533687599139228
R2:  0.210181594539678

zone:  13  LinearRegression 
MAE:  14.101954261901499
RMSE:  17.624336664092446
R2:  0.1328372619899495

zone:  24  LinearRegression 
MAE:  14.953840705492558
RMSE:  18.23530564391177
R2:  0.10126691971325585

zone:  41  LinearRegression 
MAE:  12.050366285252387
RMSE:  15.111470515294329
R2:  0.18514128609574043

zone:  42  LinearRegression 
MAE:  10.0443790732712
RMSE:  13.045335360642945
R2:  0.21072767816878168

zone:  43  LinearRegression 
MAE:  18.806294194673935
RMSE:  22.37214739553507
R2:  0.07585448855659849

zone:  45  LinearRegression 
MAE:  13.634021488003063
RMSE:  17.20654905812489
R2:  0.035217773811516584

zone:  48  LinearRegression 
MAE:  14.559212618056781
RMSE:  18.164720990675928
R2:  0.2421534455829314

zone:  50  LinearRegression 
MAE:  14.707467785942198
RMSE:  18.30700974699538
R2:  0.1363398562061755

zone:  68  LinearRegression 
MAE:  14.955879505752304
RM

In [116]:
zone_number = 43  # Replace with the desired zone number
input_features = [17, 6, 27.4, 24.4, 0, 2, 19.8]  # Replace with the actual input features
prediction = predict_busyness(zone_number, input_features)
print("Busyness prediction for zone {}: {}".format(zone_number, prediction))

Busyness prediction for zone 43: [16.94088908]


## Random Forest

In [117]:
for zone in zones:
    
    # select one station
    zone_data = df_train[df_train['LocationID'] == zone]
    
    
  # linear regression feature
    X = zone_data[['hour', 'day_of_week','temperature','apparent_temperature','precipitation','weathercode','windspeed_10m']]
    
    y = zone_data['busyness']
    
    # split into train and test data
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    # apply Random Forest Regression
    model = RandomForestRegressor()
    
    # fit model
    model.fit(X_train, y_train)
    # put into dictionary
    model_sets[zone] = model
    
    
    test_prediction = model.predict(X_test)
    print('\n==============================================================================')
    print("zone: ",zone, "Random Forest Regression")
    printMetrics(y_test, test_prediction)


zone:  4 Random Forest Regression
MAE:  9.409891325938236
RMSE:  13.057525358428046
R2:  0.5073828231256702

zone:  13 Random Forest Regression
MAE:  9.998883969849976
RMSE:  13.74607792605468
R2:  0.472487626923763

zone:  24 Random Forest Regression
MAE:  9.954620309047774
RMSE:  13.424817598340942
R2:  0.5128963085598467

zone:  41 Random Forest Regression
MAE:  7.217689473609634
RMSE:  9.97192113575513
R2:  0.6451649368275716

zone:  42 Random Forest Regression
MAE:  7.128523940825953
RMSE:  9.610529902910988
R2:  0.571637893387887

zone:  43 Random Forest Regression
MAE:  10.241377587724504
RMSE:  14.529732524178062
R2:  0.610201419178663

zone:  45 Random Forest Regression
MAE:  10.77299797621794
RMSE:  14.419199096578282
R2:  0.3224770404825752

zone:  48 Random Forest Regression
MAE:  9.265859058505482
RMSE:  13.032011869029285
R2:  0.6099262269920108

zone:  50 Random Forest Regression
MAE:  9.551951454022978
RMSE:  13.136693137191946
R2:  0.5552866537598136

zone:  68 Random

In [118]:
zone_number = 43  # Replace with the desired zone number
input_features = [17, 6, 27.4, 24.4, 0, 2, 19.8]  # Replace with the actual input features
prediction = predict_busyness(zone_number, input_features)
print("Busyness prediction for zone {}: {}".format(zone_number, prediction))

Busyness prediction for zone 43: [47.47980797]


## XGBOOST

In [13]:
for zone in zones:
    
    # select one station
    zone_data = df_train[df_train['LocationID'] == zone]
    
    
  # linear regression feature
    X = zone_data[['hour','pickup_count','dropoff_count','total_count' ,'day_of_week','temperature','apparent_temperature','precipitation','weathercode','windspeed_10m']]
    
    y = zone_data['busyness']
    
    # split into train and test data
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    # apply Random Forest Regression
    model = XGBRegressor()
    
    # fit model
    model.fit(X_train, y_train)
    # put into dictionary
    model_sets[zone] = model
    
    
    test_prediction = model.predict(X_test)
    print('\n==============================================================================')
    print("zone: ",zone, "XGBoost Regression")
    printMetrics(y_test, test_prediction)


zone:  4 XGBoost Regression
MAE:  5.789974399827022
RMSE:  9.36904500944538
R2:  0.7463829504956332

zone:  13 XGBoost Regression
MAE:  3.938521214938471
RMSE:  6.9413452463467955
R2:  0.8654875945912507

zone:  24 XGBoost Regression
MAE:  4.658660279592823
RMSE:  7.206815175994899
R2:  0.859624472182626

zone:  41 XGBoost Regression
MAE:  3.7234368223603855
RMSE:  5.586577167400627
R2:  0.8886319893394007

zone:  42 XGBoost Regression
MAE:  5.2709831563380725
RMSE:  7.484300609235391
R2:  0.7402121579129957

zone:  43 XGBoost Regression
MAE:  3.63773838267778
RMSE:  6.158015856711097
R2:  0.9299824967339257

zone:  45 XGBoost Regression
MAE:  4.770117187707566
RMSE:  8.301466878585625
R2:  0.7754301655200686

zone:  48 XGBoost Regression
MAE:  3.792431814386362
RMSE:  6.574562033572071
R2:  0.9007209820985762

zone:  50 XGBoost Regression
MAE:  3.4940338227022463
RMSE:  6.57616423440564
R2:  0.8885568563665409

zone:  68 XGBoost Regression
MAE:  3.936024921342298
RMSE:  6.60132881542

In [121]:
zone_number = 43  # Replace with the desired zone number
input_features = [17, 6, 27.4, 24.4, 0, 2, 19.8]  # Replace with the actual input features
prediction = predict_busyness(zone_number, input_features)
print("Busyness prediction for zone {}: {}".format(zone_number, prediction))

Busyness prediction for zone 43: [52.029583]


## Done