In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset (you'll need to replace this with your actual data loading method)
def load_taxi_data():
    df1 = pd.read_parquet('../data/raw/yellow_tripdata_2024-01.parquet')
    df2 = pd.read_parquet('../data/raw/yellow_tripdata_2024-02.parquet')
    df3 = pd.read_parquet('../data/raw/yellow_tripdata_2024-03.parquet')
    df4 = pd.read_parquet('../data/raw/yellow_tripdata_2024-04.parquet')
    df5 = pd.read_parquet('../data/raw/yellow_tripdata_2024-05.parquet')
    df6 = pd.read_parquet('../data/raw/yellow_tripdata_2024-06.parquet')
    df = pd.concat([df1, df2, df3, df4, df5, df6])
    return df

In [27]:
def downcast_dtypes(df):
    # Downcast numerical columns to save memory
    float_cols = df.select_dtypes(include=['float64']).columns
    int_cols = df.select_dtypes(include=['int64']).columns

    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)

    return df

def preprocess_data(df):
    # Handle missing values
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    imputer = SimpleImputer(strategy='median')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    
    # Downcast dtypes
    df = downcast_dtypes(df)
    
    # Feature Engineering
    df['pickup_hour'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.hour
    df['pickup_day'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.day_name()
    df['pickup_month'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.month
    
    # Encode categorical variables
    categorical_columns = ['pickup_day', 'RatecodeID', 'payment_type']
    df = pd.get_dummies(df, columns=categorical_columns)
    
    return df


In [15]:
# Feature Selection and Preparation
def prepare_features(df):
    # Select relevant features for regression
    features = [
        'trip_distance', 
        'passenger_count', 
        'pickup_hour', 
        'pickup_month', 
        'fare_amount', 
        'extra', 
        'mta_tax', 
        'improvement_surcharge'
    ]
    
    # Add dummy variables from preprocessing
    features.extend([col for col in df.columns if col.startswith(('pickup_day_', 'RatecodeID_', 'payment_type_'))])
    
    # Prepare X and y
    X = df[features]
    y = df['total_amount']
    
    return X, y


In [25]:
# Cross-validation Function (similar to your approach)
def cross_validate_models(X, y, models, dataset_name):
    results = []
    
    for name, model in models.items():
        try:
            # Use train_test_split instead of full cross-validation for large datasets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            
            # Standardize features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
            # Fit the model
            model.fit(X_train_scaled, y_train)
            
            # Predict and calculate RMSE
            y_pred = model.predict(X_test_scaled)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            
            # Store results
            results.append({
                "Dataset": dataset_name,
                "Model": name,
                "RMSE": rmse
            })
        except Exception as e:
            print(f"Error with {name}: {str(e)}")
    
    return results

In [5]:
# Feature Importance (for Random Forest)
def get_feature_importance(X, y):
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X, y)
    
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return feature_importance

In [6]:
# Visualization of Results
def visualize_results(results_df, X, feature_importance):
    # Plot model performance
    plt.figure(figsize=(10, 6))
    plt.bar(results_df['Model'], results_df['RMSE'])
    plt.title('Model Performance Comparison (RMSE)')
    plt.xlabel('Models')
    plt.ylabel('Root Mean Squared Error')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    feature_importance.head(10).plot(kind='bar', x='feature', y='importance')
    plt.title('Top 10 Feature Importances')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()


In [10]:
df = load_taxi_data()
df.head()
# Preprocess the data

    

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [28]:
processed_df = preprocess_data(df)
# print(processed_df)

MemoryError: Unable to allocate 1.36 GiB for an array with shape (9, 20332093) and data type float64

In [16]:
# Prepare features
X, y = prepare_features(processed_df)

In [19]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42)
    # 'Lasso': Lasso(random_state=42),
    # 'ElasticNet': ElasticNet(random_state=42)
}

In [26]:
all_results = []
# 1. Initial Cross-Validation
print("Initial Cross-Validation Results:")
initial_results = cross_validate_models(X, y, models, "Initial Dataset")
all_results.extend(initial_results)

Initial Cross-Validation Results:
Error with Linear Regression: Unable to allocate 745. MiB for an array with shape (6, 16265674) and data type float64
Error with Ridge: Unable to allocate 745. MiB for an array with shape (6, 16265674) and data type float64
