# **This notebook is for doing imputations on the DataFrames null values.** 

#### We are going to USE ML to fill the dataframes missing NAN values. We are beggining with The values that have the least NAN values and build up the prediction strength from ground up.

In [146]:
import pandas as pd
import numpy as np

### Read in the repaird taxi trip csv:

In [147]:
df = pd.read_csv("../../data/taxi_trip_pricing_numeric_repaired.csv")

### Perform one hot encoding on categorical columns

In [148]:
df = pd.get_dummies(df, columns=["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"], drop_first=True)

In [149]:
df.isna().sum()

Trip_Distance_km              6
Passenger_Count              50
Base_Fare                    12
Per_Km_Rate                  11
Per_Minute_Rate               5
Trip_Duration_Minutes        15
Trip_Price                   17
Time_of_Day_Evening           0
Time_of_Day_Morning           0
Time_of_Day_Night             0
Day_of_Week_Weekend           0
Traffic_Conditions_Low        0
Traffic_Conditions_Medium     0
Weather_Rain                  0
Weather_Snow                  0
dtype: int64

In [150]:
# Store the column names that contain NaN values
columns_with_nan = (
    df.isna().sum()
    .sort_values(ascending=True)   
    .loc[lambda x: x > 0]          
    .index
    .tolist()
)
columns_with_nan


['Per_Minute_Rate',
 'Trip_Distance_km',
 'Per_Km_Rate',
 'Base_Fare',
 'Trip_Duration_Minutes',
 'Trip_Price',
 'Passenger_Count']

#### MANAGE all columns with a function. The function will test the best model to predict its nan values.

In [151]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor


# We will move everythin to a function instead:

def remove_nan_values_and_evaluate_model(column_name: str, show_output: bool = True):
    """
    This function takes a column name as input and removes the rows with NaN values in that column from the dataframe.
    It returns two dataframes: one with the rows with NaN values and one without.
    """
    nan_rows = df[df[column_name].isna()] # We will use these rows later to predict the missing values
    df_ml = df[df[column_name].notna()]

    # Remove nan values from df_ml
    df_ml = df_ml.dropna()

    # Train test split
    X,y = df_ml.drop(columns=[column_name]), df_ml[column_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = MinMaxScaler()

    # Train and evaluate X,y on Linnear Regression 
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    ln_model = LinearRegression()
    ln_model.fit(X_train_scaled, y_train)
    y_pred = ln_model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    if show_output:
        print(f"Linear Regression - Mean Squared Error: {mse:.2f}")
        print(f"Linear Regression - Root Mean Squared Error: {rmse:.2f}")
        print(f"Linear Regression - R^2 Score: {r2:.2f}")
        print(f"Linear Regression - Mean Absolute Error: {mae:.2f}")
        print("\n")
        print("--------------------------------------------------")
    # Train and evaluate X,y on Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    mse_rf = mean_squared_error(y_test, y_pred_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, y_pred_rf)
    mae_rf = mean_absolute_error(y_test, y_pred_rf)
    if show_output:
        print(f"Random Forest - Mean Squared Error: {mse_rf:.2f}")
        print(f"Random Forest - Root Mean Squared Error: {rmse_rf:.2f}")
        print(f"Random Forest - R^2 Score: {r2_rf:.2f}")
        print(f"Random Forest - Mean Absolute Error: {mae_rf:.2f}")
        print("\n")
        print("--------------------------------------------------")
    # Choose the best model based on R^2 score
    model_winner = None
    if r2_rf > r2:
        model_winner = rf_model
        print("Random Forest Regressor is the best model.")
    else:
        model_winner = ln_model
        print("Linear Regression is the best model.")

    return nan_rows, df_ml, model_winner


### Loop through all cols in ascending order from least nan values to greatest for best imputations.

In [152]:
# Now we can loop through the columns with NaN values and evaluate the models
for column in columns_with_nan:
    print(f"Evaluating models after removing NaN values in column: {column}")
    nan_rows, df_ml, best_model = remove_nan_values_and_evaluate_model(column, show_output=False) # IF you want to see the output from the model evaluation, set show_output=True

    X,y = nan_rows.drop(columns=[column]), nan_rows[column]
    predicted_values = best_model.predict(X) 
    print(f"Predicted values for missing {column}: {predicted_values}")
    # Now we can fill the NaN values in the original dataframe with the predicted values
    nan_rows[column] = predicted_values
    df.loc[nan_rows.index, column] = nan_rows[column]

# Check if there are still any NaN values in the dataframe
df.isna().sum()

Evaluating models after removing NaN values in column: Per_Minute_Rate
Random Forest Regressor is the best model.
Predicted values for missing Per_Minute_Rate: [0.64379629 0.3818     0.3346     0.2351     0.302     ]
Evaluating models after removing NaN values in column: Trip_Distance_km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest Regressor is the best model.
Predicted values for missing Trip_Distance_km: [42.4518  6.1342 21.4368  7.6969 42.7079 29.7991]
Evaluating models after removing NaN values in column: Per_Km_Rate


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest Regressor is the best model.
Predicted values for missing Per_Km_Rate: [0.6479     1.6729     0.9724     1.32234617 0.9303     1.2786
 1.3028     0.7434     1.6496     0.8845     1.7692    ]
Evaluating models after removing NaN values in column: Base_Fare


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest Regressor is the best model.
Predicted values for missing Base_Fare: [3.5096 3.9683 3.5614 3.5159 3.1606 3.3182 3.5021 3.3197 4.0192 3.6092
 3.5524 3.4896]
Evaluating models after removing NaN values in column: Trip_Duration_Minutes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest Regressor is the best model.
Predicted values for missing Trip_Duration_Minutes: [ 21.2082  73.6751  90.6529  99.485   62.6265  63.7791  49.3154  63.985
  86.0036  61.5466  40.0069  63.0559 100.1809  55.6358  96.1719]
Evaluating models after removing NaN values in column: Trip_Price


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest Regressor is the best model.
Predicted values for missing Trip_Price: [47.070264 68.531781 27.862327 51.99042  37.294758 71.288691 48.343912
 40.157827 51.667804 50.690136 55.391044 66.808003 21.647542 44.7242
 32.883705 35.525655 49.548262]
Evaluating models after removing NaN values in column: Passenger_Count


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Linear Regression is the best model.
Predicted values for missing Passenger_Count: [11.78394126 -3.31729972 12.00816035  1.75088157 10.65582058 18.69084763
  3.53923319 18.05833162 19.02613313 -5.74312702 26.02125989 -7.98630336
 31.64925081 -4.71878541 13.95578531 10.38283726 17.37904706  8.45890165
 10.33347193 19.16154542  8.42359912 19.00521702 16.43784593 12.94822544
 14.2458445   6.99871322 17.77928753  9.04466503 33.59823463  9.84663492
 20.21899523  7.95480414 13.71289799 13.79019527 19.36047216 17.48072031
 25.92093333 11.53462095  7.04954361 20.77214381 -8.00404098 26.47973409
  3.1023654  13.36835367  7.12911652 -7.6635459   9.63059509 25.60953738
 -7.70646305 30.71956094]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Trip_Distance_km             0
Passenger_Count              0
Base_Fare                    0
Per_Km_Rate                  0
Per_Minute_Rate              0
Trip_Duration_Minutes        0
Trip_Price                   0
Time_of_Day_Evening          0
Time_of_Day_Morning          0
Time_of_Day_Night            0
Day_of_Week_Weekend          0
Traffic_Conditions_Low       0
Traffic_Conditions_Medium    0
Weather_Rain                 0
Weather_Snow                 0
dtype: int64