# **This notebook is for doing imputations on the DataFrames null values.** 

#### We are going to USE ML to fill the dataframes missing NAN values. We are beggining with The values that have the least NAN values and build up the prediction strength from ground up.

In [124]:
import pandas as pd
import numpy as np

### Read in the repaird taxi trip csv:

In [125]:
df = pd.read_csv("../../data/taxi_trip_pricing_numeric_repaired.csv")

### Perform one hot encoding on categorical columns

In [126]:
df = pd.get_dummies(df, columns=["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"], drop_first=True)

In [127]:
df.isna().sum()

Trip_Distance_km              6
Passenger_Count              50
Base_Fare                    12
Per_Km_Rate                  11
Per_Minute_Rate               5
Trip_Duration_Minutes        15
Trip_Price                   17
Time_of_Day_Evening           0
Time_of_Day_Morning           0
Time_of_Day_Night             0
Day_of_Week_Weekend           0
Traffic_Conditions_Low        0
Traffic_Conditions_Medium     0
Weather_Rain                  0
Weather_Snow                  0
dtype: int64

In [128]:
# Store the column names that contain NaN values
columns_with_nan = (
    df.isna().sum()
    .sort_values(ascending=True)   
    .loc[lambda x: x > 0]          
    .index
    .tolist()
)

columns_with_nan.pop(-1) # REMOVE PASSENGER COUNT THIS COL NEEDS CLASSIFICATION
display(columns_with_nan)


['Per_Minute_Rate',
 'Trip_Distance_km',
 'Per_Km_Rate',
 'Base_Fare',
 'Trip_Duration_Minutes',
 'Trip_Price']

#### MANAGE all columns with a function. The function will test the best model to predict its nan values.

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor


# We will move everythin to a function instead:

def remove_nan_values_and_evaluate_model(column_name: str, show_output: bool = True):
    """
    This function takes a column name as input and removes the rows with NaN values in that column from the dataframe.
    It returns two dataframes: one with the rows with NaN values and one without.
    """
    nan_rows = df[df[column_name].isna()] # We will use these rows later to predict the missing values
    df_ml = df[df[column_name].notna()]

    # Remove nan values from df_ml
    df_ml = df_ml.dropna()

    # Train test split
    X,y = df_ml.drop(columns=[column_name]), df_ml[column_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = MinMaxScaler()

    # Train and evaluate X,y on Linnear Regression 
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    ln_model = LinearRegression()
    ln_model.fit(X_train_scaled, y_train)
    y_pred = ln_model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    if show_output:
        print(f"Linear Regression - Mean Squared Error: {mse:.2f}")
        print(f"Linear Regression - Root Mean Squared Error: {rmse:.2f}")
        print(f"Linear Regression - R^2 Score: {r2:.2f}")
        print(f"Linear Regression - Mean Absolute Error: {mae:.2f}")
        print("--------------------------------------------------")
    # Train and evaluate X,y on Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    mse_rf = mean_squared_error(y_test, y_pred_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, y_pred_rf)
    mae_rf = mean_absolute_error(y_test, y_pred_rf)
    if show_output:
        print(f"Random Forest - Mean Squared Error: {mse_rf:.2f}")
        print(f"Random Forest - Root Mean Squared Error: {rmse_rf:.2f}")
        print(f"Random Forest - R^2 Score: {r2_rf:.2f}")
        print(f"Random Forest - Mean Absolute Error: {mae_rf:.2f}")
        print("--------------------------------------------------")
    # Choose the best model based on R^2 score
    model_winner = None
    if r2_rf > r2:
        model_winner = rf_model
        print("Random Forest Regressor is the best model.")
    else:
        model_winner = ln_model
        print("Linear Regression is the best model.")

    return nan_rows, df_ml, model_winner


### Loop through all cols in ascending order from least nan values to greatest for best imputations.

In [130]:
# Now we can loop through the columns with NaN values and evaluate the models
for column in columns_with_nan:
    print(f"Evaluating models after removing NaN values in column: {column}")
    nan_rows, df_ml, best_model = remove_nan_values_and_evaluate_model(column, show_output=True) # IF you want to see the output from the model evaluation, set show_output=True

    X,y = nan_rows.drop(columns=[column]), nan_rows[column]
    predicted_values = best_model.predict(X) 
    print(f"Predicted values for missing {column}: {predicted_values}")
    # Now we can fill the NaN values in the original dataframe with the predicted values
    nan_rows[column] = predicted_values
    df.loc[nan_rows.index, column] = nan_rows[column]
    print(f"\n\n\n\n")

# Check if there are still any NaN values in the dataframe
df.isna().sum()

Evaluating models after removing NaN values in column: Per_Minute_Rate
Linear Regression - Mean Squared Error: 0.01
Linear Regression - Root Mean Squared Error: 0.10
Linear Regression - R^2 Score: 0.23
Linear Regression - Mean Absolute Error: 0.08
--------------------------------------------------
Random Forest - Mean Squared Error: 0.01
Random Forest - Root Mean Squared Error: 0.09
Random Forest - R^2 Score: 0.38
Random Forest - Mean Absolute Error: 0.07
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Per_Minute_Rate: [0.64379629 0.3818     0.3346     0.2351     0.302     ]





Evaluating models after removing NaN values in column: Trip_Distance_km
Linear Regression - Mean Squared Error: 51.06
Linear Regression - Root Mean Squared Error: 7.15
Linear Regression - R^2 Score: 0.87
Linear Regression - Mean Absolute Error: 4.65
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest - Mean Squared Error: 33.47
Random Forest - Root Mean Squared Error: 5.79
Random Forest - R^2 Score: 0.92
Random Forest - Mean Absolute Error: 3.95
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Trip_Distance_km: [42.4518  6.1342 21.4368  7.6969 42.7079 29.7991]





Evaluating models after removing NaN values in column: Per_Km_Rate
Linear Regression - Mean Squared Error: 0.12
Linear Regression - Root Mean Squared Error: 0.35
Linear Regression - R^2 Score: 0.33
Linear Regression - Mean Absolute Error: 0.26
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest - Mean Squared Error: 0.08
Random Forest - Root Mean Squared Error: 0.28
Random Forest - R^2 Score: 0.58
Random Forest - Mean Absolute Error: 0.20
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Per_Km_Rate: [0.6479     1.6729     0.9724     1.32234617 0.9303     1.2786
 1.3028     0.7434     1.6496     0.8845     1.7692    ]





Evaluating models after removing NaN values in column: Base_Fare
Linear Regression - Mean Squared Error: 0.79
Linear Regression - Root Mean Squared Error: 0.89
Linear Regression - R^2 Score: -0.05
Linear Regression - Mean Absolute Error: 0.77
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest - Mean Squared Error: 0.78
Random Forest - Root Mean Squared Error: 0.88
Random Forest - R^2 Score: -0.03
Random Forest - Mean Absolute Error: 0.75
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Base_Fare: [3.5096 3.9683 3.5614 3.5159 3.1606 3.3182 3.5021 3.3197 4.0192 3.6092
 3.5524 3.4896]





Evaluating models after removing NaN values in column: Trip_Duration_Minutes
Linear Regression - Mean Squared Error: 587.81
Linear Regression - Root Mean Squared Error: 24.24
Linear Regression - R^2 Score: 0.40
Linear Regression - Mean Absolute Error: 19.89
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest - Mean Squared Error: 309.74
Random Forest - Root Mean Squared Error: 17.60
Random Forest - R^2 Score: 0.68
Random Forest - Mean Absolute Error: 13.59
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Trip_Duration_Minutes: [ 21.2082  73.6751  90.6529  99.485   62.6265  63.7791  49.3154  63.985
  86.0036  61.5466  40.0069  63.0559 100.1809  55.6358  96.1719]





Evaluating models after removing NaN values in column: Trip_Price
Linear Regression - Mean Squared Error: 187.00
Linear Regression - Root Mean Squared Error: 13.67
Linear Regression - R^2 Score: 0.81
Linear Regression - Mean Absolute Error: 9.00
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Random Forest - Mean Squared Error: 37.03
Random Forest - Root Mean Squared Error: 6.09
Random Forest - R^2 Score: 0.96
Random Forest - Mean Absolute Error: 3.84
--------------------------------------------------
Random Forest Regressor is the best model.
Predicted values for missing Trip_Price: [47.070264 68.531781 27.862327 51.99042  37.294758 71.288691 48.343912
 40.157827 51.667804 50.690136 55.391044 66.808003 21.647542 44.7242
 32.883705 35.525655 49.548262]







A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_rows[column] = predicted_values


Trip_Distance_km              0
Passenger_Count              50
Base_Fare                     0
Per_Km_Rate                   0
Per_Minute_Rate               0
Trip_Duration_Minutes         0
Trip_Price                    0
Time_of_Day_Evening           0
Time_of_Day_Morning           0
Time_of_Day_Night             0
Day_of_Week_Weekend           0
Traffic_Conditions_Low        0
Traffic_Conditions_Medium     0
Weather_Rain                  0
Weather_Snow                  0
dtype: int64

#### BASE_FARE is Catastrophic BAD.

In [131]:
pass_count_nan = df[df["Passenger_Count"].isna()] # Save this to end.
pass_count_notna = df[df["Passenger_Count"].notna()]


In [132]:
X, y = pass_count_notna.drop(columns="Passenger_Count"), pass_count_notna["Passenger_Count"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [133]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


rfc_modell = RandomForestClassifier(random_state=42)

rfc_modell.fit(X=X_train, y=y_train)

rfc_y_pred = rfc_modell.predict(X=X_test)

rfc_mae = mean_absolute_error(y_true=y_test, y_pred=rfc_y_pred)
rfc_mse = mean_squared_error(y_true=y_test, y_pred=rfc_y_pred)
rfc_r2 = r2_score(y_true=y_test, y_pred=rfc_y_pred)

accuracy = accuracy_score(y_true=y_test, y_pred=rfc_y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=rfc_y_pred))


Accuracy: 0.21

Classification Report:
              precision    recall  f1-score   support

         1.0       0.27      0.33      0.30        46
         2.0       0.09      0.11      0.09        38
         3.0       0.20      0.21      0.21        53
         4.0       0.29      0.19      0.23        53

    accuracy                           0.21       190
   macro avg       0.21      0.21      0.21       190
weighted avg       0.22      0.21      0.21       190



In [134]:
df["Passenger_Count"].value_counts()

Passenger_Count
3.0    251
2.0    241
1.0    238
4.0    220
Name: count, dtype: int64

In [135]:
rnd_values = []

for i in range(50):
    if i < 13:
        rnd_values.append(1.0)
    elif i < 25:
        rnd_values.append(2.0)
    elif i < 38:
        rnd_values.append(3.0)
    else:
        rnd_values.append(4.0)

passenger_count_nan = df[df["Passenger_Count"].isna()]

passenger_count_nan

df.loc[passenger_count_nan.index, "Passenger_Count"] = rnd_values


In [136]:
# Check the rows with index that had NaN values in Passenger_Count
df.loc[passenger_count_nan.index]

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
19,15.27,1.0,3.93,0.73,0.12,102.31,27.3543,False,True,False,False,True,False,False,False
33,47.5,1.0,4.39,0.51,0.3,95.55,57.28,False,True,False,True,True,False,False,False
43,45.56,1.0,4.48,0.9,0.5,80.8,85.884,False,False,False,False,True,False,False,False
75,36.72,1.0,2.39,1.56,0.17,13.03,61.8883,False,False,False,True,True,False,False,False
77,4.63,1.0,4.95,0.91,0.39,24.87,18.8626,False,False,False,False,False,True,False,False
100,2.54,1.0,2.8,1.3,0.37,53.11,25.7527,False,True,False,True,True,False,False,False
173,12.1,1.0,3.5159,0.59,0.31,21.2082,19.2643,False,True,False,True,True,False,False,False
213,11.99,1.0,4.84,1.74,0.28,54.06,40.8394,True,False,False,True,False,True,True,False
244,1.76,1.0,2.49,1.22,0.29,64.4,23.3132,True,False,False,True,True,False,False,False
249,42.8,1.0,4.06,1.11,0.36,16.62,57.5512,False,False,False,False,False,False,True,False


In [137]:
df["Passenger_Count"].value_counts()

Passenger_Count
3.0    264
2.0    253
1.0    251
4.0    232
Name: count, dtype: int64

In [138]:
df.to_csv("../../data/taxi_trip_pricing_imputed.csv", index=False)