In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)


In [2]:

url = 'https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/avocado.csv'
data = pd.read_csv(url)


In [3]:
print(data.head())

   Unnamed: 0        Date  AveragePrice  Total Volume     4046       4225  \
0         0.0  27-12-2015          1.33      64236.62  1036.74   54454.85   
1         1.0  20-12-2015          1.35      54876.98   674.28   44638.81   
2         2.0  13-12-2015          0.93     118220.22   794.70  109149.67   
3         3.0  06-12-2015          1.08      78992.15  1132.00   71976.41   
4         4.0  29-11-2015          1.28      51039.60   941.48   43838.39   

     4770  Total Bags  Small Bags  Large Bags  XLarge Bags          type  \
0   48.16     8696.87     8603.62       93.25          0.0  conventional   
1   58.33     9505.56     9408.07       97.49          0.0  conventional   
2  130.50     8145.35     8042.21      103.14          0.0  conventional   
3   72.58     5811.16     5677.40      133.76          0.0  conventional   
4   75.78     6183.95     5986.26      197.69          0.0  conventional   

     year  region  
0  2015.0  Albany  
1  2015.0  Albany  
2  2015.0  Albany  


In [4]:

print(data.isnull().sum())


Unnamed: 0      14951
Date            14951
AveragePrice    14951
Total Volume    14951
4046            14951
4225            14951
4770            14951
Total Bags      14951
Small Bags      14951
Large Bags      14951
XLarge Bags     14951
type            14951
year            14951
region          14951
dtype: int64


In [5]:

numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

In [6]:

categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])


In [7]:

print(data.isnull().sum())

Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64


In [8]:

data = pd.get_dummies(data, drop_first=True)

In [9]:

print(data.columns)

Index(['Unnamed: 0', 'AveragePrice', 'Total Volume', '4046', '4225', '4770',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
       ...
       'region_SouthCarolina', 'region_SouthCentral', 'region_Southeast',
       'region_Spokane', 'region_StLouis', 'region_Syracuse', 'region_Tampa',
       'region_TotalUS', 'region_West', 'region_WestTexNewMexico'],
      dtype='object', length=164)


In [10]:

X_price = data.drop(columns=['AveragePrice', 'Date'], errors='ignore')
y_price = data['AveragePrice']


In [11]:

if 'type_organic' in data.columns:
    y_type = data['type_organic']
else:
    y_type = np.zeros(len(data))

X_type = data.drop(columns=['type', 'Date'], errors='ignore')

In [12]:

X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(X_price, y_price, test_size=0.2, random_state=42)
X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(X_type, y_type, test_size=0.2, random_state=42)

print(f"Training set size for price prediction: {X_train_price.shape}")
print(f"Training set size for type prediction: {X_train_type.shape}")


Training set size for price prediction: (13174, 163)
Training set size for type prediction: (13174, 164)


In [13]:
# RandomForest Regressor for price prediction
rf_price = RandomForestRegressor()
rf_price.fit(X_train_price, y_train_price)
y_pred_rf_price = rf_price.predict(X_test_price)
mse_rf_price = mean_squared_error(y_test_price, y_pred_rf_price)
rmse_rf_price = mse_rf_price ** 0.5
print(f"RandomForest RMSE for price prediction: {rmse_rf_price}")

#  RandomForest Classifier for type prediction
rf_type = RandomForestClassifier()
rf_type.fit(X_train_type, y_train_type)
y_pred_rf_type = rf_type.predict(X_test_type)
accuracy_rf_type = accuracy_score(y_test_type, y_pred_rf_type)
print(f"RandomForest Accuracy for type prediction: {accuracy_rf_type}")


RandomForest RMSE for price prediction: 0.02396061013177811
RandomForest Accuracy for type prediction: 1.0


In [14]:
# Linear Regression model for price prediction
lr_price = LinearRegression()
lr_price.fit(X_train_price, y_train_price)
y_pred_lr_price = lr_price.predict(X_test_price)
mse_lr_price = mean_squared_error(y_test_price, y_pred_lr_price)
rmse_lr_price = mse_lr_price ** 0.5
print(f"Linear Regression RMSE for price prediction: {rmse_lr_price}")


if len(np.unique(y_train_type)) > 1:
    # Logistic Regression model for type prediction
    lr_type = LogisticRegression()
    lr_type.fit(X_train_type, y_train_type)
    y_pred_lr_type = lr_type.predict(X_test_type)
    accuracy_lr_type = accuracy_score(y_test_type, y_pred_lr_type)
    print(f"Logistic Regression Accuracy for type prediction: {accuracy_lr_type}")
else:
    print('Classification task is not possible, as there is only one class in the data.')


Linear Regression RMSE for price prediction: 0.026724790063832726
Classification task is not possible, as there is only one class in the data.


In [15]:
# Gradient Boosting Regressor for price prediction
gb_price = GradientBoostingRegressor()
gb_price.fit(X_train_price, y_train_price)
y_pred_gb_price = gb_price.predict(X_test_price)
mse_gb_price = mean_squared_error(y_test_price, y_pred_gb_price)
rmse_gb_price = mse_gb_price ** 0.5
print(f"Gradient Boosting RMSE for price prediction: {rmse_gb_price}")

if len(np.unique(y_train_type)) > 1:
    # Gradient Boosting Classifier for type prediction
    gb_type = GradientBoostingClassifier()
    gb_type.fit(X_train_type, y_train_type)
    y_pred_gb_type = gb_type.predict(X_test_type)
    accuracy_gb_type = accuracy_score(y_test_type, y_pred_gb_type)
    print(f"Gradient Boosting Accuracy for type prediction: {accuracy_gb_type}")
else:
    print('Classification task is not possible, as there is only one class in the data.')


Gradient Boosting RMSE for price prediction: 0.030586366246195828
Classification task is not possible, as there is only one class in the data.


In [16]:
# Perform hyperparameter tuning for Gradient Boosting Regressor using GridSearchCV
param_grid_gb_price = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

grid_search_gb_price = GridSearchCV(estimator=gb_price, param_grid=param_grid_gb_price, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb_price.fit(X_train_price, y_train_price)

# Best model from grid search for price prediction
best_gb_price = grid_search_gb_price.best_estimator_
best_y_pred_gb_price = best_gb_price.predict(X_test_price)
best_mse_gb_price = mean_squared_error(y_test_price, best_y_pred_gb_price)
best_rmse_gb_price = best_mse_gb_price ** 0.5
print(f"Best Gradient Boosting RMSE for price prediction after tuning: {best_rmse_gb_price}")
print(f"Best parameters for Gradient Boosting Regressor: {grid_search_gb_price.best_params_}")


Best Gradient Boosting RMSE for price prediction after tuning: 0.02260598119037988
Best parameters for Gradient Boosting Regressor: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
