In [61]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv('Avacado_Price.csv')

In [3]:
df.head()

Unnamed: 0,AveragePrice,Total_Volume,tot_ava1,tot_ava2,tot_ava3,Total_Bags,Small_Bags,Large_Bags,XLarge Bags,type,year,region
0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


Checking the Null values:

In [37]:
df.isna().sum()

AveragePrice    0
Total_Volume    0
tot_ava1        0
tot_ava2        0
tot_ava3        0
Total_Bags      0
Small_Bags      0
Large_Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64

There are no null values in the dataset

In [4]:
df.shape

(18249, 12)

There are total of   
Rows : 18249  
columns : 12

In [45]:
x = df.drop('AveragePrice', axis = 'columns')
y = df['AveragePrice']

Seperating the Numeric and categorical Columns :

In [46]:
x.head()

Unnamed: 0,Total_Volume,tot_ava1,tot_ava2,tot_ava3,Total_Bags,Small_Bags,Large_Bags,XLarge Bags,type,year,region
0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [6]:
cat_col = x.select_dtypes(include = 'object')
num_col = x.select_dtypes(exclude = 'object')

In [7]:
cat_col

Unnamed: 0,type,region
0,conventional,Albany
1,conventional,Albany
2,conventional,Albany
3,conventional,Albany
4,conventional,Albany
...,...,...
18244,organic,WestTexNewMexico
18245,organic,WestTexNewMexico
18246,organic,WestTexNewMexico
18247,organic,WestTexNewMexico


Checking the number of Unique values in categorical columns

In [33]:
cat_col.nunique()

type       2
region    54
dtype: int64

In [36]:
cat_col['type'].unique()

array(['conventional', 'organic'], dtype=object)

In [28]:
enc = OneHotEncoder(drop = 'first')

In [29]:
data1 = enc.fit_transform(cat_col)

In [30]:
data1.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [31]:
data1 = pd.DataFrame(data1.toarray(), columns = enc.get_feature_names_out(cat_col.columns))

In [32]:
data1

Unnamed: 0,type_organic,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,region_Chicago,region_CincinnatiDayton,...,region_SouthCarolina,region_SouthCentral,region_Southeast,region_Spokane,region_StLouis,region_Syracuse,region_Tampa,region_TotalUS,region_West,region_WestTexNewMexico
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18245,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18246,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18247,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
num_col.head()

Unnamed: 0,Total_Volume,tot_ava1,tot_ava2,tot_ava3,Total_Bags,Small_Bags,Large_Bags,XLarge Bags,year
0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,2015
1,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,2015
2,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,2015
3,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,2015
4,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,2015


In [40]:
scaller = MinMaxScaler()

In [41]:
data2 = scaller.fit_transform(num_col)

In [42]:
data2

array([[1.02634162e-03, 4.55837802e-05, 2.66015275e-03, ...,
        1.63050227e-05, 0.00000000e+00, 0.00000000e+00],
       [8.76600710e-04, 2.96470005e-05, 2.18063319e-03, ...,
        1.70463985e-05, 0.00000000e+00, 0.00000000e+00],
       [1.89000237e-03, 3.49416730e-05, 5.33202818e-03, ...,
        1.80343168e-05, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.18895720e-04, 5.24067937e-05, 1.19820293e-04, ...,
        7.39802156e-06, 0.00000000e+00, 1.00000000e+00],
       [2.57907608e-04, 6.71674191e-05, 1.45625628e-04, ...,
        8.74263951e-06, 0.00000000e+00, 1.00000000e+00],
       [2.78455540e-04, 1.27278353e-04, 1.15098392e-04, ...,
        4.54792107e-06, 0.00000000e+00, 1.00000000e+00]])

In [43]:
data2 = pd.DataFrame(data2, columns = scaller.get_feature_names_out(num_col.columns))

In [44]:
data2.head()

Unnamed: 0,Total_Volume,tot_ava1,tot_ava2,tot_ava3,Total_Bags,Small_Bags,Large_Bags,XLarge Bags,year
0,0.001026,4.6e-05,0.00266,1.9e-05,0.000449,0.000643,1.6e-05,0.0,0.0
1,0.000877,3e-05,0.002181,2.3e-05,0.000491,0.000703,1.7e-05,0.0,0.0
2,0.00189,3.5e-05,0.005332,5.1e-05,0.00042,0.000601,1.8e-05,0.0,0.0
3,0.001262,5e-05,0.003516,2.9e-05,0.0003,0.000424,2.3e-05,0.0,0.0
4,0.000815,4.1e-05,0.002142,3e-05,0.000319,0.000447,3.5e-05,0.0,0.0


In [53]:
X = pd.concat([data1, data2], axis = 'columns')

In [54]:
X.head()

Unnamed: 0,type_organic,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,region_Chicago,region_CincinnatiDayton,...,region_WestTexNewMexico,Total_Volume,tot_ava1,tot_ava2,tot_ava3,Total_Bags,Small_Bags,Large_Bags,XLarge Bags,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001026,4.6e-05,0.00266,1.9e-05,0.000449,0.000643,1.6e-05,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000877,3e-05,0.002181,2.3e-05,0.000491,0.000703,1.7e-05,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00189,3.5e-05,0.005332,5.1e-05,0.00042,0.000601,1.8e-05,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001262,5e-05,0.003516,2.9e-05,0.0003,0.000424,2.3e-05,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000815,4.1e-05,0.002142,3e-05,0.000319,0.000447,3.5e-05,0.0,0.0


Here features are 63 so we will reduce the features by using PCA (Principle Compponent Analysis):

In [55]:
component_analyzer = PCA(n_components=5)

In [56]:
X = component_analyzer.fit_transform(X)

In [58]:
X

array([[ 4.96509755e-01, -3.83858281e-01, -4.05893451e-02,
        -1.98412628e-04,  3.59023328e-05],
       [ 4.96493846e-01, -3.83857393e-01, -4.07466970e-02,
        -2.02554564e-04,  4.46130685e-05],
       [ 4.96610292e-01, -3.83857109e-01, -3.95870994e-02,
        -1.72629089e-04, -1.24556448e-05],
       ...,
       [-5.00521337e-01,  6.17284598e-01, -3.28850797e-02,
         1.06792422e-02, -1.47527910e-02],
       [-5.00514052e-01,  6.17287321e-01, -3.28092908e-02,
         1.06793481e-02, -1.47534994e-02],
       [-5.00513180e-01,  6.17290001e-01, -3.27975589e-02,
         1.06736133e-02, -1.47558103e-02]])

Splitting the dataset into training and testing Chunks:

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [63]:
def eval_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return mse, mae, rmse

In [67]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(n_estimators = 100),
    'Ada Boost Regression' : AdaBoostRegressor(),
    'XGB Regressor' : XGBRegressor()
}

mdl = []
scr = []

for i in range(len(list(models.values()))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Recording the prediction of model on testing dataset
    pred = model.predict(X_test)

    # Evaluating model
    mse, mae, rmse = eval_model(y_test, pred)

    mdl.append(list(models.keys())[i])
    scr.append(rmse)

    print('----------------------------------------------------- \n ')
    print('{} Model has :'.format(list(models.keys())[i]))
    print('Root Mean Squared Error : {}'.format(rmse))
    print('Mean Squared Error : {}'.format(mse))
    print('Mean Absolute Error : {}\n'.format(mae))

    print('----------------------------------------------------- \n  ')

----------------------------------------------------- 
 
Linear Regression Model has :
Root Mean Squared Error : 0.3070745152950607
Mean Squared Error : 0.09429475794369648
Mean Absolute Error : 0.23335029650246789

----------------------------------------------------- 
  
----------------------------------------------------- 
 
Decision Tree Regression Model has :
Root Mean Squared Error : 0.19436237114502367
Mean Squared Error : 0.03777673131711593
Mean Absolute Error : 0.13064102564102564

----------------------------------------------------- 
  
----------------------------------------------------- 
 
Random Forest Regression Model has :
Root Mean Squared Error : 0.1520723405123593
Mean Squared Error : 0.023125996748906953
Mean Absolute Error : 0.10620689484779228

----------------------------------------------------- 
  
----------------------------------------------------- 
 
Ada Boost Regression Model has :
Root Mean Squared Error : 0.29563238125578717
Mean Squared Error : 0.087

In [68]:
scr

[0.3070745152950607,
 0.19436237114502367,
 0.1520723405123593,
 0.29563238125578717,
 0.1653080767412612]

In [69]:
mdl

['Linear Regression',
 'Decision Tree Regression',
 'Random Forest Regression',
 'Ada Boost Regression',
 'XGB Regressor']

In [74]:
report = list(zip(mdl, scr))

In [79]:
model_report = pd.DataFrame(report , columns = ['Model', 'Root Mean Squared Error']).sort_values(by = 'Root Mean Squared Error', ascending = False)

In [80]:
model_report

Unnamed: 0,Model,Root Mean Squared Error
0,Linear Regression,0.307075
3,Ada Boost Regression,0.295632
1,Decision Tree Regression,0.194362
4,XGB Regressor,0.165308
2,Random Forest Regression,0.152072


Here ,   
Linear Regression Model has performed exceptionally well and has best fit    
Ada Boost has also performed well has wll fit    
Random Forest Regression has worst fit 