In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle

In [3]:
df = pd.read_csv("../data/avocado.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,27-12-2015,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,13-12-2015,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,06-12-2015,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,29-11-2015,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [5]:
# Preprocessing [Generating Some New Features]
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter
df = df.drop(['Date', 'Unnamed: 0'], axis=1)
df['type'] = df['type'].replace({'conventional': 0, 'organic': 1})

  df['type'] = df['type'].replace({'conventional': 0, 'organic': 1})


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AveragePrice  18249 non-null  float64
 1   Total Volume  18249 non-null  float64
 2   4046          18249 non-null  float64
 3   4225          18249 non-null  float64
 4   4770          18249 non-null  float64
 5   Total Bags    18249 non-null  float64
 6   Small Bags    18249 non-null  float64
 7   Large Bags    18249 non-null  float64
 8   XLarge Bags   18249 non-null  float64
 9   type          18249 non-null  int64  
 10  year          18249 non-null  int64  
 11  region        18249 non-null  object 
 12  Month         18249 non-null  int32  
 13  Quarter       18249 non-null  int32  
dtypes: float64(9), int32(2), int64(2), object(1)
memory usage: 1.8+ MB


In [7]:
print(df['Quarter'].unique())

[4 3 2 1]


In [8]:
df.drop(['4046','4225','4770'],axis=1,inplace = True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AveragePrice  18249 non-null  float64
 1   Total Volume  18249 non-null  float64
 2   Total Bags    18249 non-null  float64
 3   Small Bags    18249 non-null  float64
 4   Large Bags    18249 non-null  float64
 5   XLarge Bags   18249 non-null  float64
 6   type          18249 non-null  int64  
 7   year          18249 non-null  int64  
 8   region        18249 non-null  object 
 9   Month         18249 non-null  int32  
 10  Quarter       18249 non-null  int32  
dtypes: float64(6), int32(2), int64(2), object(1)
memory usage: 1.4+ MB


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AveragePrice  18249 non-null  float64
 1   Total Volume  18249 non-null  float64
 2   Total Bags    18249 non-null  float64
 3   Small Bags    18249 non-null  float64
 4   Large Bags    18249 non-null  float64
 5   XLarge Bags   18249 non-null  float64
 6   type          18249 non-null  int64  
 7   year          18249 non-null  int64  
 8   region        18249 non-null  object 
 9   Month         18249 non-null  int32  
 10  Quarter       18249 non-null  int32  
dtypes: float64(6), int32(2), int64(2), object(1)
memory usage: 1.4+ MB


In [11]:
df['type'].unique()

array([0, 1], dtype=int64)

## SCALING

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
df.columns

Index(['AveragePrice', 'Total Volume', 'Total Bags', 'Small Bags',
       'Large Bags', 'XLarge Bags', 'type', 'year', 'region', 'Month',
       'Quarter'],
      dtype='object')

In [15]:
cols = ['Total Volume' , 'Total Bags' , 'Small Bags']

In [16]:
df[cols] = scaler.fit_transform(df[cols])

In [17]:
df.head()

Unnamed: 0,AveragePrice,Total Volume,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Month,Quarter
0,1.33,-0.227716,-0.23417,-0.232647,93.25,0.0,0,2015,Albany,12,4
1,1.35,-0.230427,-0.23335,-0.231568,97.49,0.0,0,2015,Albany,12,4
2,0.93,-0.212085,-0.23473,-0.233399,103.14,0.0,0,2015,Albany,12,4
3,1.08,-0.223444,-0.237096,-0.236568,133.76,0.0,0,2015,Albany,12,4
4,1.28,-0.231538,-0.236718,-0.236154,197.69,0.0,0,2015,Albany,11,4


## GENERATING DUMMIES

In [18]:
df = pd.get_dummies(df,columns = ['region'] , drop_first = True)

## MODEL CREATING

In [19]:
X = df.drop('AveragePrice' , axis=1)

In [20]:
Y = df['AveragePrice']

In [21]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.3,random_state=42) ##SPLITTING DATA

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , r2_score

In [23]:
rf_classifier = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=5,
    max_depth=10
)

In [24]:
model_rf = rf_classifier.fit(X_train ,Y_train) ## MODEL CREATING

In [25]:
model_rf

In [26]:
# with open("models/avocado_mode.pkl","wb") as model_file:
#     pickle.dump(model,model_file)

## TRAIN MODEL EVALUATION

In [27]:
y_pred_train = model_rf.predict(X_train)

In [28]:
r2_score_on_train = r2_score(Y_train,y_pred_train)
print(r2_score_on_train*100) ## FINAL R2_SCORE ON TRAIN

80.6609307924281


In [29]:
mse_of_train = mean_squared_error(Y_train,y_pred_train)
print(mse_of_train)

0.03142247393781756


## TEST  MODEL EVALUATION

In [30]:
y_pred_test = model_rf.predict(X_test)

In [31]:
r2_score_on_test = r2_score(Y_test , y_pred_test)
print(r2_score_on_test*100) ## FINAL R2_SCORE ON TEST

77.03289902074269


In [32]:
mse_of_test = mean_squared_error(Y_test,y_pred_test)
print(mse_of_test)

0.037054718567676434


## FINAL DATASET

In [33]:
print(df.head)
df['type'].isnull().sum()

<bound method NDFrame.head of        AveragePrice  Total Volume  Total Bags  Small Bags  Large Bags  \
0              1.33     -0.227716   -0.234170   -0.232647       93.25   
1              1.35     -0.230427   -0.233350   -0.231568       97.49   
2              0.93     -0.212085   -0.234730   -0.233399      103.14   
3              1.08     -0.223444   -0.237096   -0.236568      133.76   
4              1.28     -0.231538   -0.236718   -0.236154      197.69   
...             ...           ...         ...         ...         ...   
18244          1.63     -0.241373   -0.229301   -0.226665      431.85   
18245          1.71     -0.242296   -0.233594   -0.232196      324.80   
18246          1.87     -0.242331   -0.233463   -0.231644       42.31   
18247          1.93     -0.241625   -0.231866   -0.229543       50.00   
18248          1.62     -0.241253   -0.230807   -0.228111       26.01   

       XLarge Bags  type  year  Month  Quarter  ...  region_SouthCarolina  \
0              0

0

In [34]:
# Importing pickle
import pickle

In [35]:
pickle.dump(model_rf,open("model_rf.pkl","wb"))

In [36]:
regions = [
            'region_atlanta', 'region_baltimorewashington', 'region_boise',
            'region_boston', 'region_buffalorochester', 'region_california',
            'region_charlotte', 'region_chicago', 'region_cincinnatidayton',
            'region_columbus', 'region_dallasftworth', 'region_denver',
            'region_detroit', 'region_grandrapids', 'region_greatlakes',
            'region_harrisburgscranton', 'region_hartfordspringfield',
            'region_houston', 'region_indianapolis', 'region_jacksonville',
            'region_lasvegas', 'region_losangeles', 'region_louisville',
            'region_miamiftlauderdale', 'region_midsouth', 'region_nashville',
            'region_neworleansmobile', 'region_newyork', 'region_northeast',
            'region_northernnewengland', 'region_orlando', 'region_philadelphia',
            'region_phoenixtucson', 'region_pittsburgh', 'region_plains',
            'region_portland', 'region_raleighgreensboro', 'region_richmondnorfolk',
            'region_roanoke', 'region_sacramento', 'region_sandiego',
            'region_sanfrancisco', 'region_seattle', 'region_southcarolina',
            'region_southcentral', 'region_southeast', 'region_spokane',
            'region_stlouis', 'region_syracuse', 'region_tampa', 'region_totalus',
            'region_west', 'region_westtexnewmexico'
        ]

In [37]:
len(regions)

53