In [208]:
#import numpy
import numpy as np
#import pandas to manipulate tabular data
import pandas as pd
#train_test_split
from sklearn.model_selection import train_test_split
#StandardScaler
from sklearn.preprocessing import StandardScaler
#linear_model
from sklearn.linear_model import LinearRegression,Ridge,Lasso
#neighbors
from sklearn.neighbors import KNeighborsRegressor
#neural_network
from sklearn.neural_network import MLPRegressor
#Support Vector Machine
from sklearn.svm import LinearSVR,SVR
#tree based models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
#XGBoost
from xgboost import XGBRegressor
#lightgbm
from lightgbm import LGBMRegressor
#catboost
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings(action='ignore')

# Loading the Dataset

In [209]:
df=pd.read_csv('/kaggle/input/vegetable-market/Vegetable_market.csv')
#loading the dataset
df

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [210]:
df.isna().sum()

Vegetable                          0
Season                             0
Month                              0
Temp                               0
Deasaster Happen in last 3month    0
Vegetable condition                0
Price per kg                       0
dtype: int64

# Checking the Preliminary Information

In [211]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


# Creating the Preprocessing Function

# Checking for unique value in each column

In [212]:
{column:len(df[column].unique()) for column in df.columns}

{'Vegetable': 17,
 'Season': 5,
 'Month': 11,
 'Temp': 17,
 'Deasaster Happen in last 3month': 2,
 'Vegetable condition': 4,
 'Price per kg': 43}

In [213]:
df['Vegetable condition'].unique()

array(['fresh', 'scrap', 'avarage', 'scarp'], dtype=object)

In [214]:
def onehot_encode(df,columns):
    df=df.copy()
    for column in columns:
        dummies=pd.get_dummies(df[column],prefix=column)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [215]:
def preprocess_inputs(df):
    df=df.copy()
    
    
    #encoding the binary column
    df['Deasaster Happen in last 3month']=df['Deasaster Happen in last 3month'].apply(lambda x:1 if x=='yes' else 0)
    
    #renaming the Vegetable condition column
    df['Vegetable condition']=df['Vegetable condition'].replace({'scarp':'scrap','avarage':'average'})
    #encoding the month column
    month_encoding={'jan':1, 'apr':4, 'july':7, 'sept':9, 'oct':10, 'dec':12, 'may':5, 'aug':8, 'june':6, 'march':4}
    df['Month']=df['Month'].replace(month_encoding)
    df['Month']=df['Month'].replace({' ':np.NaN})
    df['Month']=df['Month'].fillna(df['Month'].mode()[0])
    df['Month']=df['Month'].astype(np.int)
    #encoding the remaining categorical column
    onehot_column=['Vegetable','Season','Vegetable condition']
    df=onehot_encode(df,onehot_column)
    #splitting the dataset into target and feature columns
    y=df['Price per kg']
    x=df.drop('Price per kg',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)
    
    return x_train,x_test,y_train,y_test

In [216]:
#checking for missing values
x.isna().sum()

Month                              3
Temp                               0
Deasaster Happen in last 3month    0
Vegetable_Bitter gourd             0
Vegetable_Raddish                  0
Vegetable_brinjal                  0
Vegetable_cabage                   0
Vegetable_califlower               0
Vegetable_chilly                   0
Vegetable_cucumber                 0
Vegetable_garlic                   0
Vegetable_ginger                   0
Vegetable_okra                     0
Vegetable_onion                    0
Vegetable_peas                     0
Vegetable_pointed grourd           0
Vegetable_potato                   0
Vegetable_pumkin                   0
Vegetable_radish                   0
Vegetable_tomato                   0
Season_autumn                      0
Season_monsoon                     0
Season_spring                      0
Season_summer                      0
Season_winter                      0
Vegetable condition_average        0
Vegetable condition_fresh          0
V

In [217]:
x['Month'].unique()

array([ 1.,  4.,  7.,  9., 10., 12.,  5.,  8.,  6., nan])

In [218]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 28 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Month                            118 non-null    float64
 1   Temp                             121 non-null    int64  
 2   Deasaster Happen in last 3month  121 non-null    int64  
 3   Vegetable_Bitter gourd           121 non-null    uint8  
 4   Vegetable_Raddish                121 non-null    uint8  
 5   Vegetable_brinjal                121 non-null    uint8  
 6   Vegetable_cabage                 121 non-null    uint8  
 7   Vegetable_califlower             121 non-null    uint8  
 8   Vegetable_chilly                 121 non-null    uint8  
 9   Vegetable_cucumber               121 non-null    uint8  
 10  Vegetable_garlic                 121 non-null    uint8  
 11  Vegetable_ginger                 121 non-null    uint8  
 12  Vegetable_okra        

In [219]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(84, 28)
(37, 28)
(84,)
(37,)


# # Training the Model

In [220]:
models={
    'Linear Regression':LinearRegression(),
    'Linear Regression(L2 Regularization)':Ridge(),
    'Linear Regression(L1 Regularization)':Lasso(),
    'K-Nearest Neighbors':KNeighborsRegressor(),
    'Neural Network':MLPRegressor(),
    'Support Vector Machine (Linear Kernel)':LinearSVR(),
    'Support Vector Machine (RBF Kernel)':SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost':XGBRegressor(),
    'LightGBM':LGBMRegressor(),
    'CatBoost':CatBoostRegressor(verbose=0)
}

In [221]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print(model.score(x_test,y_test))

Linear Regression
0.5119889372622126
Linear Regression(L2 Regularization)
0.5132251643463979
Linear Regression(L1 Regularization)
0.5408902221930254
K-Nearest Neighbors
0.23010854464581498
Neural Network
-0.4522093920479888
Support Vector Machine (Linear Kernel)
0.046073355077541955
Support Vector Machine (RBF Kernel)
-0.15733156677926008
Decision Tree
0.35913274382476335
Random Forest
0.6471516422081027
Gradient Boosting
0.620638523274462
XGBoost
0.12305482472973994
LightGBM
-0.08233421664270146
CatBoost
0.4938761728853075


In [230]:
x_test[]

Unnamed: 0,Month,Temp,Deasaster Happen in last 3month,Vegetable_Bitter gourd,Vegetable_Raddish,Vegetable_brinjal,Vegetable_cabage,Vegetable_califlower,Vegetable_chilly,Vegetable_cucumber,...,Vegetable_radish,Vegetable_tomato,Season_autumn,Season_monsoon,Season_spring,Season_summer,Season_winter,Vegetable condition_average,Vegetable condition_fresh,Vegetable condition_scrap
0,-0.901587,-1.053837,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
52,-0.014087,0.534482,1.732051,-0.251577,-0.19245,-0.251577,-0.301511,9.110434,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,1.0,-0.688247,-0.953463,-0.4279,0.650945,-0.408248
28,0.873413,0.322706,1.732051,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,3.316625,-0.156174,2.144761,0.0,-0.688247,-0.953463,-0.4279,0.650945,-0.408248
6,-0.901587,-1.053837,-0.57735,-0.251577,5.196152,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
113,-0.901587,-1.053837,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,4.472136,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
87,-0.901587,-1.053837,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
15,-0.901587,-1.053837,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
78,-0.901587,-1.053837,-0.57735,-0.251577,5.196152,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,-0.688247,1.048809,-0.4279,0.650945,-0.408248
103,-0.901587,0.216818,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,2.144761,0.0,-0.688247,-0.953463,-0.4279,-1.536229,2.44949
91,-0.014087,0.852145,-0.57735,-0.251577,-0.19245,-0.251577,-0.301511,-0.109764,-0.223607,-0.19245,...,-0.19245,-0.301511,-0.156174,-0.466252,0.0,1.452966,-0.953463,-0.4279,0.650945,-0.408248


In [222]:
df['Month'].unique()

array(['jan', 'apr', 'july', 'sept', 'oct', 'dec', 'may', 'aug', 'june',
       ' ', 'march'], dtype=object)

# Checking for object columns

In [223]:
df.select_dtypes('object').columns

Index(['Vegetable', 'Season', 'Month', 'Deasaster Happen in last 3month',
       'Vegetable condition'],
      dtype='object')

In [224]:
y.isna().sum()

0