# Trying to Predict the Price based on the various features

# Importing the Usual Library

In [3]:
#importing library 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR,SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings(action='ignore')

# Loading the Dataset

In [4]:
df=pd.read_csv('/kaggle/input/vegetable-market/Vegetable_market.csv')
#loading the dataset
df
df_test=pd.read_csv('/kaggle/input/vegetable-market/test.csv')

In [5]:
df

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


# Checking for Preliminary Information about the dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


In [7]:
df['Vegetable condition'].unique()

array(['fresh', 'scrap', 'avarage', 'scarp'], dtype=object)

# Preprocessing the Dataset

In [23]:
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column])
    #concating the dummies column
    df=pd.concat([df,dummies],axis=1)
    #dropping the original column
    df=df.drop(column,axis=1)
    return df

In [30]:
def preprocess_inputs(df):
    df=df.copy()
    #Binary Encoding
    df['Deasaster Happen in last 3month']=df['Deasaster Happen in last 3month'].apply(lambda x:1 if x=='yes' else 0)
    #replacing the scarp with scrap with replace function in pandas
    df['Vegetable condition']=df['Vegetable condition'].replace({'scarp':'scrap'})
    ordinal_month={'jan':1, 'apr':3, 'july':6, 'sept':8, 'oct':9, 'dec':12, 'may':4, 'aug':7, 'june':5,
       ' ':np.NaN, 'march':2}
    df['Month']=df['Month'].replace(ordinal_month)
    #replacing the month column with mode of the column
    df['Month']=df['Month'].fillna(df['Month'].mode()[0])
    df=onehot_encode(df,'Vegetable')
    df=onehot_encode(df,'Season')
    df=onehot_encode(df,'Vegetable condition')
    y=df['Price per kg']
    x=df.drop('Price per kg',axis=1)
    #scaling the feature dataset
    scaler=StandardScaler()
    x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    
    
    
    
    return x_train,x_test,y_train,y_test

In [31]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(84, 28)
(37, 28)
(84,)
(37,)


In [32]:
x_train

Unnamed: 0,Month,Temp,Deasaster Happen in last 3month,Bitter gourd,Raddish,brinjal,cabage,califlower,chilly,cucumber,...,radish,tomato,autumn,monsoon,spring,summer,winter,avarage,fresh,scrap
58,-0.083044,0.765841,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,-0.458258,-0.159448,1.506742,-0.959497,-0.444994,0.650791,-0.390360
43,-0.752928,0.227081,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,2.182179,-0.159448,-0.663684,-0.959497,-0.444994,-1.536591,2.561738
30,0.921784,0.227081,1.667708,-0.207614,-0.228416,-0.228416,3.758324,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,2.182179,-0.159448,-0.663684,-0.959497,-0.444994,-1.536591,2.561738
76,0.921784,0.658089,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,5.408327,...,-0.1849,-0.331801,-0.129641,2.182179,-0.159448,-0.663684,-0.959497,2.247221,-1.536591,-0.390360
98,0.921784,0.550337,1.667708,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,2.182179,-0.159448,-0.663684,-0.959497,-0.444994,0.650791,-0.390360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,-0.752928,1.089097,1.667708,-0.207614,-0.228416,-0.228416,3.758324,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,-0.458258,-0.159448,1.506742,-0.959497,2.247221,-1.536591,-0.390360
50,-0.752928,-1.065944,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,-0.458258,-0.159448,-0.663684,1.042213,-0.444994,0.650791,-0.390360
66,0.251899,1.735609,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,-0.207614,-0.184900,...,-0.1849,-0.331801,-0.129641,-0.458258,-0.159448,1.506742,-0.959497,-0.444994,0.650791,-0.390360
113,-0.752928,-1.065944,-0.599625,-0.207614,-0.228416,-0.228416,-0.266076,-0.266076,4.816638,-0.184900,...,-0.1849,-0.331801,-0.129641,-0.458258,-0.159448,-0.663684,1.042213,-0.444994,0.650791,-0.390360


# Training the Models

In [33]:
models={
    'Linear Regression':LinearRegression(),
    'Linear Regression (L2 Regularization)':Ridge(),
    'Linear Regression (L1 Regularization)':Lasso(),
    'K-Nearest Neighbors':KNeighborsRegressor(),
    'Neural Network':MLPRegressor(),
    'Support Vector Machine (Linear Kernel)':LinearSVR(),
    'SVM (Non Linear Kernel)': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest':RandomForestRegressor(),
    'Gradient Boosting':GradientBoostingRegressor(),
    'XGBoost':XGBRegressor(),
    'LightGBM':LGBMRegressor(),
    'CatBoost':CatBoostRegressor(verbose=0)
}

In [34]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print(model.score(x_test,y_test))

Linear Regression
0.4510004194477537
Linear Regression (L2 Regularization)
0.4628650926295138
Linear Regression (L1 Regularization)
0.5047814864549266
K-Nearest Neighbors
0.4767529773903606
Neural Network
-0.2572732316480124
Support Vector Machine (Linear Kernel)
0.3060746147061837
SVM (Non Linear Kernel)
0.013446782708270044
Decision Tree
0.7625259334060087
Random Forest
0.7492645550430723
Gradient Boosting
0.7574582453828667
XGBoost
0.7601782361155701
LightGBM
0.10538164901736169
CatBoost
0.6559705686252222


In [11]:
x['Vegetable condition'].unique()

array(['fresh', 'scrap', 'avarage'], dtype=object)

In [19]:
x['Month'].unique()

array([ 1.,  3.,  6.,  8.,  9., 12.,  4.,  7.,  5.,  2.])