# Trying to Predict the Cola Demand in Particular City

In [108]:
#importing the libraries
import numpy as np
import pandas as pd
#visualization library
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeClassifier
#MLP Classifier
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Loading the Dataset

In [74]:
train_df=pd.read_csv('/kaggle/input/predict-demand/train.csv')
test_df=pd.read_csv('/kaggle/input/predict-demand/test.csv')

# Showing the Dataset

In [75]:
train_df

Unnamed: 0,id,date,city,lat,long,pop,shop,brand,container,capacity,price,quantity
0,0.0,31/01/12,Athens,37.97945,23.71622,672130.0,shop_1,kinder-cola,glass,500ml,0.96,13280.0
1,1.0,31/01/12,Athens,37.97945,23.71622,672130.0,shop_1,kinder-cola,plastic,1.5lt,2.86,6727.0
2,2.0,31/01/12,Athens,37.97945,23.71622,672130.0,shop_1,kinder-cola,can,330ml,0.87,9848.0
3,3.0,31/01/12,Athens,37.97945,23.71622,672130.0,shop_1,adult-cola,glass,500ml,1.00,20050.0
4,4.0,31/01/12,Athens,37.97945,23.71622,672130.0,shop_1,adult-cola,can,330ml,0.39,25696.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7555,,,,,,,,,,,,
7556,,,,,,,,,,,,
7557,,,,,,,,,,,,
7558,,,,,,,,,,,,


# Getting the Preliminary Information about the Dataset

In [76]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7560 entries, 0 to 7559
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         6480 non-null   float64
 1   date       6480 non-null   object 
 2   city       6480 non-null   object 
 3   lat        6429 non-null   float64
 4   long       6434 non-null   float64
 5   pop        6480 non-null   float64
 6   shop       6480 non-null   object 
 7   brand      6480 non-null   object 
 8   container  6464 non-null   object 
 9   capacity   6465 non-null   object 
 10  price      6480 non-null   float64
 11  quantity   6480 non-null   float64
dtypes: float64(6), object(6)
memory usage: 708.9+ KB


# #checking the null values in the dataset

In [77]:
train_df.isna().sum()

id           1080
date         1080
city         1080
lat          1131
long         1126
pop          1080
shop         1080
brand        1080
container    1096
capacity     1095
price        1080
quantity     1080
dtype: int64

In [78]:
rows_to_drops=train_df[train_df['id'].isna()].index

In [79]:
train_df=train_df.drop(rows_to_drops,axis=0).reset_index(drop=True)

# Again Checking the Missing Values

In [80]:
train_df.isna().sum()

id            0
date          0
city          0
lat          51
long         46
pop           0
shop          0
brand         0
container    16
capacity     15
price         0
quantity      0
dtype: int64

# Filling the Missing Value of Lat and Long with mean of that column

In [81]:
for column in ['lat','long']:
    train_df[column]=train_df[column].fillna(train_df[column].mean())

# Again Checking the Missing Values

In [82]:
train_df.isna().sum()

id            0
date          0
city          0
lat           0
long          0
pop           0
shop          0
brand         0
container    16
capacity     15
price         0
quantity      0
dtype: int64

# Checking the Mode of Object Column

In [83]:
for column in ['container','capacity']:
    print(column+':'+train_df[column].mode()[0])

container:plastic
capacity:1.5lt


# Filling the Missing Value of Object Column with Mode of that column

In [84]:
for column in ['container','capacity']:
    train_df[column]=train_df[column].fillna(train_df[column].mode()[0])

# Again Checking the Missing Value in the Dataset

In [85]:
train_df.isna().sum()

id           0
date         0
city         0
lat          0
long         0
pop          0
shop         0
brand        0
container    0
capacity     0
price        0
quantity     0
dtype: int64

In [86]:
train_df=train_df.drop('id',axis=1)

In [87]:
{column:list(train_df[column].unique()) for column in train_df.columns if train_df[column].dtype=='object'}

{'date': ['31/01/12',
  '29/02/12',
  '31/03/12',
  '30/04/12',
  '31/05/12',
  '30/06/12',
  '31/07/12',
  '31/08/12',
  '30/09/12',
  '31/10/12',
  '30/11/12',
  '31/12/12',
  '31/01/13',
  '28/02/13',
  '31/03/13',
  '30/04/13',
  '31/05/13',
  '30/06/13',
  '31/07/13',
  '31/08/13',
  '30/09/13',
  '31/10/13',
  '30/11/13',
  '31/12/13',
  '31/01/14',
  '28/02/14',
  '31/03/14',
  '30/04/14',
  '31/05/14',
  '30/06/14',
  '31/07/14',
  '31/08/14',
  '30/09/14',
  '31/10/14',
  '30/11/14',
  '31/12/14',
  '31/01/15',
  '28/02/15',
  '31/03/15',
  '30/04/15',
  '31/05/15',
  '30/06/15',
  '31/07/15',
  '31/08/15',
  '30/09/15',
  '31/10/15',
  '30/11/15',
  '31/12/15',
  '31/01/16',
  '29/02/16',
  '31/03/16',
  '30/04/16',
  '31/05/16',
  '30/06/16',
  '31/07/16',
  '31/08/16',
  '30/09/16',
  '31/10/16',
  '30/11/16',
  '31/12/16',
  '31/01/17',
  '28/02/17',
  '31/03/17',
  '30/04/17',
  '31/05/17',
  '30/06/17',
  '31/07/17',
  '31/08/17',
  '30/09/17',
  '31/10/17',
  '30/11/17'

# Encoding the Object Columns

In [88]:
def onehot_encode(df,columns):
    df=df.copy()
    for column in columns:
        dummies=pd.get_dummies(df[column],prefix=column)
        #concating
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [89]:
onehot_columns=['city','brand','container','shop']
train_df=onehot_encode(train_df,onehot_columns)

In [90]:
train_df['capacity'].unique()

array(['500ml', '1.5lt', '330ml'], dtype=object)

In [91]:
container={'1.5lt':1,
         '500ml':2,
         '330ml':3}

In [92]:
train_df['capacity']=train_df['capacity'].replace(container)

In [93]:
train_df

Unnamed: 0,date,lat,long,pop,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,...,brand_orange-power,container_can,container_glass,container_plastic,shop_shop_1,shop_shop_2,shop_shop_3,shop_shop_4,shop_shop_5,shop_shop_6
0,31/01/12,37.97945,23.71622,672130.0,2,0.96,13280.0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
1,31/01/12,37.97945,23.71622,672130.0,1,2.86,6727.0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
2,31/01/12,37.97945,23.71622,672130.0,3,0.87,9848.0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
3,31/01/12,37.97945,23.71622,672130.0,2,1.00,20050.0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
4,31/01/12,37.97945,23.71622,672130.0,3,0.39,25696.0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,31/12/17,37.96245,23.68708,665871.0,1,1.02,33201.0,1,0,0,...,1,0,0,1,0,0,1,0,0,0
6476,31/12/17,39.63689,22.41761,144302.0,3,0.47,46971.0,0,0,1,...,1,1,0,0,0,0,0,0,1,0
6477,31/12/17,38.24444,21.73444,168501.0,2,1.02,47708.0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6478,31/12/17,40.64361,22.93086,353001.0,1,1.34,27115.0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [94]:
train_df['date']=pd.to_datetime(train_df['date'])

# Extracting the Month and Year from Date Column

In [95]:
train_df['Year']=train_df['date'].dt.year
train_df['Month']=train_df['date'].dt.month

# Dropping the Original Date Column

In [96]:
train_df=train_df.drop('date',axis=1)

In [97]:
train_df

Unnamed: 0,lat,long,pop,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,city_Patra,...,container_glass,container_plastic,shop_shop_1,shop_shop_2,shop_shop_3,shop_shop_4,shop_shop_5,shop_shop_6,Year,Month
0,37.97945,23.71622,672130.0,2,0.96,13280.0,1,0,0,0,...,1,0,1,0,0,0,0,0,2012,1
1,37.97945,23.71622,672130.0,1,2.86,6727.0,1,0,0,0,...,0,1,1,0,0,0,0,0,2012,1
2,37.97945,23.71622,672130.0,3,0.87,9848.0,1,0,0,0,...,0,0,1,0,0,0,0,0,2012,1
3,37.97945,23.71622,672130.0,2,1.00,20050.0,1,0,0,0,...,1,0,1,0,0,0,0,0,2012,1
4,37.97945,23.71622,672130.0,3,0.39,25696.0,1,0,0,0,...,0,0,1,0,0,0,0,0,2012,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,37.96245,23.68708,665871.0,1,1.02,33201.0,1,0,0,0,...,0,1,0,0,1,0,0,0,2017,12
6476,39.63689,22.41761,144302.0,3,0.47,46971.0,0,0,1,0,...,0,0,0,0,0,0,1,0,2017,12
6477,38.24444,21.73444,168501.0,2,1.02,47708.0,0,0,0,1,...,1,0,0,0,0,0,0,1,2017,12
6478,40.64361,22.93086,353001.0,1,1.34,27115.0,0,0,0,0,...,0,1,0,0,0,1,0,0,2017,12


In [98]:
{column:list(train_df[column].unique()) for column in train_df.columns if train_df[column].dtype=='object'}

{}

# Splitting between target and feature columns

In [101]:
y=train_df['quantity']
x=train_df.drop('quantity',axis=1)

# Scaling the feature Dataset and StandardScaler

In [103]:
x

Unnamed: 0,lat,long,pop,capacity,price,city_Athens,city_Irakleion,city_Larisa,city_Patra,city_Thessaloniki,...,container_glass,container_plastic,shop_shop_1,shop_shop_2,shop_shop_3,shop_shop_4,shop_shop_5,shop_shop_6,Year,Month
0,37.97945,23.71622,672130.0,2,0.96,1,0,0,0,0,...,1,0,1,0,0,0,0,0,2012,1
1,37.97945,23.71622,672130.0,1,2.86,1,0,0,0,0,...,0,1,1,0,0,0,0,0,2012,1
2,37.97945,23.71622,672130.0,3,0.87,1,0,0,0,0,...,0,0,1,0,0,0,0,0,2012,1
3,37.97945,23.71622,672130.0,2,1.00,1,0,0,0,0,...,1,0,1,0,0,0,0,0,2012,1
4,37.97945,23.71622,672130.0,3,0.39,1,0,0,0,0,...,0,0,1,0,0,0,0,0,2012,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,37.96245,23.68708,665871.0,1,1.02,1,0,0,0,0,...,0,1,0,0,1,0,0,0,2017,12
6476,39.63689,22.41761,144302.0,3,0.47,0,0,1,0,0,...,0,0,0,0,0,0,1,0,2017,12
6477,38.24444,21.73444,168501.0,2,1.02,0,0,0,1,0,...,1,0,0,0,0,0,0,1,2017,12
6478,40.64361,22.93086,353001.0,1,1.34,0,0,0,0,1,...,0,1,0,0,0,1,0,0,2017,12


In [105]:
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

# Splitting the Feature and Target Dataset into train and test set

In [109]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

# Checking the Shape of the train and test set

In [110]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4536, 26)
(1944, 26)
(4536,)
(1944,)


# Storing the Models to a Dictionary

In [111]:
models={'KN':KNeighborsRegressor(),
'LR':LinearRegression(),
'Rid':Ridge(),
'LSVR':LinearSVR(),
'SVR':SVR(),
'DT':DecisionTreeClassifier(),
'MLP':MLPRegressor(),
'RF':RandomForestRegressor(),
'GB':GradientBoostingRegressor()}


# Checking the Accuracy of Each of the Model

In [112]:
for name,model in models.items():
    print(name)
    model.fit(x_train,y_train)
    print(model.score(x_test,y_test))
    

KN
0.749808232234464
LR
0.5920033704406988
Rid
0.5920311000952727
LSVR
-1.783893061620228
SVR
-0.027579690787660427
DT
0.0
MLP




-0.5746576072954659
RF
0.9263886435427929
GB
0.902603853245756


In [113]:
y_test

810      9719.0
6236    20361.0
1000    13161.0
2949    42332.0
6097    27942.0
         ...   
1301    28184.0
2305    28905.0
4593    29011.0
3322     9949.0
1358    49301.0
Name: quantity, Length: 1944, dtype: float64

In [115]:
model=RandomForestRegressor()
model.fit(x_train,y_train)
model.predict(x_test)

array([11005.92, 20100.02, 14806.76, ..., 34720.11, 10248.35, 50668.71])