# Import Library and Mount Drive

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
from google.colab import drive 
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Liquor Sales Analysis/input

/content/drive/MyDrive/Liquor Sales Analysis/input


In [5]:
category_2017 = pd.read_csv('category_2017.csv')


In [6]:
category_2017.drop('Unnamed: 0',axis = 1, inplace = True)

In [35]:
X = pd.read_csv('train_AMERICAN VODKAS.csv')
y= pd.read_csv('test_AMERICAN VODKAS.csv')

In [36]:
X.drop("Unnamed: 0",axis = 1,inplace = True)
y.drop("Unnamed: 0", axis = 1,inplace = True)

X.drop("Category Name",axis = 1,inplace = True)


# Train a Linear Regression Model 

## Train Test Split 

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

## Preparing Data

In [39]:

pipeline = Pipeline([
    ('std_scalar', StandardScaler())
])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

## Linear Regresssion


In [15]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression(normalize=True)
lin_reg.fit(X_train,y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




LinearRegression(normalize=True)

In [16]:
print(lin_reg.intercept_)

[68.95597543]


In [17]:
pred = lin_reg.predict(X_test)

In [18]:
test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 30.405468949597754
MSE: 1863.7589117070197
RMSE: 43.17127414968221
R2 Square 0.2757487133471713
__________________________________
Train set evaluation:
_____________________________________
MAE: 30.54773497596955
MSE: 1904.7819951283254
RMSE: 43.643808210653724
R2 Square 0.25409968515070813
__________________________________


In [19]:
results_df = pd.DataFrame(data=[["Linear Regression", *evaluate(y_test, test_pred) , cross_val(LinearRegression())]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Linear Regression,30.405469,1863.758912,43.171274,0.275749,0.132067


## Ridge Regression

In [20]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 30.370800096937952
MSE: 1892.0305714904825
RMSE: 43.49747776010101
R2 Square 0.26476242866980926
__________________________________
Train set evaluation:
_____________________________________
MAE: 30.551830468689385
MSE: 1931.9178338630682
RMSE: 43.953587269562746
R2 Square 0.24347346613577014
__________________________________


In [21]:
results_df_2 = pd.DataFrame(data=[["Ridge Regression", *evaluate(y_test, test_pred) , cross_val(Ridge())]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df = results_df.append(results_df_2, ignore_index=True)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Linear Regression,30.405469,1863.758912,43.171274,0.275749,0.132067
1,Ridge Regression,30.3708,1892.030571,43.497478,0.264762,0.156381


## LASSO Regression

In [22]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.1, 
              precompute=True, 
#               warm_start=True, 
              positive=True, 
              selection='random',
              random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 33.34755918851477
MSE: 2079.658804489407
RMSE: 45.60327624732029
R2 Square 0.1918506436163412
__________________________________
Train set evaluation:
_____________________________________
MAE: 33.39832062986834
MSE: 2096.619072817457
RMSE: 45.78885315027509
R2 Square 0.17897752575710713
__________________________________


In [23]:
results_df_2 = pd.DataFrame(data=[["Lasso Regression", *evaluate(y_test, test_pred) , cross_val(Lasso())]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df = results_df.append(results_df_2, ignore_index=True)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Linear Regression,30.405469,1863.758912,43.171274,0.275749,0.132067
1,Ridge Regression,30.3708,1892.030571,43.497478,0.264762,0.156381
2,Lasso Regression,33.347559,2079.658804,45.603276,0.191851,0.165492


## ANN

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

X_train = np.array(X_train[0:5000])
X_test = np.array(X_test[0:2000])
y_train = np.array(y_train[0:5000])
y_test = np.array(y_test[0:2000])

model = Sequential()

model.add(Dense(X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.6))

model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))

model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.2))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(1))

model.compile(optimizer=Adam(0.00001), loss='mse')

r = model.fit(X_train, y_train,
              validation_data=(X_test,y_test),
              batch_size=1,
              epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 35.244957092475886
MSE: 2112.3240557522217
RMSE: 45.96002671618264
R2 Square 0.19905334671453057
__________________________________
Train set evaluation:
_____________________________________
MAE: 34.67063257966614
MSE: 2129.187765873093
RMSE: 46.14312262811321
R2 Square 0.18041283863357693
__________________________________


In [30]:
results_df_2 = pd.DataFrame(data=[["Artficial Neural Network", *evaluate(y_test, test_pred), 0]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
results_df = results_df.append(results_df_2, ignore_index=True)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Linear Regression,30.405469,1863.758912,43.171274,0.275749,0.132067
1,Ridge Regression,30.3708,1892.030571,43.497478,0.264762,0.156381
2,Lasso Regression,33.347559,2079.658804,45.603276,0.191851,0.165492
3,Artficial Neural Network,35.244957,2112.324056,45.960027,0.199053,0.0


## Random Forest

In [40]:
from sklearn.ensemble import RandomForestRegressor
print(X_train.shape)
rf_reg = RandomForestRegressor(n_estimators=1000)
rf_reg.fit(X_train, y_train)

test_pred = rf_reg.predict(X_test)
train_pred = rf_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

(70000, 14)


  after removing the cwd from sys.path.


Test set evaluation:
_____________________________________
MAE: 7.715750449235679
MSE: 420.88493556144533
RMSE: 20.515480388268887
R2 Square 0.834434756388432
__________________________________
Train set evaluation:
_____________________________________
MAE: 2.848769110869128
MSE: 57.05218748609474
RMSE: 7.553289845232655
R2 Square 0.9777752502241582
__________________________________


In [45]:
results_df_2 = pd.DataFrame(data=[["Random Forest Regressor", *evaluate(y_test, test_pred), 0]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
results_df = results_df.append(results_df_2, ignore_index=True)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Linear Regression,30.405469,1863.758912,43.171274,0.275749,0.132067
1,Ridge Regression,30.3708,1892.030571,43.497478,0.264762,0.156381
2,Lasso Regression,33.347559,2079.658804,45.603276,0.191851,0.165492
3,Artficial Neural Network,35.244957,2112.324056,45.960027,0.199053,0.0
4,Random Forest Regressor,7.71575,420.884936,20.51548,0.834435,0.0


## SVM


In [None]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear', C=10000, epsilon=0.001)
svm_reg.fit(X_train, y_train)

test_pred = svm_reg.predict(X_test)
train_pred = svm_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

  y = column_or_1d(y, warn=True)


In [None]:
results_df_2 = pd.DataFrame(data=[["SVM Regressor", *evaluate(y_test, test_pred), 0]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
results_df = results_df.append(results_df_2, ignore_index=True)
results_df

# Trainning Model Random Forest top_count_category

In [10]:
for i in category_2017['Category Name'] :
  X = pd.read_csv('train_'+ i +'.csv')
  y = pd.read_csv('test_'+ i + '.csv')

  X.drop("Unnamed: 0",axis = 1,inplace = True)
  y.drop('Unnamed: 0', axis = 1,inplace = True)

  X.drop('Category Name',axis = 1,inplace = True)
  # split train test
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
  # print(  X_train.shape)
  # print(  y_train.shape)
  # preparing data
  pipeline = Pipeline([
    ('std_scalar', StandardScaler())
  ])

  X_train = pipeline.fit_transform(X_train)
  X_test = pipeline.transform(X_test)
  # trainning model 
  print('Trainning Model Data ' + i )
  rf_reg = RandomForestRegressor(n_estimators=1000)
  rf_reg.fit(X_train, y_train)

  test_pred = rf_reg.predict(X_test)
  train_pred = rf_reg.predict(X_train)

  print('Test set evaluation:\n_____________________________________')
  print_evaluate(y_test, test_pred)

  print('Train set evaluation:\n_____________________________________')
  print_evaluate(y_train, train_pred)


Trainning Model Data AMERICAN VODKAS




Test set evaluation:
_____________________________________
MAE: 7.69677008717337
MSE: 402.85460129057805
RMSE: 20.071238160377103
R2 Square 0.8431630252502424
__________________________________
Train set evaluation:
_____________________________________
MAE: 2.8359419844538745
MSE: 55.77554226756073
RMSE: 7.468302502413834
R2 Square 0.978175785573453
__________________________________
Trainning Model Data CANADIAN WHISKIES




Test set evaluation:
_____________________________________
MAE: 15.501190389103717
MSE: 1414.5821909711374
RMSE: 37.610931801420946
R2 Square 0.8222532501601189
__________________________________
Train set evaluation:
_____________________________________
MAE: 5.7475494839999905
MSE: 197.09038614202646
RMSE: 14.03888835136267
R2 Square 0.9756141639318328
__________________________________
Trainning Model Data AMERICAN VODKA




Test set evaluation:
_____________________________________
MAE: 7.757166106495554
MSE: 409.26115244098105
RMSE: 20.230203964393958
R2 Square 0.8310578832246343
__________________________________
Train set evaluation:
_____________________________________
MAE: 2.836875547329636
MSE: 54.99768987275624
RMSE: 7.416042736713175
R2 Square 0.9771699662790276
__________________________________
Trainning Model Data STRAIGHT BOURBON WHISKIES




Test set evaluation:
_____________________________________
MAE: 17.91340814546389
MSE: 1531.1799211278342
RMSE: 39.13029416101845
R2 Square 0.8104230248490176
__________________________________
Train set evaluation:
_____________________________________
MAE: 6.533599190282692
MSE: 206.35509851665836
RMSE: 14.365065211013084
R2 Square 0.9746444289360462
__________________________________
Trainning Model Data SPICED RUM




Test set evaluation:
_____________________________________
MAE: 9.717507415624569
MSE: 632.0768291231193
RMSE: 25.1411381827299
R2 Square 0.8873012007264673
__________________________________
Train set evaluation:
_____________________________________
MAE: 3.5708191250134425
MSE: 86.38604578850793
RMSE: 9.29440938352233
R2 Square 0.9845008460539428
__________________________________


FileNotFoundError: ignored