In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from  lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
df = pd.read_csv('Transformed_survey_results.csv')
df.head(3)

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,BSI
0,R00001,M,3,Working Professional,1,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,R00002,F,4,Working Professional,5,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0
2,R00003,F,1,Working Professional,5,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5,0


# spliting the data set for training

In [3]:
df.drop(['respondent_id'],axis=1,inplace=True)
df.head(2)

Unnamed: 0,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,BSI
0,M,3,Working Professional,1,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,F,4,Working Professional,5,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0


In [4]:
X = df.drop(['price_range'],axis=1)
y = df['price_range']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Feature encoding

In [6]:
label_cols = ['age_group','income_levels','health_concerns','consume_frequency(weekly)','preferable_consumption_size']

le = LabelEncoder()

for col in label_cols:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [7]:
X_train['consume_frequency(weekly)'].value_counts()

consume_frequency(weekly)
1    8810
2    7314
0    6343
Name: count, dtype: int64

In [8]:
remaining_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=remaining_cols,drop_first=True)
X_test = pd.get_dummies(X_test, columns=remaining_cols, drop_first=True)

X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [9]:
remaining_cols

['gender',
 'occupation',
 'current_brand',
 'reasons_for_choosing_brands',
 'flavor_preference',
 'purchase_channel',
 'packaging_preference',
 'typical_consumption_situations']

In [10]:
X_train

Unnamed: 0,zone,income_levels,consume_frequency(weekly),preferable_consumption_size,awareness_of_other_brands,health_concerns,age_group,cf_ab_score,zas_score,BSI,...,current_brand_Newcomer,reasons_for_choosing_brands_Brand Reputation,reasons_for_choosing_brands_Price,reasons_for_choosing_brands_Quality,flavor_preference_Traditional,purchase_channel_Retail Store,packaging_preference_Premium,packaging_preference_Simple,typical_consumption_situations_Casual (eg. At home),typical_consumption_situations_Social (eg. Parties)
8374,2,2,0,2,3,1,0,0.25,4,0,...,True,False,False,False,True,False,False,True,False,True
26345,3,3,2,1,2,2,0,0.60,9,0,...,False,False,False,True,True,False,True,False,False,True
11256,3,0,1,2,2,2,0,0.50,0,1,...,True,False,True,False,True,False,False,True,False,False
9091,4,3,1,0,2,1,1,0.50,12,0,...,False,False,False,True,True,False,True,False,False,False
3369,4,3,0,0,3,2,1,0.25,12,0,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,3,5,0,2,3,0,4,0.25,15,0,...,False,False,False,True,False,True,True,False,True,False
5390,4,3,2,0,2,1,1,0.60,12,1,...,True,False,False,True,True,True,True,False,False,False
860,1,2,0,1,1,2,2,0.50,2,0,...,False,False,True,False,False,True,False,True,False,True
15795,2,2,1,1,1,0,1,0.67,4,1,...,True,False,True,False,False,False,False,False,False,True


In [11]:
label_encoder_y = LabelEncoder()
y_train = label_encoder_y.fit_transform(y_train)
y_test = label_encoder_y.transform(y_test)

In [None]:
list(le.classes_)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Logistic Model

In [13]:
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.7935638937107758
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.75      0.74      1930
           1       0.74      0.76      0.75      2223
           2       0.89      0.88      0.89      2430
           3       0.79      0.74      0.76       906

    accuracy                           0.79      7489
   macro avg       0.79      0.78      0.79      7489
weighted avg       0.79      0.79      0.79      7489



# model GaussianNB

In [14]:
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report", classification_report(y_test, y_pred_nb))

Accuracy: 0.5661637067699292
Classification Report               precision    recall  f1-score   support

           0       0.45      0.25      0.32      1930
           1       0.59      0.34      0.43      2223
           2       0.70      0.89      0.78      2430
           3       0.42      0.92      0.58       906

    accuracy                           0.57      7489
   macro avg       0.54      0.60      0.53      7489
weighted avg       0.57      0.57      0.53      7489



# model Support Vector Machine (SVM)

In [15]:
model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report", classification_report(y_test, y_pred_svm))

Accuracy: 0.8129256242488984
Classification Report               precision    recall  f1-score   support

           0       0.75      0.77      0.76      1930
           1       0.76      0.79      0.78      2223
           2       0.91      0.90      0.90      2430
           3       0.82      0.74      0.78       906

    accuracy                           0.81      7489
   macro avg       0.81      0.80      0.80      7489
weighted avg       0.81      0.81      0.81      7489



# model Random Forest

In [16]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report", classification_report(y_test, y_pred_rf))

Accuracy: 0.8973160635598878
Classification Report               precision    recall  f1-score   support

           0       0.89      0.87      0.88      1930
           1       0.85      0.89      0.87      2223
           2       0.94      0.93      0.94      2430
           3       0.92      0.88      0.90       906

    accuracy                           0.90      7489
   macro avg       0.90      0.89      0.90      7489
weighted avg       0.90      0.90      0.90      7489



# model XGBOOST

In [17]:
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report", classification_report(y_test, y_pred_xgb))

Accuracy: 0.9245560154893844
Classification Report               precision    recall  f1-score   support

           0       0.92      0.91      0.91      1930
           1       0.90      0.91      0.91      2223
           2       0.96      0.95      0.95      2430
           3       0.92      0.92      0.92       906

    accuracy                           0.92      7489
   macro avg       0.92      0.92      0.92      7489
weighted avg       0.92      0.92      0.92      7489



# model Light GBM

In [18]:
model_lgbm = LGBMClassifier(force_col_wise=True)
model_lgbm.fit(X_train, y_train)
y_pred_lgbm = model_lgbm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print("Classification Report", classification_report(y_test, y_pred_lgbm))

[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 22467, number of used features: 24
[LightGBM] [Info] Start training from score -1.343386
[LightGBM] [Info] Start training from score -1.228925
[LightGBM] [Info] Start training from score -1.126779
[LightGBM] [Info] Start training from score -2.100810
Accuracy: 0.9266924823073842
Classification Report               precision    recall  f1-score   support

           0       0.92      0.90      0.91      1930
           1       0.90      0.92      0.91      2223
           2       0.96      0.96      0.96      2430
           3       0.92      0.92      0.92       906

    accuracy                           0.93      7489
   macro avg       0.92      0.92      0.92      7489
weighted avg       0.93      0.93      0.93      7489



In [19]:
joblib.dump(model_lgbm, 'final_beverage_model.pkl')

['final_beverage_model.pkl']

In [20]:
joblib.dump(X_train.columns.tolist(), 'expected_columns.pkl')

['expected_columns.pkl']

In [21]:
joblib.dump(label_encoder_y, 'label_encoder_y.pkl')

['label_encoder_y.pkl']

#### best performing model is LightGBM

# ML Flow

In [None]:
models = [
    (
        "Logistic Regression",
        model_lr,
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        'GaussianNB',
        model_nb,
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        'Support Vector Machine',
        model_svm,
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        'Random Forest',
        model_rf,
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        'XGBOOST',
        model_xgb,
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        'LightGBM',
        model_lgbm,
        (X_train, y_train),
        (X_test, y_test)
    )
]

In [None]:
reports = []
for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [None]:
reports

In [None]:
import mlflow

In [None]:
import dagshub
dagshub.init(repo_owner='virajsoni191',
             repo_name='project-codex',
             mlflow=True)

mlflow.set_experiment('Beverage_Price_Prediction')


for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param('model_name', model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('precision_class_0', report['0']['precision'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('precision_class_1', report['1']['precision'])
        mlflow.log_metric('recall_class_2', report['2']['recall'])
        mlflow.log_metric('precision_class_2', report['2']['precision'])
        mlflow.log_metric('recall_class_3', report['3']['recall'])
        mlflow.log_metric('precision_class_3', report['3']['precision'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        if 'XGBOOST' in model_name:
            mlflow.xgboost.log_model(model, 'model')
        elif 'LightGBM' in model_name:
            mlflow.lightgbm.log_model(model, 'model')
        else:
            mlflow.sklearn.log_model(model, 'model')