In [44]:
# import libraries
from cryptography.fernet import Fernet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import copy
import warnings

In [2]:
# Open the key file
with open('filekey.key', 'rb') as f:
  key = f.read()

In [3]:
# Store the key values
fernet = Fernet(key)

In [4]:
# open the dataset
with open('nutriboost.csv', 'rb') as f:
  data = f.read()

In [5]:
# Start Decrypting data
decrypted = fernet.decrypt(data)

In [7]:
# Overwrite the data with decryption
with open('nutriboost.csv', 'wb') as f:
  f.write(decrypted)

In [8]:
# load the dataset
df = pd.read_csv('nutriboost.csv')
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income_Level,Health_Conscious_Score,Current_Beverage_Preference,Purchase_Intention,Willingness_To_Pay,Feature_Preference_Taste,Feature_Preference_EnergyBoost,...,Expected_Repeat_Purchase_Rate,Open_Ended_Feedback,Preferred_Channel,Proximity_to_Store,Channel_Satisfaction,Awareness_Source,Promo_Response,Ad_Recall_Score,Coupon_Usage(%),Brand_Engagement
0,1,46,Male,Medium,8,Energy Drinks,0,83,1,3,...,0.85,Not sure if I would switch from my current brand.,Gym,1.27,2,Social Media,No,5,97,Medium
1,2,32,Male,High,6,,1,129,5,3,...,0.55,Great taste but packaging could be improved.,Gym,3.41,5,Social Media,No,3,92,Medium
2,3,25,Female,Low,5,Water,1,43,4,5,...,0.43,Price is a concern for students like me.,Gym,3.8,4,TV,Yes,1,23,Low
3,4,38,Other,Low,6,Water,0,129,3,5,...,0.72,Not sure if I would switch from my current brand.,Convenience Store,2.98,1,Social Media,Yes,4,23,Medium
4,5,36,Other,Low,2,Juices,0,143,5,5,...,0.23,I love the idea of a healthy energy drink.,Cafe,2.36,3,Word of Mouth,No,5,3,Low


In [9]:
# shape
df.shape

(500, 23)

In [10]:
# check duplicate records
df.duplicated().sum()

np.int64(0)

In [11]:
# check for missing values
df.isna().sum()

Unnamed: 0,0
CustomerID,0
Age,0
Gender,0
Income_Level,0
Health_Conscious_Score,0
Current_Beverage_Preference,92
Purchase_Intention,0
Willingness_To_Pay,0
Feature_Preference_Taste,0
Feature_Preference_EnergyBoost,0


In [12]:
# Create the copy of the dataset
df_copy = df.copy(deep = True)
df_copy.head()

Unnamed: 0,CustomerID,Age,Gender,Income_Level,Health_Conscious_Score,Current_Beverage_Preference,Purchase_Intention,Willingness_To_Pay,Feature_Preference_Taste,Feature_Preference_EnergyBoost,...,Expected_Repeat_Purchase_Rate,Open_Ended_Feedback,Preferred_Channel,Proximity_to_Store,Channel_Satisfaction,Awareness_Source,Promo_Response,Ad_Recall_Score,Coupon_Usage(%),Brand_Engagement
0,1,46,Male,Medium,8,Energy Drinks,0,83,1,3,...,0.85,Not sure if I would switch from my current brand.,Gym,1.27,2,Social Media,No,5,97,Medium
1,2,32,Male,High,6,,1,129,5,3,...,0.55,Great taste but packaging could be improved.,Gym,3.41,5,Social Media,No,3,92,Medium
2,3,25,Female,Low,5,Water,1,43,4,5,...,0.43,Price is a concern for students like me.,Gym,3.8,4,TV,Yes,1,23,Low
3,4,38,Other,Low,6,Water,0,129,3,5,...,0.72,Not sure if I would switch from my current brand.,Convenience Store,2.98,1,Social Media,Yes,4,23,Medium
4,5,36,Other,Low,2,Juices,0,143,5,5,...,0.23,I love the idea of a healthy energy drink.,Cafe,2.36,3,Word of Mouth,No,5,3,Low


In [13]:
# Check the unique values in Current_Beverage_Preference
df_copy['Current_Beverage_Preference'].unique()

array(['Energy Drinks', nan, 'Water', 'Juices', 'Soft Drinks'],
      dtype=object)

In [14]:
# Check the mode of the data
df_copy['Current_Beverage_Preference'].mode()

Unnamed: 0,Current_Beverage_Preference
0,Water


In [15]:
# Replace the missing values
df_copy['Current_Beverage_Preference'].fillna('Water', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['Current_Beverage_Preference'].fillna('Water', inplace = True)


In [16]:
# Check for the missing values
df_copy.isna().sum()

Unnamed: 0,0
CustomerID,0
Age,0
Gender,0
Income_Level,0
Health_Conscious_Score,0
Current_Beverage_Preference,0
Purchase_Intention,0
Willingness_To_Pay,0
Feature_Preference_Taste,0
Feature_Preference_EnergyBoost,0


In [18]:
# Idea Screening
purchase_intent_rate = df_copy['Purchase_Intention'].mean() * 100
print("The purchase intention rate is " ,purchase_intent_rate, "%")

The purchase intention rate is  62.4 %


In [19]:
# Check the column name
df_copy.columns

Index(['CustomerID', 'Age', 'Gender', 'Income_Level', 'Health_Conscious_Score',
       'Current_Beverage_Preference', 'Purchase_Intention',
       'Willingness_To_Pay', 'Feature_Preference_Taste',
       'Feature_Preference_EnergyBoost', 'Feature_Preference_Packaging',
       'Feature_Preference_Price', 'Feature_Preference_Availability',
       'Expected_Repeat_Purchase_Rate', 'Open_Ended_Feedback',
       'Preferred_Channel', 'Proximity_to_Store', 'Channel_Satisfaction',
       'Awareness_Source', 'Promo_Response', 'Ad_Recall_Score',
       'Coupon_Usage(%)', 'Brand_Engagement'],
      dtype='object')

In [23]:
# Concept Testing
concept = df_copy[['Feature_Preference_Taste',
       'Feature_Preference_EnergyBoost', 'Feature_Preference_Packaging',
       'Feature_Preference_Price', 'Feature_Preference_Availability']].mean().rename('Feature Means')

round(concept,2)


Unnamed: 0,Feature Means
Feature_Preference_Taste,2.97
Feature_Preference_EnergyBoost,3.01
Feature_Preference_Packaging,3.12
Feature_Preference_Price,3.0
Feature_Preference_Availability,2.92


In [25]:
# Marketing Startegy
income_levels = df_copy.groupby('Income_Level')['Purchase_Intention'].mean().rename('Income Level Means') * 100
round(income_levels,2)

Unnamed: 0_level_0,Income Level Means
Income_Level,Unnamed: 1_level_1
High,63.89
Low,61.15
Medium,62.55


In [26]:
# Business Analysis
avg_wtp = df_copy['Willingness_To_Pay'].mean()
round(avg_wtp,2)

np.float64(96.96)

In [28]:
# Logistic Regression Model (x is independent variable and y is depenedent variable)
x = df_copy[['Age', 'Health_Conscious_Score', 'Feature_Preference_Taste',
       'Feature_Preference_EnergyBoost', 'Feature_Preference_Packaging',
       'Feature_Preference_Price', 'Feature_Preference_Availability']]
y = df_copy['Purchase_Intention']

In [29]:
# train and test models
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 42)

In [30]:
# model creation
model = LogisticRegression(max_iter = 500)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [33]:
# Accuracy Score
accuracy = accuracy_score(y_test, y_pred) * 100
print("Accuracy Score: ", accuracy, '%')

Accuracy Score:  60.0 %


In [34]:
# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.36      0.07      0.12        57
           1       0.62      0.92      0.74        93

    accuracy                           0.60       150
   macro avg       0.49      0.50      0.43       150
weighted avg       0.52      0.60      0.50       150



In [36]:
# Test Marketing
repeat_purchase = df_copy['Expected_Repeat_Purchase_Rate'].mean() * 100
print("The expected repeat purchase rate is ", round(repeat_purchase,2), "%")

The expected repeat purchase rate is  54.3 %


In [37]:
# Commercilization
if purchase_intent_rate > 50 and avg_wtp > 50 and repeat_purchase > 50:
  decision = 'Go ahead and Launch'
else:
  decision = 'Do not launch'
print(decision)

Go ahead and Launch
