# **5-3 - Conjoint Analysis**

Marketing and Customer Analytics
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# **1. Business Understanding**
---

- Company ABC is a SaaS platform for online stores, e-commerce, and retail point-of-sale systems
- The ABC platform offers online retailers a suite of services including payments, marketing, shipping and customer engagement tools


## **1.2 Business Objective**
---
- Company ABC wants to know which attributes of the platform customers prefer most
- The company will use this information to decide which marketing messages are most suitable to advertise, and at what price

## **1.3 Modelling Task**
---

- Output target: customer choice
- The goal of this project is to model customer choice (Choice-Based Conjoint method / CBC)
- Modelling task: Classification
- We need an interpretable model, so the model used is Logistic Regression
- Our target response is imbalanced, so we use the F1-score as the evaluation metric

In [2]:
# Location prefixes for the files handled by this notebook. Where a prefix is
# used it is joined to a file name by plain string concatenation, so a
# non-empty value must end with a path separator (e.g. 'data/').
# An empty string leaves file names relative to the working directory.
INPUT_PATH = ''
OUTPUT_PATH = ''

In [30]:
class Model:
    """Choice-based conjoint model built on a logistic regression.

    Constructing an instance runs the whole preparation pipeline:

    1. read the respondent data and drop the "Email" and "Business Year"
       columns,
    2. read the attribute file and fit a one-hot encoder on it,
    3. split the data into input/target and into stratified train/test sets,
    4. one-hot encode and standardize the inputs (the scaler is fitted on the
       training set only).

    Call ``fit()`` to train the classifier, ``score()`` to print the F1
    scores and ``weight_summary()`` to get the fitted weights.

    Parameters
    ----------
    fname_data : str
        File name of the respondent CSV, relative to ``INPUT_PATH``.

    fname_attribute : str
        File name of the attribute CSV, relative to ``INPUT_PATH``.
        NOTE(review): the encoder is fitted on this file and then applied to
        the respondent inputs, so its columns are assumed to line up with the
        input columns of the respondent data -- confirm against the files.
    """

    def __init__(self, fname_data, fname_attribute):
        self.fname_data = fname_data
        self.fname_attribute = fname_attribute

        # Read data & attribute
        self._read_data(column_to_drop=["Email", "Business Year"])
        self._read_attribute()

        # Create encoder
        self._create_encoder()

        # Prepare data
        self._prepare()

        # Show the class balance of the training target
        print(self.y_train.value_counts())

    def _read_data(self, column_to_drop):
        """Read the respondent CSV and store it, minus unwanted columns, in ``self.data``.

        Parameters
        ----------
        column_to_drop : list
            Column names removed from the loaded data.
        """
        # Input files are located through INPUT_PATH (not OUTPUT_PATH).
        conjoint_df = pd.read_csv(INPUT_PATH + self.fname_data)

        # Non-mutating drop: the frame returned by read_csv is left untouched.
        self.data = conjoint_df.drop(columns=column_to_drop)

    def _read_attribute(self):
        """Read the attribute CSV and store it in ``self.list_attribute``."""
        # Same input location as the respondent data.
        self.list_attribute = pd.read_csv(INPUT_PATH + self.fname_attribute)

    def _create_encoder(self):
        """Fit a one-hot encoder on the attribute table and store it in ``self.ohe_enc``."""
        # Categories unseen at fit time are encoded as all zeros instead of
        # raising; dense output so the result can be wrapped in a DataFrame.
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        self.ohe_enc = encoder.fit(self.list_attribute)

    def _split_input_output(self, target_column=None):
        """
        Separate ``self.data`` into input and output.

        Parameters
        ----------
        target_column : list, optional
            Output column names. Defaults to ``['Choice']``.

        Returns
        -------
        X : pandas DataFrame
            input (``self.data`` without the target columns)

        y : pandas DataFrame
            output (the target columns only)
        """
        # A fresh list per call avoids sharing one mutable default object.
        if target_column is None:
            target_column = ['Choice']

        # Select the output
        y = self.data[target_column]

        # Select the input
        X = self.data.drop(target_column, axis=1)

        return X, y

    def _split_train_test(self, X, y,
                          test_size=0.2,
                          seed=123):
        """
        Separate data into train & test sets, stratified on ``y``.

        Parameters
        ----------
        X : pandas DataFrame
            sample input

        y : pandas DataFrame or Series
            sample output

        test_size : float
            test data proportion

        seed : int
            random state

        Returns
        -------
        X_train : pandas DataFrame
            train input

        X_test : pandas DataFrame
            test input

        y_train : pandas DataFrame or Series
            train output

        y_test : pandas DataFrame or Series
            test output
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            stratify=y,
            test_size=test_size,
            random_state=seed,
        )

        return X_train, X_test, y_train, y_test

    def _transform_encoder(self, data):
        """One-hot encode ``data`` with the fitted encoder.

        Returns a DataFrame that keeps the index of ``data`` and whose columns
        are the encoder's category labels, in encoder order.
        """
        # Flatten the per-attribute category arrays into one list of labels.
        features = [level
                    for categories in self.ohe_enc.categories_
                    for level in categories]

        # Transform
        data_enc = pd.DataFrame(data=self.ohe_enc.transform(data),
                                columns=features,
                                index=data.index)

        print('Data encoded shape:', data_enc.shape)

        return data_enc

    def _create_scaler(self, data):
        """Fit a ``StandardScaler`` on ``data`` and store it in ``self.scaler``."""
        scaler = StandardScaler()
        scaler.fit(data)

        self.scaler = scaler

    def _transform_scaler(self, data):
        """Scale ``data`` with the stored scaler, keeping its columns and index."""
        return pd.DataFrame(self.scaler.transform(data),
                            columns=data.columns,
                            index=data.index)

    def _prepare(self):
        """Run split -> encode -> standardize and store the train/test sets."""
        # Split input output
        X, y = self._split_input_output()

        # Split train test
        X_train, X_test, y_train, y_test = self._split_train_test(X, y)

        # Encode train test
        X_train_enc = self._transform_encoder(X_train)
        X_test_enc = self._transform_encoder(X_test)

        # Standardize (scaler statistics come from the training set only)
        self._create_scaler(X_train_enc)
        X_train_clean = self._transform_scaler(X_train_enc)
        X_test_clean = self._transform_scaler(X_test_enc)

        self.X_train_clean = X_train_clean
        self.X_test_clean = X_test_clean
        self.y_train = y_train
        self.y_test = y_test

    def fit(self):
        """Train an unpenalized, class-balanced logistic regression on the training set."""
        self.clf = LogisticRegression(penalty=None,
                                      class_weight='balanced')
        # y_train is a single-column DataFrame; flatten it to the 1-D target
        # sklearn expects instead of relying on its implicit conversion.
        self.clf.fit(self.X_train_clean, np.ravel(self.y_train))

    def score(self):
        """Print the F1 score of the fitted classifier on the train and test sets."""
        train_score = f1_score(self.y_train, self.clf.predict(self.X_train_clean))
        test_score = f1_score(self.y_test, self.clf.predict(self.X_test_clean))

        print('F1 score train :', train_score)
        print('F1 score test  :', test_score)

    def weight_summary(self):
        """Return the fitted weights as a DataFrame sorted from largest to smallest.

        Returns
        -------
        summary : pandas DataFrame
            Columns ``features`` and ``weights``; one row per model input plus
            a ``'constant'`` row holding the intercept.
        """
        summary = pd.DataFrame({'features': self.clf.feature_names_in_.tolist() + ['constant'],
                                'weights': self.clf.coef_[0].tolist() + self.clf.intercept_.tolist()})
        summary = summary.sort_values(by='weights', ascending=False)

        return summary



# **2. Modelling & Get Weights**
---

## **2.1 Full Respondent Conjoint**
---

In [31]:
# Build, train and evaluate the model on the full-respondent file,
# then display its weight table as the cell output.
obj_full = Model(
    fname_data='full_respond.csv',
    fname_attribute='Attribute_Conjoint.csv',
)
obj_full.fit()
obj_full.score()
obj_full.weight_summary()

Data encoded shape: (7152, 16)
Data encoded shape: (1788, 16)
Choice
0         4707
1         2445
Name: count, dtype: int64
F1 score train : 0.4735708367854184
F1 score test  : 0.4802146210596915


Unnamed: 0,features,weights
5,$45,0.114921
12,Integration with Third-Party Apps,0.108149
11,Customer Support,0.092228
2,$30,0.074117
14,Shipping and Fulfillment,0.051852
4,$40,0.021738
15,Website Customization,0.013716
3,$35,0.006826
16,constant,-0.01388
13,SEO and Marketing Features,-0.015616


## **2.2 Less Than 1 Year in Business Respondent Conjoint**
---

In [33]:
# Build, train and evaluate the model on the less-than-one-year respondent
# file, then display its weight table as the cell output.
obj_less = Model(
    fname_data='less_than_year_respond.csv',
    fname_attribute='Attribute_Conjoint.csv',
)
obj_less.fit()
obj_less.score()
obj_less.weight_summary()

Data encoded shape: (2496, 16)
Data encoded shape: (624, 16)
Choice
0         1726
1          770
Name: count, dtype: int64
F1 score train : 0.4453589391860997
F1 score test  : 0.4530386740331492


Unnamed: 0,features,weights
5,$45,0.125208
14,Shipping and Fulfillment,0.087471
11,Customer Support,0.056635
12,Integration with Third-Party Apps,0.049181
2,$30,0.046791
15,Website Customization,0.03312
13,SEO and Marketing Features,0.019249
3,$35,0.011664
7,$55,-0.011447
4,$40,-0.013431


## **2.3 More Than 1 Year in Business Respondent Conjoint**
---

In [35]:
# Build, train and evaluate the model on the more-than-one-year respondent
# file, then display its weight table as the cell output.
obj_more = Model(
    fname_data='more_than_year_respond.csv',
    fname_attribute='Attribute_Conjoint.csv',
)
obj_more.fit()
obj_more.score()
obj_more.weight_summary()

Data encoded shape: (4656, 16)
Data encoded shape: (1164, 16)
Choice
0         2981
1         1675
Name: count, dtype: int64
F1 score train : 0.48193384223918573
F1 score test  : 0.4851586489252815


Unnamed: 0,features,weights
12,Integration with Third-Party Apps,0.159941
5,$45,0.115349
11,Customer Support,0.094135
2,$30,0.067728
4,$40,0.058771
14,Shipping and Fulfillment,0.052479
13,SEO and Marketing Features,-0.004162
15,Website Customization,-0.010369
16,constant,-0.015666
3,$35,-0.016559


Recommendation:
- Price: \$45, \$30, \$40
- Attribute : Integration with Third-Party Apps, Customer Support, Shipping and Fulfillment

<br>

Ideally, preference would fall steadily as the price rises from cheap to expensive. The estimated weights do not follow that order, but based on these results we can still suggest a price between \$30 and \$45 for the monthly subscription plan

<br>

Also, we can focus our campaigns on the top 3 attributes to attract customers' attention

# **Further Recommendation**

- Since our evaluation score is not good enough, we need to collect more data (increase the number of respondents) to improve it
- We can also clean the data, e.g., exclude respondents whose answers are more than 90% identical