In [2]:
import pandas as pd
import numpy as np

def calculate_phi_coefficient(df):
    """
    Calculate the Phi coefficient for all pairs of questions in the given DataFrame.
    
    :param df: DataFrame where each column is a question and each row is a keyword's response (1 for Yes, 0 for No)
    :return: DataFrame of Phi coefficients
    """
    n_questions = df.shape[1]
    phi_matrix = pd.DataFrame(np.zeros((n_questions, n_questions)), columns=df.columns, index=df.columns)
    
    for i in range(n_questions):
        for j in range(i, n_questions):
            if i == j:
                phi_matrix.iloc[i, j] = 1.0
            else:
                n_11 = np.sum((df.iloc[:, i] == 1) & (df.iloc[:, j] == 1))
                n_10 = np.sum((df.iloc[:, i] == 1) & (df.iloc[:, j] == 0))
                n_01 = np.sum((df.iloc[:, i] == 0) & (df.iloc[:, j] == 1))
                n_00 = np.sum((df.iloc[:, i] == 0) & (df.iloc[:, j] == 0))
                
                n_1_dot = n_11 + n_10
                n_0_dot = n_01 + n_00
                n_dot_1 = n_11 + n_01
                n_dot_0 = n_10 + n_00
                
                numerator = n_11 * n_00 - n_10 * n_01
                denominator = np.sqrt(n_1_dot * n_0_dot * n_dot_1 * n_dot_0)
                
                if denominator == 0:
                    phi = 0
                else:
                    phi = numerator / denominator
                
                phi_matrix.iloc[i, j] = phi
                phi_matrix.iloc[j, i] = phi
    
    return phi_matrix

In [3]:
# Example usage
# Assume `data` is a DataFrame with binary answers (1 for Yes, 0 for No)
data = pd.DataFrame({
    'Q1': [1, 0, 1, 0],
    'Q2': [1, 1, 0, 0],
    'Q3': [0, 1, 0, 1],
    'Q4': [1, 1, 1, 0]
})

phi_matrix = calculate_phi_coefficient(data)
print(phi_matrix)

         Q1       Q2       Q3       Q4
Q1  1.00000  0.00000 -1.00000  0.57735
Q2  0.00000  1.00000  0.00000  0.57735
Q3 -1.00000  0.00000  1.00000 -0.57735
Q4  0.57735  0.57735 -0.57735  1.00000


In [11]:
df = pd.read_csv('mapped_keywords.csv', index_col=False)

df = df.replace({'yes': 1, 'no': 0})
df.head()

Unnamed: 0.1,Unnamed: 0,Is the thing related to food or drink in any way?,Would the keyword be included in the broad category of Machines?,Is it tangible?,Is it water-based?,is it a living thing?,Would the keyword be considered a Home appliance?
0,Advertisement,0,0,0,0,0,0
1,Agave,1,0,1,0,1,0
2,Air compressor,0,1,1,0,0,0
3,Air Conditioner,0,1,1,0,0,1
4,Air filter,0,1,1,0,0,1


In [12]:
phi_matrix = calculate_phi_coefficient(df)
phi_matrix

Unnamed: 0.1,Unnamed: 0,Is the thing related to food or drink in any way?,Would the keyword be included in the broad category of Machines?,Is it tangible?,Is it water-based?,is it a living thing?,Would the keyword be considered a Home appliance?
Unnamed: 0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Is the thing related to food or drink in any way?,0.0,1.0,-0.113641,0.093065,0.004987,0.189002,0.113393
Would the keyword be included in the broad category of Machines?,0.0,-0.113641,1.0,0.052948,0.016086,-0.129733,0.394286
Is it tangible?,0.0,0.093065,0.052948,1.0,0.045349,-0.014089,0.042905
Is it water-based?,0.0,0.004987,0.016086,0.045349,1.0,-0.037633,-0.014744
is it a living thing?,0.0,0.189002,-0.129733,-0.014089,-0.037633,1.0,-0.063425
Would the keyword be considered a Home appliance?,0.0,0.113393,0.394286,0.042905,-0.014744,-0.063425,1.0


In [13]:
def select_next_question(phi_matrix, asked_questions, answers):
    """
    Select the next question to ask based on the Phi coefficients and the current state of asked questions and answers.
    
    :param phi_matrix: DataFrame of Phi coefficients
    :param asked_questions: List of questions that have been asked
    :param answers: List of answers corresponding to the asked questions (1 for Yes, 0 for No)
    :return: The next question to ask
    """
    # Filter the questions that have not been asked yet
    remaining_questions = [q for q in phi_matrix.columns if q not in asked_questions]
    
    # If no questions have been asked yet, return the first question
    if not asked_questions:
        return remaining_questions[0]
    
    # Calculate the correlation of remaining questions with the current state
    correlations = []
    for question in remaining_questions:
        # Calculate the average correlation with the answered questions
        avg_corr = np.mean([phi_matrix.loc[question, asked_question] for asked_question in asked_questions])
        correlations.append((question, avg_corr))
    
    # Select the question with the maximum average correlation
    next_question = max(correlations, key=lambda x: x[1])[0]
    
    return next_question

In [14]:
phi_matrix = calculate_phi_coefficient(df)

# Track the questions asked and their answers
asked_questions = ['is it a living thing?']
answers = [1]  # Assume the answer to Q1 was 'yes'

# Select the next question to ask
next_question = select_next_question(phi_matrix, asked_questions, answers)
print("Next question to ask:", next_question)

Next question to ask: Is the thing related to food or drink in any way?


In [15]:
phi_matrix = calculate_phi_coefficient(df)

# Track the questions asked and their answers
asked_questions = ['is it a living thing?', 'Is the thing related to food or drink in any way?']
answers = [1, 0]  # Assume the answer to Q1 was 'yes'

# Select the next question to ask
next_question = select_next_question(phi_matrix, asked_questions, answers)
print("Next question to ask:", next_question)

Next question to ask: Is it tangible?
