In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.stats.power import TTestIndPower, GofChisquarePower

# 🔹 Step 1: Load Data
file_path = "data.xlsx"
df = pd.read_excel(file_path, sheet_name="All")

# 🔹 Step 2: Convert Responses to Numerical Values
conversion_dict = {"Phishing": 1, "Phishing": 1, "Legitimate": 0, "Don't know": np.nan}

# Apply conversion dictionary to all question columns
question_cols = [col for col in df.columns if "Question" in col]
df[question_cols] = df[question_cols].apply(lambda col: col.map(conversion_dict))

# 🔹 Step 3: Define Correct Answers
correct_answers = {
    "Question 1": 1, "Question 2": 1, "Question 3": 1, "Question 4": 0, "Question 5": 1,
    "Question 6": 1, "Question 7": 1, "Question 8": 1, "Question 9": 1, "Question 10": 0
}

# Calculate Accuracy for each participant
df["Accuracy"] = df[question_cols].apply(
    lambda row: np.nanmean([1 if row[q] == correct_answers[q] else 0 for q in question_cols]), axis=1
)

# 🔹 Step 4: Perform T-Test and Compute Minimum Sample Size
def power_analysis_ttest(df):
    """Performs a T-Test and calculates the required sample size for statistical significance."""

    # Identify the first column dynamically
    device_col = df.columns[0]
    print(f"ℹ️ Using the first column for grouping: {device_col}")

    # Get unique values (groups)
    unique_values = df[device_col].dropna().unique()
    if len(unique_values) < 2:
        print("⚠️ Not enough groups in the first column for T-Test.")
        return

    # Extract accuracy scores for each group
    group1_scores = df[df[device_col] == unique_values[0]]["Accuracy"].dropna()
    group2_scores = df[df[device_col] == unique_values[1]]["Accuracy"].dropna()

    # Check for sufficient data
    if len(group1_scores) == 0 or len(group2_scores) == 0:
        print("⚠️ Not enough data for one of the groups.")
        return
    if np.var(group1_scores) == 0 or np.var(group2_scores) == 0:
        print("⚠️ Cannot compute T-Test - Variance is zero in one of the groups.")
        return

    # Perform T-Test
    t_stat, p_value = ttest_ind(group1_scores, group2_scores, equal_var=False)

    # Compute power and effect size
    effect_size = abs(np.mean(group1_scores) - np.mean(group2_scores)) / np.sqrt((np.var(group1_scores) + np.var(group2_scores)) / 2)
    power_analysis = TTestIndPower()
    power = power_analysis.solve_power(effect_size=effect_size, alpha=0.05, nobs1=len(group1_scores), ratio=len(group2_scores) / len(group1_scores))

    # Compute required sample size for Power > 0.8
    min_sample_size = power_analysis.solve_power(effect_size=effect_size, alpha=0.05, power=0.8, ratio=len(group2_scores) / len(group1_scores))

    print(f"🔹 T-Test ({unique_values[0]} vs. {unique_values[1]}) - Statistical Power: {power:.3f}")
    print(f"📏 Required sample size for significance (Power > 0.8): {int(min_sample_size)} samples")

# 🔹 Step 5: Perform Chi-Square Test and Compute Minimum Sample Size
def power_analysis_chi2(df):
    """Performs a Chi-Square test to assess the relationship between device type and phishing detection accuracy."""

    device_col = df.columns[0]
    print(f"ℹ️ Using the first column for grouping: {device_col}")

    # Create a binary accuracy column (Success/Failure)
    df["Accuracy_Binary"] = df["Accuracy"].apply(lambda x: 1 if x >= 0.5 else 0)
    contingency_table = pd.crosstab(df[device_col], df["Accuracy_Binary"])

    if contingency_table.shape[0] < 2 or contingency_table.shape[1] < 2:
        print("⚠️ Not enough data for Chi-Square test.")
        return

    # Compute Chi-Square test
    chi2_stat, p, dof, expected = chi2_contingency(contingency_table)

    # Compute statistical power
    effect_size = np.sqrt(chi2_stat / np.sum(contingency_table.values))
    power_analysis = GofChisquarePower()
    power = power_analysis.solve_power(effect_size=effect_size, nobs=np.sum(contingency_table.values), alpha=0.05)

    # Compute required sample size for Power > 0.8
    min_sample_size = power_analysis.solve_power(effect_size=effect_size, alpha=0.05, power=0.8)

    print(f"🔹 Chi-Square Test ({device_col} vs. Accuracy) - Statistical Power: {power:.3f}")
    print(f"📏 Required sample size for significance (Power > 0.8): {int(min_sample_size)} samples")

# 🔍 Execute Both Tests
print("\n🔍 Statistical Power Analysis - T-Test 📊\n")
power_analysis_ttest(df)

print("\n🔍 Statistical Power Analysis - Chi-Square Test 📊\n")
power_analysis_chi2(df)



🔍 Statistical Power Analysis - T-Test 📊

ℹ️ Using the first column for grouping: I'm answering the survey on a:
🔹 T-Test (PC vs. Smartphone) - Statistical Power: 0.967
📏 Required sample size for significance (Power > 0.8): 27 samples

🔍 Statistical Power Analysis - Chi-Square Test 📊

ℹ️ Using the first column for grouping: I'm answering the survey on a:
🔹 Chi-Square Test (I'm answering the survey on a: vs. Accuracy) - Statistical Power: 0.630
📏 Required sample size for significance (Power > 0.8): 149 samples


In [7]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from statsmodels.stats.power import NormalIndPower

# 🔹 Load Data
file_path = "HCI_PROJECT.xlsx"
df = pd.read_excel(file_path, sheet_name="ALL")

# 🔹 Convert Responses to Numerical Values
conversion_dict = {"Phishing": 1, "Legitimate": 0, "Don't know": np.nan}

# Apply conversion dictionary to all question columns
question_cols = [col for col in df.columns if "Question" in col]
df[question_cols] = df[question_cols].apply(lambda col: col.map(conversion_dict))

# 🔹 Define Correct Answers
correct_answers = {
    "Question 1": 1, "Question 2": 1, "Question 3": 1, "Question 4": 0, "Question 5": 1,
    "Question 6": 1, "Question 7": 1, "Question 8": 1, "Question 9": 1, "Question 10": 0
}

# Calculate Accuracy for each participant
df["Accuracy"] = df[question_cols].apply(
    lambda row: np.nanmean([1 if row[q] == correct_answers[q] else 0 for q in question_cols]), axis=1
)

# Identify the first column dynamically (used for grouping)
device_col = df.columns[0]

# Get unique values (groups)
unique_values = df[device_col].dropna().unique()

# Extract accuracy scores for each group
group1_scores = df[df[device_col] == unique_values[0]]["Accuracy"].dropna()
group2_scores = df[df[device_col] == unique_values[1]]["Accuracy"].dropna()

# Perform Mann-Whitney U test
u_stat, p_value = mannwhitneyu(group1_scores, group2_scores, alternative='two-sided')

# Compute effect size (Common Language Effect Size - CL)
n1, n2 = len(group1_scores), len(group2_scores)
cl_effect_size = u_stat / (n1 * n2)

# Compute power and required sample size
power_analysis = NormalIndPower()
power = power_analysis.solve_power(effect_size=cl_effect_size, alpha=0.05, nobs1=n1, ratio=n2/n1)

# Compute required sample size for Power > 0.8
min_sample_size = power_analysis.solve_power(effect_size=cl_effect_size, alpha=0.05, power=0.8, ratio=n2/n1)

# Print results
print(f"🔹 Mann-Whitney U Test ({unique_values[0]} vs. {unique_values[1]})")
print(f"   - U Statistic: {u_stat:.3f}")
print(f"   - P-Value: {p_value:.5f}")
print(f"   - Effect Size (CL): {cl_effect_size:.3f}")
print(f"   - Statistical Power: {power:.3f}")
print(f"   - Required Sample Size for Power > 0.8: {int(min_sample_size)} samples")

# Interpretation
if p_value < 0.05:
    print("\n✅ There is a statistically significant difference between the groups.")
else:
    print("\n❌ No statistically significant difference was found between the groups.")

if power >= 0.8:
    print("✅ The statistical power is sufficient (Power > 0.8), meaning the test results are reliable.")
else:
    print(f"⚠️ The statistical power is low ({power:.2f}), which may indicate that the sample size is too small.")

if n1 >= min_sample_size and n2 >= min_sample_size:
    print("✅ The current sample size is sufficient to draw conclusions.")
else:
    print(f"⚠️ The current sample size is too small. It is recommended to have at least {int(min_sample_size)} samples per group.")



🔹 Mann-Whitney U Test (PC vs. Smartphone)
   - U Statistic: 233.000
   - P-Value: 0.00003
   - Effect Size (CL): 0.925
   - Statistical Power: 0.737
   - Required Sample Size for Power > 0.8: 16 samples

✅ There is a statistically significant difference between the groups.
⚠️ The statistical power is low (0.74), which may indicate that the sample size is too small.
⚠️ The current sample size is too small. It is recommended to have at least 16 samples per group.


In [9]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu

# 🔹 Load Data
file_path = "data.xlsx"
df = pd.read_excel(file_path, sheet_name="All")

# 🔹 Convert Responses to Numerical Values
conversion_dict = {"Phishing": 1, "Legitimate": 0, "Don't know": np.nan}

# Apply conversion dictionary to all question columns
question_cols = [col for col in df.columns if "Question" in col]
df[question_cols] = df[question_cols].apply(lambda col: col.map(conversion_dict))

# 🔹 Define Correct Answers
correct_answers = {
    "Question 1": 1, "Question 2": 1, "Question 3": 1, "Question 4": 0, "Question 5": 1,
    "Question 6": 1, "Question 7": 1, "Question 8": 1, "Question 9": 1, "Question 10": 0
}

# Calculate Accuracy for each participant
df["Accuracy"] = df[question_cols].apply(
    lambda row: np.nanmean([1 if row[q] == correct_answers[q] else 0 for q in question_cols]), axis=1
)

# Identify the first column dynamically (used for grouping)
device_col = df.columns[0]

# Get unique values (groups)
unique_values = df[device_col].dropna().unique()

# Extract accuracy scores for each group
group1_scores = df[df[device_col] == unique_values[0]]["Accuracy"].dropna().values
group2_scores = df[df[device_col] == unique_values[1]]["Accuracy"].dropna().values

# Compute observed difference (e.g., mean difference)
observed_diff = np.abs(np.mean(group1_scores) - np.mean(group2_scores))

# 🔹 Perform Permutation Test
num_permutations = 10_000  # Number of resamples
perm_diffs = []

combined_data = np.concatenate([group1_scores, group2_scores])

for _ in range(num_permutations):
    np.random.shuffle(combined_data)  # Shuffle the labels
    new_group1 = combined_data[:len(group1_scores)]
    new_group2 = combined_data[len(group1_scores):]
    perm_diffs.append(np.abs(np.mean(new_group1) - np.mean(new_group2)))

# Compute p-value
p_value = np.mean(np.array(perm_diffs) >= observed_diff)

# Print results
print("🔹 Permutation Test (PC vs. Smartphone)")
print("   This test checks whether the observed difference in accuracy between PC and Smartphone users")
print("   is statistically significant, or if it could have occurred by random chance.")
print("   The test works by randomly shuffling the labels and recalculating the difference many times.")
print("   If the observed difference is extreme compared to these random permutations, we conclude that the effect is real.")
print(f"\n   - Observed Difference: {observed_diff:.4f}")
print(f"   - P-Value: {p_value:.5f}")

if p_value < 0.05:
    print("\n✅ The observed difference is statistically significant (p < 0.05).")
    print("   This means that the accuracy difference between PC and Smartphone users is unlikely to be due to random chance.")
else:
    print("\n❌ The observed difference is NOT statistically significant (p ≥ 0.05).")
    print("   This means we do not have enough evidence to conclude that the accuracy difference is real.")



🔹 Permutation Test (PC vs. Smartphone)
   This test checks whether the observed difference in accuracy between PC and Smartphone users
   is statistically significant, or if it could have occurred by random chance.
   The test works by randomly shuffling the labels and recalculating the difference many times.
   If the observed difference is extreme compared to these random permutations, we conclude that the effect is real.

   - Observed Difference: 0.1280
   - P-Value: 0.00010

✅ The observed difference is statistically significant (p < 0.05).
   This means that the accuracy difference between PC and Smartphone users is unlikely to be due to random chance.


In [11]:
import numpy as np
import pandas as pd

# 🔹 Load Data
file_path = "data.xlsx"
df = pd.read_excel(file_path, sheet_name="All")

# 🔹 Convert Responses to Numerical Values
conversion_dict = {"Phishing": 1, "Legitimate": 0, "Don't know": np.nan}

# Apply conversion dictionary to all question columns
question_cols = [col for col in df.columns if "Question" in col]
df[question_cols] = df[question_cols].apply(lambda col: col.map(conversion_dict))

# 🔹 Define Correct Answers
correct_answers = {
    "Question 1": 1, "Question 2": 1, "Question 3": 1, "Question 4": 0, "Question 5": 1,
    "Question 6": 1, "Question 7": 1, "Question 8": 1, "Question 9": 1, "Question 10": 0
}

# Calculate Accuracy for each participant
df["Accuracy"] = df[question_cols].apply(
    lambda row: np.nanmean([1 if row[q] == correct_answers[q] else 0 for q in question_cols]), axis=1
)

# Identify the first column dynamically (used for grouping)
device_col = df.columns[0]

# Get unique values (groups)
unique_values = df[device_col].dropna().unique()

# Extract accuracy scores for each group
group1_scores = df[df[device_col] == unique_values[0]]["Accuracy"].dropna().values
group2_scores = df[df[device_col] == unique_values[1]]["Accuracy"].dropna().values

# 🔹 Compute Cliff’s Delta
def cliffs_delta(x, y):
    """
    Computes Cliff's Delta effect size, measuring the probability that a randomly chosen value
    from one distribution is larger than a randomly chosen value from another distribution.
    """
    n_x, n_y = len(x), len(y)
    greater = sum(1 for i in x for j in y if i > j)
    lesser = sum(1 for i in x for j in y if i < j)

    delta = (greater - lesser) / (n_x * n_y)
    return delta

# Compute Cliff’s Delta for the dataset
delta_value = cliffs_delta(group1_scores, group2_scores)

# Interpretation scale
if abs(delta_value) < 0.147:
    interpretation = "small effect"
elif abs(delta_value) < 0.33:
    interpretation = "medium effect"
elif abs(delta_value) < 0.474:
    interpretation = "large effect"
else:
    interpretation = "very large effect"

# Print results
print("🔹 Cliff’s Delta Effect Size (PC vs. Smartphone)")
print("   This measures the probability that a randomly selected accuracy score")
print("   from one group is higher than a randomly selected score from another group.")
print(f"\n   - Cliff’s Delta: {delta_value:.4f}")
print(f"   - Interpretation: {interpretation}")


🔹 Cliff’s Delta Effect Size (PC vs. Smartphone)
   This measures the probability that a randomly selected accuracy score
   from one group is higher than a randomly selected score from another group.

   - Cliff’s Delta: 0.3916
   - Interpretation: large effect
