In [1]:
from globals import * 
from utils import * 



## Is there a correlation between cultural values and comfortablity sharing data?
--- 

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, chi2_contingency

# Define mappings
importance_mapping = {
    'Extremely  Important': 5,
    'Very Important': 4,
    'Moderately Important': 3,
    'Somewhat Important': 2,
    'Slightly Important': 1,
    'Not Familiar At All': 0  # Assuming this is the lowest importance
}



comfort_mapping = {
    'Extremely': 5,
    'Somewhat': 3,
    'No': 1,
    'Extremely Uncomfortable': 1,
    'Somewhat Comfortable': 4,
    'Yes': 4  # Inferred from context
}

# Define relevant columns
value_columns = {
    'Individualism': 'Q5.1_1',  # Adjust based on actual column name
    'Collectivism': 'Q5.1_2',   # Adjust based on actual column name
    'Family Values': 'Q5.1_14'  # Adjust based on actual column name
}

comfort_columns = [
    'Q3.3', 'Q3.4', 'Q3.5', 'Q3.6_1', 'Q3.6_2', 'Q3.6_3', 'Q3.6_4',
    'Q3.6_5', 'Q3.6_6', 'Q3.6_7', 
]

def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None

def map_responses(df, mapping, columns):
    for col in columns:
        if col in df.columns:
            df[col] = df[col].map(mapping).fillna(0)  # Fill NaN with 0 for unmapped responses
    return df

def calculate_overall_comfort(df, comfort_columns):
    df['Overall_Comfort'] = df[comfort_columns].mean(axis=1, skipna=True)
    return df

def perform_spearman_test(df, value_col, comfort_cols):
    results = {}
    for comfort_col in comfort_cols:
        if value_col in df.columns and comfort_col in df.columns:
            valid_data = df[[value_col, comfort_col]].dropna()
            if len(valid_data) > 1:  # Ensure enough data for correlation
                correlation, p_value = spearmanr(valid_data[value_col], valid_data[comfort_col])
                results[comfort_col] = {'correlation': correlation, 'p_value': p_value}
    return results

def perform_chi_square_test(df, value_col, comfort_col):
    if value_col in df.columns and comfort_col in df.columns:
        # Create contingency table
        contingency_table = pd.crosstab(df[value_col], df[comfort_col])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        return {
            'chi2': chi2,
            'p_value': p,
            'degrees_of_freedom': dof,
            'expected': expected
        }
    return None

def analyze_relationships(df):
    # Map responses to numerical values
    df = map_responses(df, importance_mapping, value_columns.values())
    df = map_responses(df, comfort_mapping, comfort_columns)
    df = calculate_overall_comfort(df, comfort_columns)

    # Perform tests for each value and individual comfort levels
    for value_name, value_col in value_columns.items():
        print(f"\nAnalyzing {value_name}:")
        
        # Spearman correlation for individual comfort levels
        spearman_results = perform_spearman_test(df, value_col, comfort_columns)
        print(f"Spearman Correlation with Individual Comfort Levels:")
        for comfort_col, result in spearman_results.items():
            print(f"  {comfort_col}: Correlation = {result['correlation']:.3f}, p-value = {result['p_value']:.3f}")

        # Chi-square test for individual comfort levels
        print(f"Chi-Square Test with Individual Comfort Levels:")
        for comfort_col in comfort_columns:
            chi2_result = perform_chi_square_test(df, value_col, comfort_col)
            if chi2_result:
                print(f"  {comfort_col}: Chi2 = {chi2_result['chi2']:.3f}, p-value = {chi2_result['p_value']:.3f}")

        # Spearman correlation with overall comfort
        valid_data = df[[value_col, 'Overall_Comfort']].dropna()
        if len(valid_data) > 1:
            correlation, p_value = spearmanr(valid_data[value_col], valid_data['Overall_Comfort'])
            print(f"Spearman Correlation with Overall Comfort: Correlation = {correlation:.3f}, p-value = {p_value:.3f}")

        # Chi-square test with overall comfort (categorized if needed)
        df['Overall_Comfort_Category'] = pd.qcut(df['Overall_Comfort'], 4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
        chi2_result = perform_chi_square_test(df, value_col, 'Overall_Comfort_Category')
        if chi2_result:
            print(f"Chi-Square Test with Overall Comfort: Chi2 = {chi2_result['chi2']:.3f}, p-value = {chi2_result['p_value']:.3f}")

# Main execution
df = read_and_clean_csv("../data/survey_finalized.csv")
if df is not None:
    analyze_relationships(df)
    print("Analysis complete.")

Successfully read ../data/survey_finalized.csv.

Analyzing Individualism:
Spearman Correlation with Individual Comfort Levels:
  Q3.3: Correlation = 0.215, p-value = 0.003
  Q3.4: Correlation = 0.274, p-value = 0.000
  Q3.5: Correlation = nan, p-value = nan
  Q3.6_1: Correlation = 0.182, p-value = 0.014
  Q3.6_2: Correlation = 0.196, p-value = 0.008
  Q3.6_3: Correlation = 0.148, p-value = 0.046
  Q3.6_4: Correlation = 0.091, p-value = 0.222
  Q3.6_5: Correlation = 0.073, p-value = 0.323
  Q3.6_6: Correlation = 0.108, p-value = 0.146
  Q3.6_7: Correlation = 0.030, p-value = 0.689
Chi-Square Test with Individual Comfort Levels:
  Q3.3: Chi2 = 11.588, p-value = 0.072
  Q3.4: Chi2 = 32.873, p-value = 0.000
  Q3.5: Chi2 = 0.000, p-value = 1.000
  Q3.6_1: Chi2 = 10.170, p-value = 0.118
  Q3.6_2: Chi2 = 12.089, p-value = 0.060
  Q3.6_3: Chi2 = 6.867, p-value = 0.333
  Q3.6_4: Chi2 = 5.067, p-value = 0.535
  Q3.6_5: Chi2 = 8.353, p-value = 0.213
  Q3.6_6: Chi2 = 11.687, p-value = 0.069
  Q3.6

  correlation, p_value = spearmanr(valid_data[value_col], valid_data[comfort_col])
  correlation, p_value = spearmanr(valid_data[value_col], valid_data[comfort_col])
  correlation, p_value = spearmanr(valid_data[value_col], valid_data[comfort_col])


In [18]:
# Cell to analyze correlation between importance of values and influence of cultural background

# Define mappings
importance_mapping = {
    'Extremely Important': 5,
    'Very Important': 4,
    'Moderately Important': 3,
    'Somewhat Important': 2,
    'Slightly Important': 1,
    'Not Familiar At All': 0
}

influence_mapping = {
    'Extremely Significant Influence': 4,
    'Significant Influence': 3,
    'Moderate Influence': 2,
    'Minimal Influence': 1,
    'No Influence': 0
}

# Define value columns and influence column
value_columns = {
    'Individualism': 'Q5.1_1',
    'Collectivism': 'Q5.1_2',
    'Family Values': 'Q5.1_14'
}

def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None


df = read_and_clean_csv("../data/survey_finalized.csv")


influence_column = 'Q7.1'

def map_responses(df, mapping, columns):
    """Map categorical responses to numerical values."""
    for col in columns:
        if col in df.columns:
            unique_vals_before = df[col].dropna().unique()
            print(f"Unique values in {col} before mapping: {unique_vals_before}")
            df[col] = df[col].map(mapping).fillna(0)
            unique_vals_after = df[col].dropna().unique()
            print(f"Unique values in {col} after mapping: {unique_vals_after}")
            if df[col].eq(0).all() and len(unique_vals_before) > 1:
                print(f"Warning: All values in {col} mapped to 0 despite non-constant input. Check mapping keys: {mapping.keys()}")
        else:
            print(f"Warning: Column {col} not found in DataFrame.")
    return df

def perform_spearman_test(df, value_col, target_col):
    """Perform Spearman correlation test between a value and a target column."""
    if value_col in df.columns and target_col in df.columns:
        valid_data = df[[value_col, target_col]].dropna()
        if len(valid_data) > 1:
            if valid_data[value_col].nunique() == 1 or valid_data[target_col].nunique() == 1:
                return {'correlation': None, 'p_value': None}
            else:
                correlation, p_value = spearmanr(valid_data[value_col], valid_data[target_col])
                return {'correlation': correlation, 'p_value': p_value}
        else:
            return {'correlation': None, 'p_value': None}
    else:
        return {'correlation': None, 'p_value': None}

def perform_chi_square_test(df, value_col, target_col):
    """Perform Chi-square test between a value and a target column."""
    if value_col in df.columns and target_col in df.columns:
        contingency_table = pd.crosstab(df[value_col], df[target_col])
        if contingency_table.size == 0 or contingency_table.shape[0] == 1 or contingency_table.shape[1] == 1:
            return None
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        if (expected < 5).any():
            print(f"Warning: Some expected frequencies are less than 5 for {value_col} vs. {target_col}. Chi-square results may be unreliable.")
        return {
            'chi2': chi2,
            'p_value': p,
            'degrees_of_freedom': dof,
            'expected': expected
        }
    return None

def analyze_value_influence_correlation(df):
    """Analyze correlation between importance of values and influence of cultural background."""
    # Validate columns
    missing_value_cols = [col for col in value_columns.values() if col not in df.columns]
    if missing_value_cols:
        print(f"Error: Missing value columns: {missing_value_cols}")
        return
    if influence_column not in df.columns:
        print(f"Error: Missing influence column: {influence_column}")
        return

    # Map responses
    df = map_responses(df, importance_mapping, value_columns.values())
    df = map_responses(df, influence_mapping, [influence_column])

    # If all values are 0 after mapping and the input had variety, skip analysis with a clear message
    all_zero = all(df[col].eq(0).all() for col in value_columns.values())
    if all_zero and any(len(df[col].dropna().unique()) > 1 for col in value_columns.values()):
        print("\n=== Correlation Analysis: Importance of Values vs. Influence of Cultural Background on Trust ===")
        print("Error: All value columns mapped to 0 despite non-constant input. Correlation analysis skipped. Please update importance_mapping.")
        return

    # Perform correlation analysis
    print("\n=== Correlation Analysis: Importance of Values vs. Influence of Cultural Background on Trust ===")
    for value_name, value_col in value_columns.items():
        print(f"\nAnalyzing {value_name} vs. Influence of Cultural Background (Q7.1):")
        
        # Spearman correlation
        spearman_result = perform_spearman_test(df, value_col, influence_column)
        if spearman_result['correlation'] is not None:
            print(f"  Spearman Correlation: Correlation = {spearman_result['correlation']:.3f}, p-value = {spearman_result['p_value']:.3f}")
        else:
            print(f"  Spearman Correlation: Insufficient data for correlation or constant values.")

        # Chi-square test
        chi2_result = perform_chi_square_test(df, value_col, influence_column)
        if chi2_result:
            print(f"  Chi-Square Test: Chi2 = {chi2_result['chi2']:.3f}, p-value = {chi2_result['p_value']:.3f}, "
                  f"Degrees of Freedom = {chi2_result['degrees_of_freedom']}")
        else:
            print(f"  Chi-Square Test: Unable to perform Chi-square test (missing data or no variability).")
    print("=" * 50)

# Assuming df is already loaded from the previous main() function
# If running independently, uncomment the following lines:
# df = read_and_clean_csv("../data/survey_finalized.csv")
# if df is not None:
analyze_value_influence_correlation(df)

Successfully read ../data/survey_finalized.csv.
Unique values in Q5.1_1 before mapping: ['How important are the following values to you when it comes to online privacy? - Indivisualism (e.g. Independence)'
 'Extremely Important' 'Slightly Important' 'Moderately Important'
 'Very Important' 'Not at all Important']
Unique values in Q5.1_1 after mapping: [0. 5. 1. 3. 4.]
Unique values in Q5.1_2 before mapping: ['How important are the following values to you when it comes to online privacy? - Collectivism (e.g. Community influence)'
 'Not at all Important' 'Moderately Important' 'Slightly Important'
 'Very Important' 'Extremely Important']
Unique values in Q5.1_2 after mapping: [0. 3. 1. 4. 5.]
Unique values in Q5.1_14 before mapping: ['How important are the following values to you when it comes to online privacy? - Family Values (e.g. Importance of family relationships)'
 'Very Important' 'Extremely Important' 'Moderately Important'
 'Slightly Important' 'Not at all Important']
Unique val

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, chi2_contingency

# Define mappings
importance_mapping = {
    'Extremely Important': 5,
    'Very Important': 4,
    'Moderately Important': 3,
    'Somewhat Important': 2,
    'Slightly Important': 1,
    'Not Familiar At All': 0  # Assuming this is the lowest importance
}

# Mapping for Q9.1 frequency of checking privacy settings
q9_1_mapping = {
    'Yes, Always': 5,
    'Yes, Sometimes': 4,
    'No, Sometimes': 3,  # Adjusted based on image; might be a typo in your data
    'No, Rarely': 2,
    'No, Never': 1
}

# Define relevant columns
value_columns = {
    'Individualism': 'Q5.1_1',  # Adjust based on actual column name
    'Collectivism': 'Q5.1_2',   # Adjust based on actual column name
    'Family Values': 'Q5.1_14'  # Adjust based on actual column name
}

q9_1_column = 'Q9.1'  # Column for frequency of checking privacy settings

def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None

def map_responses(df, mapping, columns):
    for col in columns:
        if col in df.columns:
            df[col] = df[col].map(mapping).fillna(0)  # Fill NaN with 0 for unmapped responses
    return df

def perform_spearman_test(df, value_col, q9_1_col):
    if value_col in df.columns and q9_1_col in df.columns:
        valid_data = df[[value_col, q9_1_col]].dropna()
        if len(valid_data) > 1:  # Ensure enough data for correlation
            correlation, p_value = spearmanr(valid_data[value_col], valid_data[q9_1_col])
            return {'correlation': correlation, 'p_value': p_value}
    return None

def perform_chi_square_test(df, value_col, q9_1_col):
    if value_col in df.columns and q9_1_col in df.columns:
        # Create contingency table
        contingency_table = pd.crosstab(df[value_col], df[q9_1_col])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        return {
            'chi2': chi2,
            'p_value': p,
            'degrees_of_freedom': dof,
            'expected': expected
        }
    return None

def analyze_relationships(df):
    # Map responses to numerical values
    df = map_responses(df, importance_mapping, value_columns.values())
    df = map_responses(df, q9_1_mapping, [q9_1_column])

    # Perform tests for each value and Q9.1
    for value_name, value_col in value_columns.items():
        print(f"\nAnalyzing {value_name} with Q9.1 (Frequency of Checking Privacy Settings):")
        
        # Spearman correlation
        spearman_result = perform_spearman_test(df, value_col, q9_1_column)
        if spearman_result:
            print(f"Spearman Correlation: Correlation = {spearman_result['correlation']:.3f}, p-value = {spearman_result['p_value']:.3f}")

        # Chi-square test
        chi2_result = perform_chi_square_test(df, value_col, q9_1_column)
        if chi2_result:
            print(f"Chi-Square Test: Chi2 = {chi2_result['chi2']:.3f}, p-value = {chi2_result['p_value']:.3f}")

# Main execution
df = read_and_clean_csv("../data/survey_finalized.csv")
if df is not None:
    analyze_relationships(df)
    print("Analysis complete.")