# Kilter Board Predictor - Data Exploration

This notebook demonstrates the data exploration and basic statistics capabilities of the Kilter Board Predictor project. It shows how to analyze climbing route data to prepare for machine learning model training.

In [None]:
# Import necessary libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from kilter_board_predictor import DataExplorer, BasicStatistics

# Set up plotting style
# Start from matplotlib's built-in 'default' style sheet, then make "husl"
# the active seaborn color palette for the plots drawn later in the notebook.
plt.style.use('default')
sns.set_palette("husl")

%matplotlib inline

## 1. Create Sample Climbing Route Data

Since we don't have real data yet, let's create a sample dataset that represents typical climbing route data with features that might be relevant for predicting route difficulty.

In [None]:
# Seed NumPy's global RNG so that every run of this cell yields the same dataset.
np.random.seed(42)

# Number of synthetic routes to generate.
n_routes = 1000


def _to_v_grade(raw_values):
    """Clamp values to the V0-V16 range and truncate them to integers."""
    return np.clip(raw_values, 0, 16).astype(int)


def _blank_out(frame, column, fraction):
    """Set a random `fraction` of the rows of `column` to NaN, in place.

    Rows are drawn without replacement from the frame's index using the
    global NumPy RNG. Returns the index labels that were blanked.
    """
    chosen = np.random.choice(frame.index, size=int(fraction * len(frame)), replace=False)
    frame.loc[chosen, column] = np.nan
    return chosen


# Build the columns one at a time. The order of the random draws matters:
# it determines the values each column receives for the fixed seed above.
data = {'route_id': range(1, n_routes + 1)}
data['angle'] = np.random.choice([0, 20, 30, 40, 50, 60, 70], n_routes, p=[0.1, 0.15, 0.2, 0.25, 0.15, 0.1, 0.05])
data['num_holds'] = np.random.randint(8, 25, n_routes)
data['height'] = np.random.normal(3.5, 0.8, n_routes)  # Height in meters
data['wall_type'] = np.random.choice(['vertical', 'overhang', 'slab'], n_routes, p=[0.4, 0.5, 0.1])
data['hold_type'] = np.random.choice(['crimps', 'jugs', 'slopers', 'pinches', 'mixed'], n_routes, p=[0.3, 0.2, 0.2, 0.1, 0.2])
data['setter_experience'] = np.random.randint(1, 10, n_routes)  # Years of experience
# Placeholder success rate (0-1); it is overwritten further down once the
# final grades are known. The draw is kept so the RNG stream stays the same.
data['completion_rate'] = np.random.beta(2, 3, n_routes)
data['average_attempts'] = np.random.poisson(3, n_routes) + 1
data['grade_v_scale'] = np.random.randint(0, 17, n_routes)  # V0 to V16

# Inject correlations so the synthetic data looks more realistic.
# Steeper walls push the grade up by as much as 3 grades at 70 degrees.
steepness_bonus = (data['angle'] / 70) * 3
data['grade_v_scale'] = _to_v_grade(data['grade_v_scale'] + steepness_bonus)

# Routes with more than 15 holds get easier, routes with fewer get harder
# (0.2 grades per hold).
holds_adjustment = (data['num_holds'] - 15) * -0.2
data['grade_v_scale'] = _to_v_grade(data['grade_v_scale'] + holds_adjustment)

# Completion rate falls as the grade rises, with Gaussian noise added,
# and is bounded to the interval [0.05, 0.95].
relative_grade = data['grade_v_scale'] / 16
rate_noise = np.random.normal(0, 0.1, n_routes)
data['completion_rate'] = np.clip(1 - relative_grade * 0.7 + rate_noise, 0.05, 0.95)

# Assemble the DataFrame; columns keep the insertion order used above.
df = pd.DataFrame(data)

# Knock out some values to mimic real-world gaps:
# 5% of completion rates and 2% of heights.
missing_indices = _blank_out(df, 'completion_rate', 0.05)
missing_indices_2 = _blank_out(df, 'height', 0.02)

print(f"Created sample dataset with {len(df)} routes")
print(f"Shape: {df.shape}")
df.head(10)

## 2. Data Exploration with DataExplorer Class

Let's use our DataExplorer class to analyze the dataset.

In [None]:
# Initialize the DataExplorer
# Wraps the synthetic DataFrame `df` built in section 1. DataExplorer comes
# from the project's kilter_board_predictor package, not from this notebook.
explorer = DataExplorer(df)

# Get comprehensive data overview
# NOTE(review): what data_overview() prints or returns is defined in the
# project package. The result is stored in `overview` but is not read again
# in the cells visible here.
overview = explorer.data_overview()

In [None]:
# Analyze missing values
# visualize=True presumably also draws a chart of the gaps; that behavior lives
# in DataExplorer — confirm there. The key-insights cell calls the same method
# with visualize=False and reads a 'Missing_Count' entry from its result.
missing_analysis = explorer.missing_values_analysis(visualize=True)

In [None]:
# Plot distributions of numeric variables
# Histogram variant of plot_distributions; the banner is printed first so the
# section is labelled in the cell output.
print("=== Distribution Analysis ===")
explorer.plot_distributions(plot_type='histogram')

In [None]:
# Create box plots for outlier detection
# Same plot_distributions call as the histogram cell, switched to
# plot_type='box' so extreme values are easier to spot by eye.
print("=== Box Plots for Outlier Detection ===")
explorer.plot_distributions(plot_type='box')

In [None]:
# Analyze correlations
# Pearson correlation computed by the project's DataExplorer. `corr_matrix`
# is written out with .to_csv() in the final save cell, so it is presumably
# a pandas DataFrame — confirm against DataExplorer.correlation_analysis.
print("=== Correlation Analysis ===")
corr_matrix = explorer.correlation_analysis(method='pearson', visualize=True)

In [None]:
# Plot categorical distributions
# Called without arguments, so the choice of columns is made inside
# DataExplorer (presumably the string columns wall_type and hold_type — confirm).
print("=== Categorical Variable Distributions ===")
explorer.plot_categorical_distributions()

In [None]:
# Run the explorer's outlier detection with method='iqr' and plotting enabled.
print("=== Outlier Detection ===")
outliers = explorer.outlier_detection(method='iqr', visualize=True)

# One summary line per column: the number of flagged entries,
# or an explicit note when nothing was flagged.
for column_name, flagged in outliers.items():
    flagged_count = len(flagged)
    if flagged_count <= 0:
        print(f"{column_name}: No outliers detected")
        continue
    print(f"{column_name}: {flagged_count} outliers detected")

## 3. Statistical Analysis with BasicStatistics Class

Now let's perform comprehensive statistical analysis.

In [None]:
# Build the statistics analyzer on the same DataFrame used by the explorer.
stats_analyzer = BasicStatistics(df)

# Compute per-column descriptive statistics.
print("=== Descriptive Statistics ===")
desc_stats = stats_analyzer.descriptive_statistics()

# Print the headline numbers for every column whose entry has no 'error' key.
for column_name, column_stats in desc_stats.items():
    if 'error' in column_stats:
        continue
    print(f"\n{column_name}:")
    print(f"  Mean: {column_stats['mean']:.3f}, Median: {column_stats['median']:.3f}")
    print(f"  Std: {column_stats['std']:.3f}, Range: {column_stats['range']:.3f}")
    print(f"  Skewness: {column_stats['skewness']:.3f}, Kurtosis: {column_stats['kurtosis']:.3f}")

In [None]:
# Test for normality
print("=== Normality Tests ===")
normality_results = stats_analyzer.normality_tests()

for col, results in normality_results.items():
    # Columns whose entry carries an 'error' key are skipped.
    if 'error' in results:
        continue

    print(f"\n{col}:")

    # The Shapiro-Wilk line is printed only when its verdict is a real boolean.
    # Presumably the analyzer stores a non-boolean placeholder when the test
    # was not run — confirm against BasicStatistics.normality_tests.
    # A comparison on a NumPy p-value yields numpy.bool_, which is NOT a
    # subclass of Python's bool, so both types must be accepted; otherwise
    # valid Shapiro-Wilk results are silently dropped from the output.
    shapiro = results['shapiro_wilk']
    if isinstance(shapiro['is_normal'], (bool, np.bool_)):
        print(f"  Shapiro-Wilk: p={shapiro['p_value']:.6f}, Normal: {shapiro['is_normal']}")

    # The Kolmogorov-Smirnov result is printed unconditionally.
    ks_result = results['kolmogorov_smirnov']
    print(f"  Kolmogorov-Smirnov: p={ks_result['p_value']:.6f}, Normal: {ks_result['is_normal']}")

In [None]:
# Correlation statistics from the project's BasicStatistics analyzer.
print("=== Statistical Correlation Analysis ===")
corr_stats = stats_analyzer.correlation_statistics()

# Headline figures live under the 'summary' key.
corr_summary = corr_stats['summary']
print(f"Mean absolute correlation: {corr_summary['mean_absolute_correlation']:.4f}")
print(f"Number of strong correlations (|r| >= 0.7): {corr_summary['num_strong_pairs']}")
print(f"Number of high correlations (0.5 <= |r| < 0.7): {corr_summary['num_high_pairs']}")

# List every strong pair, if there are any.
strong_pairs = corr_stats['strong_correlations']
if strong_pairs:
    print("\nStrong correlations:")
    for left_var, right_var, coefficient in strong_pairs:
        print(f"  {left_var} - {right_var}: {coefficient:.4f}")

In [None]:
# Summary statistics for the categorical columns.
print("=== Categorical Statistics ===")
cat_stats = stats_analyzer.categorical_statistics()

# Print cardinality, mode and entropy for each column whose entry has no 'error' key.
for column_name, column_stats in cat_stats.items():
    if 'error' in column_stats:
        continue
    print(f"\n{column_name}:")
    print(f"  Unique values: {column_stats['unique_values']}")
    print(f"  Mode: {column_stats['mode']} ({column_stats['mode_percentage']:.1f}%)")
    print(f"  Entropy: {column_stats['entropy']:.3f}")

In [None]:
# Test each feature against the target column grade_v_scale.
print("=== Hypothesis Tests (Target: grade_v_scale) ===")
hypothesis_results = stats_analyzer.hypothesis_tests('grade_v_scale')

# Collect (feature, p-value, test name, significant flag) for every feature
# whose result has no 'error' key, ordered by ascending p-value so the most
# significant relationships come first.
significant_features = sorted(
    (
        (feature, result['p_value'], result['test_name'], result['significant'])
        for feature, result in hypothesis_results.items()
        if 'error' not in result
    ),
    key=lambda entry: entry[1],
)

print("\nFeature significance (sorted by p-value):")
for feature, p_value, test_name, significant in significant_features:
    # Three asterisks flag a significant result; blanks keep the columns aligned.
    marker = "***" if significant else "   "
    print(f"  {marker} {feature}: p={p_value:.6f} ({test_name})")

print("\n*** indicates p < 0.05 (statistically significant)")

## 4. Generate Comprehensive Reports

Let's generate comprehensive reports for both data exploration and statistical analysis.

In [None]:
# Generate data exploration report
# The report is printed here and written to a text file with f.write() in the
# final save cell, so generate_report() is expected to return a string.
print("=== DATA EXPLORATION REPORT ===")
exploration_report = explorer.generate_report()
print(exploration_report)

In [None]:
# Generate statistical analysis report
# grade_v_scale is passed as the target column. The report is printed here and
# written to a text file with f.write() in the final save cell, so it is
# expected to be a string.
print("\n=== STATISTICAL ANALYSIS REPORT ===")
stats_report = stats_analyzer.generate_statistics_report(target_column='grade_v_scale')
print(stats_report)

## 5. Key Insights for Machine Learning

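Based on our analysis, here are key insights that will inform our machine learning approach. Because the dataset above is synthetic, these insights illustrate the workflow rather than real properties of Kilter Board routes: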

In [None]:
# Summarise the findings of the earlier cells. This cell relies on `explorer`,
# `stats_analyzer`, `desc_stats` and `hypothesis_results` defined above.
print("=== KEY INSIGHTS FOR MACHINE LEARNING ===")

# --- 1. Data quality: share of missing cells in the whole table ---
print("\n1. DATA QUALITY:")
missing_summary = explorer.missing_values_analysis(visualize=False)
total_missing = missing_summary['Missing_Count'].sum()
total_cells = len(df) * len(df.columns)
missing_pct = (total_missing / total_cells) * 100
print(f"   - Total missing values: {total_missing} ({missing_pct:.2f}% of dataset)")
print("   - No major data quality issues detected")

# --- 2. Feature relationships: counts of strong / moderate correlations ---
print("\n2. FEATURE RELATIONSHIPS:")
corr_stats = stats_analyzer.correlation_statistics()
pair_counts = corr_stats['summary']
print(f"   - {pair_counts['num_strong_pairs']} strong correlations (|r| >= 0.7) found")
print(f"   - {pair_counts['num_high_pairs']} moderate correlations (0.5 <= |r| < 0.7) found")
if corr_stats['strong_correlations']:
    print("   - Watch for multicollinearity in these pairs:")
    # Only the first three strong pairs are listed.
    for left_var, right_var, coefficient in corr_stats['strong_correlations'][:3]:
        print(f"     * {left_var} - {right_var}: {coefficient:.3f}")

# --- 3. Target variable: range, centre and skew of grade_v_scale ---
print("\n3. TARGET VARIABLE (grade_v_scale):")
grade_stats = desc_stats['grade_v_scale']
grade_skew = grade_stats['skewness']
# Label the skew: above 0.5 is right-skewed, within +/-0.5 is roughly
# symmetric, anything else is treated as left-skewed.
if grade_skew > 0.5:
    skew_label = 'right-skewed'
elif abs(grade_skew) <= 0.5:
    skew_label = 'approximately symmetric'
else:
    skew_label = 'left-skewed'
print(f"   - Range: V{int(grade_stats['min'])} to V{int(grade_stats['max'])}")
print(f"   - Distribution: Mean={grade_stats['mean']:.1f}, Median={grade_stats['median']:.1f}")
print(f"   - Skewness: {grade_skew:.3f} ({skew_label})")

# --- 4. Features with a significant relationship to the grade ---
print("\n4. MOST PREDICTIVE FEATURES:")
significant_features = []
for feature_name, test_result in hypothesis_results.items():
    if 'error' not in test_result and test_result['significant']:
        significant_features.append((feature_name, test_result['p_value']))
significant_features.sort(key=lambda pair: pair[1])
print(f"   - {len(significant_features)} features show significant relationship with grade")
for feature, p_val in significant_features[:5]:  # Top 5
    print(f"     * {feature}: p={p_val:.6f}")

# --- 5. Modelling recommendations ---
print("\n5. RECOMMENDATIONS FOR MODEL TRAINING:")
print("   - Consider both regression and classification approaches")
print("   - Handle missing values (imputation or removal)")
# The multicollinearity advice is shown only when strong pairs were found.
if pair_counts['num_strong_pairs'] > 0:
    print("   - Address multicollinearity (PCA, feature selection, or regularization)")
print("   - Scale/normalize numeric features")
print("   - Encode categorical variables appropriately")
print("   - Consider feature engineering based on domain knowledge")

## 6. Save Analysis Results

Let's save our analysis results for future reference.

In [None]:
# Persist the dataset, both text reports and the correlation matrix under ../data.
# This cell relies on `df`, `exploration_report`, `stats_report` and
# `corr_matrix` produced by earlier cells.

# Create the output directory first: on a fresh checkout '../data' may not
# exist, in which case every write below would fail.
os.makedirs('../data', exist_ok=True)

# Save the sample dataset
df.to_csv('../data/sample_climbing_data.csv', index=False)
print("Sample dataset saved to '../data/sample_climbing_data.csv'")

# Save reports
# An explicit UTF-8 encoding keeps the files identical across platforms and
# avoids encode errors if a report contains non-ASCII characters.
with open('../data/exploration_report.txt', 'w', encoding='utf-8') as f:
    f.write(exploration_report)
print("Data exploration report saved to '../data/exploration_report.txt'")

with open('../data/statistics_report.txt', 'w', encoding='utf-8') as f:
    f.write(stats_report)
print("Statistical analysis report saved to '../data/statistics_report.txt'")

# Save correlation matrix
corr_matrix.to_csv('../data/correlation_matrix.csv')
print("Correlation matrix saved to '../data/correlation_matrix.csv'")

print("\nAll analysis results have been saved successfully!")