# Group 18 — Dementia Prediction Project
## Step 1: Preprocessing + Descriptive Statistics

> **Dataset:** [Dementia Dataset on Kaggle](https://www.kaggle.com/datasets/fatemehmehrparvar/dementia)  
> **Note:** This model is purely statistical and is in no way meant for clinical diagnoses.

In [2]:
# If running in Colab, uncomment and run this cell first:
# !pip install pandas numpy matplotlib seaborn scipy

import os

# ── Path configuration ────────────────────────────────────────
# Relative paths; they are resolved against the current working directory.
DATA_DIR = '../Data/'
FIGURES_DIR = '../Outputs/Figures/'
MODELS_DIR = '../Outputs/Models/'
RESULTS_DIR = '../Outputs/Results/'

# Make sure the output folders exist. exist_ok=True keeps this cell safe to
# re-run. DATA_DIR is not created here.
for out_dir in (FIGURES_DIR, MODELS_DIR, RESULTS_DIR):
    os.makedirs(out_dir, exist_ok=True)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
import warnings
# Install a global "ignore" filter so no warning is shown in the notebook output.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations or
# numerical issues raised by the libraries above) — confirm this is intended.
warnings.filterwarnings('ignore')

print("✓ Libraries loaded successfully")

✓ Libraries loaded successfully


## 0. Load Data

> **Colab users:** Upload `OPTIMAL_combined_3studies_6feb2020.csv` first, then run the cell below.

In [4]:
# Load the raw combined-studies CSV from DATA_DIR.
#
# The raw frame is deliberately NOT written to RESULTS_DIR as
# 'dementia_clean.csv' here. Nothing has been cleaned at this point, so doing
# that would leave an unprocessed copy under the name used for the
# preprocessed dataset, which is saved at the end of the preprocessing step.
df = pd.read_csv(os.path.join(DATA_DIR, 'OPTIMAL_combined_3studies_6feb2020.csv'))
print(f"Raw data: {df.shape[0]} rows, {df.shape[1]} columns")
df.head()

Raw data: 1842 rows, 22 columns


Unnamed: 0,ID,age,gender,dementia,dementia_all,educationyears,EF,PS,Global,diabetes,...,hypercholesterolemia,lacunes_num,fazekas_cat,study,study1,SVD Simple Score,SVD Amended Score,Fazekas,lac_count,CMB_count
0,1,52.67,male,0.0,0,11.0,-2.403333,-1.29,-1.287,0,...,Yes,more-than-zero,2 to 3,scans,scans,3.0,7.0,3,>5,>=1
1,10,64.58,male,0.0,0,10.0,1.28,0.36,0.744,0,...,Yes,more-than-zero,0 to 1,scans,scans,2.0,3.0,1,1 to 2,>=1
2,100,74.92,male,0.0,0,8.0,-1.44,-1.52,-0.922,0,...,Yes,more-than-zero,0 to 1,scans,scans,1.0,2.0,1,1 to 2,0
3,101,74.83,male,1.0,1,9.0,,-2.136271,-1.301102,0,...,Yes,more-than-zero,2 to 3,scans,scans,2.0,4.0,2,3 to 5,0
4,102,79.25,male,0.0,0,10.0,-0.92,-1.493333,-0.924,0,...,Yes,more-than-zero,2 to 3,scans,scans,2.0,3.0,2,1 to 2,0


## 1. Descriptive Statistics Table

- **Continuous variables:** mean ± SD, independent t-test  
- **Categorical variables:** n (%), chi-squared test  
- Results saved to `descriptive_stats_table.csv`

In [None]:
# Build the descriptive statistics table comparing the two dementia groups,
# save it as CSV in RESULTS_DIR and display it.
#
# Rows whose `dementia` value is missing match neither group mask below, so
# they do not contribute to any group statistic or count.


def _significance_stars(p):
    """Return the significance marker ('***', '**', '*' or 'ns') for a p-value."""
    if p < 0.001:
        return '***'
    if p < 0.01:
        return '**'
    if p < 0.05:
        return '*'
    return 'ns'


rows = []

# Group masks and sizes are computed once from the data, so the column headers
# always match the dataset actually loaded instead of hard-coded counts.
no_dem = df['dementia'] == 0
has_dem = df['dementia'] == 1
n_no_dem = no_dem.sum()
n_dem = has_dem.sum()
col_no_dem = f"No Dementia (n={n_no_dem})"
col_dem = f"Dementia (n={n_dem})"

# ── Continuous variables ──────────────────────────────────────
num_vars = [
    ('age',            'Age (years)'),
    ('educationyears', 'Education (years)'),
    ('EF',             'Executive Function (EF)'),
    ('PS',             'Processing Speed (PS)'),
    ('Global',         'Global Cognitive Score'),
]
for var, label in num_vars:
    # Missing values are dropped per variable, so each test uses all rows
    # available for that variable.
    g0 = df.loc[no_dem, var].dropna()
    g1 = df.loc[has_dem, var].dropna()
    # Independent two-sample t-test with scipy's default settings.
    # NOTE(review): the default assumes equal variances; with groups this
    # unbalanced, Welch's test (equal_var=False) may be preferable — confirm.
    _, p = stats.ttest_ind(g0, g1)
    rows.append({
        'Variable':     label,
        'Type':         'Continuous',
        'Group':        'mean +/- sd',
        col_no_dem:     f"{g0.mean():.2f} +/- {g0.std():.2f}",
        col_dem:        f"{g1.mean():.2f} +/- {g1.std():.2f}",
        'Test':         't-test',
        'p-value':      f"{p:.4f}",
        'Significance': _significance_stars(p),
    })

# ── Categorical variables ─────────────────────────────────────
# Each entry: (column name, display label, raw value -> display value).
cat_vars = [
    ('gender',               'Gender',               {'female': 'Female', 'male': 'Male'}),
    ('smoking',              'Smoking',               {'never-smoker': 'Never', 'ex-smoker': 'Ex-smoker', 'current-smoker': 'Current'}),
    ('hypertension',         'Hypertension',          {'Yes': 'Yes', 'No': 'No'}),
    ('hypercholesterolemia', 'Hypercholesterolaemia', {'Yes': 'Yes', 'No': 'No'}),
    ('diabetes',             'Diabetes',              {0: 'No', 1: 'Yes'}),
    ('fazekas_cat',          'Fazekas Category',      {'0 to 1': '0-1', '2 to 3': '2-3'}),
    ('lacunes_num',          'Lacunes',               {'zero': 'Zero', 'more-than-zero': '>0'}),
    ('CMB_count',            'CMB Count',             {'0': '0', '>=1': '>=1'}),
]
for var, label, val_map in cat_vars:
    ct = pd.crosstab(df[var], df['dementia'])
    _, p, _, _ = stats.chi2_contingency(ct)
    sig = _significance_stars(p)
    # One output row per observed level. The variable label and the test
    # result are printed on the first row of each variable only.
    for position, val in enumerate(df[var].dropna().unique()):
        first = position == 0
        is_val = df[var] == val
        n0 = (no_dem & is_val).sum()
        n1 = (has_dem & is_val).sum()
        # Percentages use the full group size as denominator, i.e. rows with a
        # missing value for this variable are still counted in the denominator.
        pct0 = n0 / n_no_dem * 100
        pct1 = n1 / n_dem * 100
        rows.append({
            'Variable':     label if first else '',
            'Type':         'Categorical',
            'Group':        val_map.get(val, str(val)),
            col_no_dem:     f"{n0} ({pct0:.1f}%)",
            col_dem:        f"{n1} ({pct1:.1f}%)",
            'Test':         'Chi2 test' if first else '',
            'p-value':      f"{p:.4f}" if first else '',
            'Significance': sig if first else '',
        })

table = pd.DataFrame(rows)
# Write next to the other results instead of the current working directory.
table_path = os.path.join(RESULTS_DIR, 'descriptive_stats_table.csv')
table.to_csv(table_path, index=False)
print(f"✓ Saved: {table_path}")
table

## 2. Preprocessing

**Steps:**
1. Drop redundant/high-missingness columns
2. Remove rows with missing target (`dementia`)
3. Encode binary & ordinal categorical variables
4. One-hot encode `study1`
5. Impute missing values in `EF`, `PS`, `Global` and the ordinal-encoded `smoking` with the column median

In [None]:
# Build the modelling dataset `df_clean` from the raw frame `df`:
# drop redundant columns, drop rows without a target, encode categorical
# columns as numbers, one-hot encode `study1`, impute with medians, and save
# the result as 'dementia_clean.csv' in RESULTS_DIR.


def _encode_column(frame, col, mapping):
    """Replace the labels of frame[col] via `mapping`, in place.

    Series.map turns every label that is not a key of `mapping` into NaN
    without any error, so labels lost that way are printed. A typo or an
    unexpected category therefore cannot disappear unnoticed into the
    imputation step or the final missing-value count.
    """
    original = frame[col]
    encoded = original.map(mapping)
    lost = original[original.notna() & encoded.isna()].unique()
    if len(lost) > 0:
        print(f"  ! {col}: unmapped values converted to NaN: {list(lost)}")
    frame[col] = encoded


df_clean = df.copy()

# ── Drop redundant columns ────────────────────────────────────
# fazekas_cat duplicates Fazekas (numeric); lacunes_num duplicates lac_count
# SVD Scores: 36.8% missing and overlap with other imaging vars
drop_cols = ['ID', 'study', 'dementia_all', 'fazekas_cat',
             'lacunes_num', 'SVD Simple Score', 'SVD Amended Score']
df_clean.drop(columns=drop_cols, inplace=True)
print(f"✓ Dropped columns: {drop_cols}")

# ── Drop rows missing target ──────────────────────────────────
df_clean.dropna(subset=['dementia'], inplace=True)
print(f"✓ After dropping missing dementia: {len(df_clean)} rows")

# ── Binary encoding ───────────────────────────────────────────
binary_map = {
    'gender':               {'male': 0, 'female': 1},
    'hypertension':         {'No': 0,  'Yes': 1},
    'hypercholesterolemia': {'No': 0,  'Yes': 1},
    'CMB_count':            {'0': 0,   '>=1': 1},
}
for col, mapping in binary_map.items():
    _encode_column(df_clean, col, mapping)
print("✓ Binary variables encoded")

# ── Ordinal encoding ──────────────────────────────────────────
smoking_map = {'never-smoker': 0, 'ex-smoker': 1, 'current-smoker': 2}
_encode_column(df_clean, 'smoking', smoking_map)

lac_map = {'Zero': 0, '1 to 2': 1, '3 to 5': 2, '>5': 3}
_encode_column(df_clean, 'lac_count', lac_map)
print("✓ Ordinal variables encoded")

# ── One-hot encode study1 ─────────────────────────────────────
# drop_first=True omits the first level's indicator column.
df_clean = pd.get_dummies(df_clean, columns=['study1'], drop_first=True)
print("✓ study1 one-hot encoded")

# ── Median imputation ─────────────────────────────────────────
# `smoking` is already ordinal-encoded at this point, so its median is one of
# the codes 0/1/2 (or a midpoint between two of them).
# NOTE(review): medians are computed over all rows; if the data is later split
# into train/test sets, consider fitting the imputation on the training part only.
num_fill = ['EF', 'PS', 'Global', 'smoking']
for col in num_fill:
    median_val = df_clean[col].median()
    n_missing  = df_clean[col].isnull().sum()
    df_clean[col] = df_clean[col].fillna(median_val)
    if n_missing > 0:
        print(f"  {col}: imputed {n_missing} missing values (median = {median_val:.2f})")

print(f"\n✓ Final shape: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
print(f"✓ Remaining missing values: {df_clean.isnull().sum().sum()}")
print(f"\nFinal columns:\n{list(df_clean.columns)}")

# Save into RESULTS_DIR. This is the location where the loading cell writes a
# raw copy under the same file name, so writing here replaces that raw copy
# with the cleaned data instead of leaving the cleaned file only in the
# working directory.
clean_path = os.path.join(RESULTS_DIR, 'dementia_clean.csv')
df_clean.to_csv(clean_path, index=False)
print(f"\n✓ Saved: {clean_path}")

## 3. Exploratory Data Analysis (EDA)

**Design principles:**
- Continuous variables → **KDE density curves** (normalised) to handle class imbalance (95% vs 5%)
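- Categorical variables → **% bar charts normalised within each category level** (e.g. the share of dementia vs no dementia among participants with and without hypertension) for direct comparison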
- Correlation heatmap for key numeric variables

In [None]:
# Draw the 4x3 EDA figure (class balance, three KDE panels, five percentage
# bar panels, correlation heatmap) from the raw frame `df` and save it as
# 'descriptive_statistics.png' in FIGURES_DIR.

plt.rcParams['font.family'] = 'DejaVu Sans'
colors = ['#4C72B0', '#DD8452']
labels = ['No Dementia', 'Dementia']


def _plot_kde_by_group(ax, column, title, xlabel):
    """Overlay one KDE curve of `column` per dementia group (0, then 1) on `ax`."""
    for code, (name, color) in enumerate(zip(labels, colors)):
        values = df.loc[df['dementia'] == code, column].dropna()
        values.plot.kde(ax=ax, color=color, linewidth=2.5,
                        label=f"{name} (n={len(values)})")
    ax.set_title(f'{title}\n(KDE — density normalised)', fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    ax.legend()


def _plot_pct_bars(ax, column, title, xlabel, order=None, tick_map=None, rotation=0):
    """Plot, for every level of `column`, the percentage of rows per dementia group.

    Percentages are normalised within each level of `column` (each row of the
    crosstab sums to 100).

    order:    optional list giving the display order of the levels; levels
              not listed are appended after the listed ones.
    tick_map: optional dict translating a level into its tick label.
    rotation: tick label rotation; rotated labels are right-aligned.
    """
    ct = pd.crosstab(df[column], df['dementia'])
    ct_pct = ct.div(ct.sum(axis=1), axis=0) * 100
    if order is not None:
        listed = [lvl for lvl in order if lvl in ct_pct.index]
        unlisted = [lvl for lvl in ct_pct.index if lvl not in order]
        ct_pct = ct_pct.loc[listed + unlisted]
    ct_pct.plot(kind='bar', ax=ax, color=colors, edgecolor='white', legend=True)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('% within group')
    # Tick labels are derived from the plotted levels, so they cannot get out
    # of sync with the bars when a level is absent or ordered differently.
    if tick_map is None:
        tick_labels = list(ct_pct.index)
    else:
        tick_labels = [tick_map.get(lvl, lvl) for lvl in ct_pct.index]
    text_kwargs = {'rotation': rotation}
    if rotation:
        text_kwargs['ha'] = 'right'
    ax.set_xticklabels(tick_labels, **text_kwargs)
    ax.legend(labels)


fig = plt.figure(figsize=(18, 20))
gs  = gridspec.GridSpec(4, 3, figure=fig, hspace=0.45, wspace=0.35)

# ── Plot 1: Target distribution (class imbalance) ─────────────
ax0 = fig.add_subplot(gs[0, 0])
counts = df['dementia'].value_counts().sort_index()
total = counts.sum()
# Group sizes in the bar labels come from the data rather than fixed numbers.
bar_names = [f"{name}\n(n={n})" for name, n in zip(labels, counts.values)]
bars = ax0.bar(bar_names, counts.values, color=colors, edgecolor='white', linewidth=1.5)
for bar, val in zip(bars, counts.values):
    ax0.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
             f'{val}\n({val/total*100:.1f}%)',
             ha='center', fontsize=10)
ax0.set_title('Dementia Distribution\n(Class Imbalance)', fontsize=12, fontweight='bold')
ax0.set_ylabel('Count')
# At least 2000 as before, but grow if a class is larger so the count
# annotation above the tallest bar stays inside the axes.
ax0.set_ylim(0, max(2000, counts.max() * 1.15))

# ── Plot 2: Age KDE ───────────────────────────────────────────
ax1 = fig.add_subplot(gs[0, 1])
_plot_kde_by_group(ax1, 'age', 'Age Distribution by Dementia', 'Age (years)')
ax1.set_xlim(30, 100)

# ── Plot 3: EF KDE ────────────────────────────────────────────
ax2 = fig.add_subplot(gs[0, 2])
_plot_kde_by_group(ax2, 'EF', 'Executive Function (EF)', 'EF Score')

# ── Plot 4: PS KDE ────────────────────────────────────────────
ax3 = fig.add_subplot(gs[1, 0])
_plot_kde_by_group(ax3, 'PS', 'Processing Speed (PS)', 'PS Score')

# ── Plot 5: Hypertension % bar ────────────────────────────────
ax4 = fig.add_subplot(gs[1, 1])
_plot_pct_bars(ax4, 'hypertension', 'Hypertension vs Dementia', 'Hypertension')

# ── Plot 6: Diabetes % bar ────────────────────────────────────
ax5 = fig.add_subplot(gs[1, 2])
_plot_pct_bars(ax5, 'diabetes', 'Diabetes vs Dementia', 'Diabetes (0=No, 1=Yes)',
               tick_map={0: 'No', 1: 'Yes'})

# ── Plot 7: Smoking % bar ─────────────────────────────────────
# Shown in ordinal order (never -> ex -> current) instead of alphabetical.
ax6 = fig.add_subplot(gs[2, 0])
_plot_pct_bars(ax6, 'smoking', 'Smoking Status vs Dementia', 'Smoking Status',
               order=['never-smoker', 'ex-smoker', 'current-smoker'], rotation=15)

# ── Plot 8: Fazekas % bar ─────────────────────────────────────
ax7 = fig.add_subplot(gs[2, 1])
_plot_pct_bars(ax7, 'Fazekas', 'Fazekas Score vs Dementia', 'Fazekas Score (0-3)',
               tick_map={score: str(score) for score in range(4)})

# ── Plot 9: Lacune count % bar ────────────────────────────────
# Shown in ordinal order (Zero first) instead of alphabetical.
ax8 = fig.add_subplot(gs[2, 2])
_plot_pct_bars(ax8, 'lac_count', 'Lacune Count vs Dementia', 'Lacune Count',
               order=['Zero', '1 to 2', '3 to 5', '>5'], rotation=15)

# ── Plot 10: Correlation heatmap ──────────────────────────────
ax9 = fig.add_subplot(gs[3, :])
corr_vars = ['dementia', 'age', 'educationyears', 'EF', 'PS', 'Global', 'diabetes', 'Fazekas']
corr_df = df[corr_vars].copy()
corr_df['dementia'] = pd.to_numeric(corr_df['dementia'], errors='coerce')
corr_matrix = corr_df.corr()
# Hide the upper triangle including the diagonal; only the lower half is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, ax=ax9, annot=True, fmt='.2f',
            cmap='RdBu_r', center=0, vmin=-1, vmax=1,
            square=True, linewidths=0.5, cbar_kws={'shrink': 0.6})
ax9.set_title('Correlation Matrix (Key Variables)', fontsize=12, fontweight='bold')

fig.suptitle('Dementia Prediction — Descriptive Statistics & EDA  |  Group 18',
             fontsize=15, fontweight='bold', y=0.98)

# Save into the figures folder created by the path configuration cell rather
# than the current working directory.
fig_path = os.path.join(FIGURES_DIR, 'descriptive_statistics.png')
plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
plt.show()
print(f"✓ Saved: {fig_path}")

## Summary of Output Files

| File | Description |
|------|-------------|
| `dementia_clean.csv` | Preprocessed modelling dataset |
| `descriptive_stats_table.csv` | Descriptive statistics summary table |
| `descriptive_statistics.png` | EDA visualisation charts |