In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, chi2, VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Fetch the Parkinson's dataset as CSV directly from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
data = pd.read_csv(url)

# Confirm the load and preview the first rows.
print("Dataset Loaded Successfully!\n")
print(data.head())

# Target is the 'status' column (1 = Parkinson's, 0 = Healthy).
# Features are every other column except 'name', which is not used as a feature.
y = data['status']
X = data.drop(['name', 'status'], axis=1)

# Min-max scale the features so they are non-negative, as chi-square requires.
# The scaler returns a plain array, so re-wrap it with the original column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

# Variance Threshold: drop features whose variance does not exceed 0.01.
print("\n--- Variance Threshold ---")
# The threshold is applied to X_scaled, so it is relative to the scaled
# feature values rather than the raw measurement units.
vt = VarianceThreshold(threshold=0.01)
vt.fit(X_scaled)
X_vt = vt.transform(X_scaled)
# get_support() is a boolean mask over the input columns; use it to recover
# the names of the features that were kept.
kept_mask = vt.get_support()
selected_features_vt = X.columns[kept_mask]
print(f"Features selected ({len(selected_features_vt)} features): {list(selected_features_vt)}")

# Information Gain: mutual information between each feature and the target.
print("\n--- Information Gain ---")
# mutual_info_classif adds small random noise to continuous features, so
# without a fixed seed the scores (and the top-10 ranking below) can differ
# between runs. Pin random_state to make the results reproducible.
info_gain = mutual_info_classif(X_scaled, y, random_state=42)
# Label each score with its feature name and rank from most to least informative.
info_gain_series = pd.Series(info_gain, index=X.columns).sort_values(ascending=False)
print(info_gain_series)

# Select top 10 based on Information Gain
top_info_gain = info_gain_series.head(10)
print("\nTop 10 Features based on Information Gain:")
print(top_info_gain)

# Chi-Square Test: score each scaled (non-negative) feature against the target.
print("\n--- Chi-Square Test ---")
# NOTE(review): chi2 is typically described for count/frequency-like features;
# scores on min-max scaled continuous features are presumably only a rough
# ranking heuristic. Confirm this is the intended use.
chi_scores, p_values = chi2(X_scaled, y)
# Attach feature names, then order from highest to lowest chi-square score.
chi2_series = pd.Series(chi_scores, index=X.columns)
chi2_series = chi2_series.sort_values(ascending=False)
print(chi2_series)

# Keep the ten highest-scoring features.
top_chi2 = chi2_series.iloc[:10]
print("\nTop 10 Features based on Chi-Square Test:")
print(top_chi2)

# Summary: recap the feature counts and the two top-10 rankings.
print("\n--- Summary ---")
print(f"Total Features: {X.shape[1]}")
print(f"Features after Variance Threshold: {len(selected_features_vt)}")
# Both rankings are reported the same way: a heading, then the feature names.
for heading, ranking in (
    ("\nTop 10 Features by Information Gain:", top_info_gain),
    ("\nTop 10 Features by Chi-Square:", top_chi2),
):
    print(heading)
    print(list(ranking.index))


Vocabulary: ['blue', 'bright', 'can', 'in', 'is', 'see', 'shining', 'sky', 'sun', 'the', 'we']

Bag of Words Matrix:

   blue  bright  can  in  is  see  shining  sky  sun  the  we
0     1       0    0   0   1    0        0    1    0    1   0
1     0       1    0   0   1    0        0    0    1    1   0
2     0       1    0   1   1    0        0    1    1    2   0
3     0       1    1   0   0    1        1    0    2    2   1
