# Parkinson's Disease Prediction

## 1. Setup and Data Loading

In [None]:
# Imports: data handling, plotting, preprocessing, feature selection, models, and metrics
from IPython import get_ipython
from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE
from tqdm.notebook import tqdm
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

import warnings
warnings.filterwarnings('ignore')

# Limit how many columns pandas prints so previews of this wide table stay
# readable, then read the dataset from the Colab content directory.
pd.options.display.max_columns = 10
df = pd.read_csv('/content/parkinson_disease.csv')

## 2. Data Preprocessing and Feature Selection

In [None]:
# Show five randomly drawn rows as a quick sanity check of the loaded data.
print("Sample data:")
sample_rows = df.sample(n=5)
display(sample_rows)

In [None]:
# Print the frame's structural summary, then its descriptive statistics
# transposed so that each original column becomes one row.
print("\nDataset Info:")
df.info()
print("\nDataset Description:")
summary_stats = df.describe().transpose()
display(summary_stats)

In [None]:
# Count missing cells per column, then report the grand total.
missing_per_column = df.isnull().sum()
print("\nMissing values count:", missing_per_column.sum())

In [None]:
# Collapse all rows sharing an 'id' into their per-column mean; the 'id'
# itself is discarded, leaving a fresh positional index.
df = df.groupby('id').mean().reset_index(drop=True)

In [None]:
# Feature selection based on correlation.
#
# Greedy pass over the features: for each feature still present, every other
# feature whose correlation with it is above `corr_threshold` is dropped from
# `df`. The target column is never a drop candidate.
target_col = 'class'
corr_threshold = 0.7

columns = list(df.columns)
for col in columns:
    if col == target_col:
        continue

    filtered_columns = [col]
    for col1 in df.columns:
        if col1 == col:
            continue

        # Always keep the target. The previous guard compared `col` (not
        # `col1`) with 'class', which can never be true at this point, so a
        # feature correlating > 0.7 with the label would have removed the
        # label itself and broken the later `df.drop('class', ...)`.
        if col1 == target_col:
            filtered_columns.append(col1)
            continue

        if df[col].corr(df[col1]) > corr_threshold:
            # Redundant with `col`: leave it out of the frame and remove it
            # from `columns` so the outer loop never visits a column that no
            # longer exists in `df`.
            if col1 in columns:
                columns.remove(col1)
        else:
            filtered_columns.append(col1)

    # Shrink the frame after every feature so later iterations only compare
    # against columns that are still kept.
    df = df[filtered_columns]
print("Shape after correlation-based feature selection:", df.shape)

In [None]:
# Feature selection using SelectKBest with chi2
X = df.drop('class', axis=1)

# chi2 needs non-negative inputs, so the scores are computed on a min-max
# scaled copy; the selected columns are then taken from the unscaled frame.
X_norm = MinMaxScaler().fit_transform(X)

# The correlation filter may leave fewer than 30 features; cap k so
# SelectKBest does not raise in that case.
n_best = min(30, X.shape[1])
selector = SelectKBest(chi2, k=n_best)
selector.fit(X_norm, df['class'])

# Boolean mask over X's columns marking the selected features.
filtered_columns = selector.get_support()

# Copy before adding the target so the assignment writes to an independent
# frame rather than to a slice of X.
filtered_data = X.loc[:, filtered_columns].copy()
filtered_data['class'] = df['class']
df = filtered_data
print("Shape after SelectKBest feature selection:", df.shape)

In [None]:
# Hold out 20% of the rows for validation, then rebalance only the training
# portion with random oversampling (the validation split is left untouched).
target = df['class']
features = df.drop(columns='class')

X_train, X_val, Y_train, Y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=10,
)

ros = RandomOverSampler(sampling_strategy=1.0, random_state=0)
X, Y = ros.fit_resample(X_train, Y_train)

print("\nShape after oversampling:", X.shape)
print("Class distribution after oversampling:")
print(Y.value_counts())

In [None]:
# Second dataset: read 'parkinsons.data' and discard its 'name' column,
# which is not used as a model input.
parkinsons_data = pd.read_csv('/content/parkinsons.data').drop(columns=['name'])

In [None]:
# Separate the 'status' label from the remaining feature columns.
y_parkinsons = parkinsons_data['status']
X_parkinsons = parkinsons_data.drop('status', axis=1)

In [None]:
# Use RFE to select top 10 features.
# The file imports the class directly (`from sklearn.svm import SVC`); the
# name `svm` is never bound, so `svm.SVC(...)` raised NameError here.
model_for_rfe = SVC(kernel='linear')
rfe = RFE(estimator=model_for_rfe, n_features_to_select=10)
rfe.fit(X_parkinsons, y_parkinsons)

In [None]:
# Names of the columns RFE kept, taken via its boolean support mask over
# X_parkinsons' columns.
selected_features = X_parkinsons.columns[rfe.get_support()]
print("\nTop 10 selected features using RFE:", list(selected_features))

## 3. Exploratory Data Analysis

In [None]:
# Pie chart of how many rows fall into each value of 'class'.
class_counts = df['class'].value_counts()
plt.pie(class_counts.to_numpy(), labels=class_counts.index, autopct='%1.1f%%')
plt.title("Class Distribution")
plt.show()

## 4. Model Training and Evaluation

In [None]:
# Restrict the second dataset to the RFE-selected columns and hold out 20%
# of the rows for testing.
X_selected = X_parkinsons.loc[:, selected_features]
(X_train_selected,
 X_test_selected,
 y_train_selected,
 y_test_selected) = train_test_split(
    X_selected, y_parkinsons, test_size=0.2, random_state=2
)

# Standardise for the SVM: statistics are learned from the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

In [None]:
# Candidate classifiers to compare; the logistic regression comes first.
models = [
    LogisticRegression(class_weight='balanced'),
    XGBClassifier(),
    SVC(kernel='rbf', probability=True),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
]
# Classifier class name -> validation accuracy.
model_performance = dict()

# Scoring helper used by the model-comparison loop. Despite the short name,
# it is plain classification accuracy.
def ras(y_true, y_pred):
    """Return the accuracy of *y_pred* against *y_true*.

    Thin wrapper that forwards both arguments to ``accuracy_score``.
    """
    return accuracy_score(y_true, y_pred)


# Fit every candidate on the oversampled training data (X, Y), print its
# accuracy on that data and on the validation split, and record the
# validation figure under the classifier's class name.
for model in models:
    model_name = type(model).__name__
    model.fit(X, Y)
    print(f'{model_name} : ')

    train_preds = model.predict(X)
    train_accuracy = ras(Y, train_preds)
    print('Training Accuracy : ', train_accuracy)

    val_preds = model.predict(X_val)
    val_accuracy = ras(Y_val, val_preds)
    print('Validation Accuracy : ', val_accuracy)
    print()

    model_performance[model_name] = val_accuracy

# Pick the (name, accuracy) pair with the highest validation accuracy; on a
# tie the classifier recorded first wins.
best_model_name, best_accuracy = max(
    model_performance.items(), key=lambda entry: entry[1]
)

print(f"The best model based on validation accuracy is: {best_model_name} with an accuracy of {best_accuracy:.4f}")

In [None]:
# Per-class precision/recall/F1 on the validation split for the first entry
# of `models`.
print("\nClassification Report for Logistic Regression (first model):")
first_model = models[0]
first_model_val_preds = first_model.predict(X_val)
print(classification_report(Y_val, first_model_val_preds))

In [None]:
# Train and evaluate the SVM model with RFE selected features.
# `SVC` is the name actually imported (`from sklearn.svm import SVC`);
# `svm.SVC` raised NameError because the `svm` module is never imported.
final_model_svm = SVC(kernel='linear')
final_model_svm.fit(X_train_scaled, y_train_selected)

In [None]:
# Predict on both scaled splits and report the accuracy of each.
train_pred_svm = final_model_svm.predict(X_train_scaled)
test_pred_svm = final_model_svm.predict(X_test_scaled)

train_accuracy_svm = accuracy_score(y_train_selected, train_pred_svm)
test_accuracy_svm = accuracy_score(y_test_selected, test_pred_svm)

print('\nAccuracy Score on training data (SVM with top features and scaling): ', train_accuracy_svm)
print('Accuracy Score on testing data (SVM with top features and scaling): ', test_accuracy_svm)

## 5. Model Saving and Prediction

In [None]:
# Persist the fitted SVM to disk. Only the classifier is written; the scaler
# is not part of this file.
model_path = 'parkinson_pred_top_features.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(final_model_svm, model_file)
print("SVM model trained with top 10 features saved as parkinson_pred_top_features.pkl")

In [None]:
# Example prediction with the saved SVM model.

# Feature names, in the same order as the values in `input_data_values`.
model_features = ['MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:DDA', 'NHR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

# One value per entry of `model_features`.
input_data_values = (0.01098, 0.09700, 0.00563, 0.01689, 0.422229, 0.741367, -7.348300, 0.177551, 1.743867, 0.085569) # Example values - REPLACE with actual values

# Label the values by feature name instead of passing a bare array, so they
# can be matched to the scaler's columns by name. The constructor raises
# ValueError when the number of values and names differ.
input_frame = pd.DataFrame([input_data_values], columns=model_features)

# The scaler was fitted on the columns in `selected_features`. Previously the
# hard-coded list above was never consulted and the values were fed to the
# scaler purely by position, so any difference in order or membership between
# the two lists silently scaled values with another feature's statistics.
expected_features = list(selected_features)
if set(expected_features) == set(model_features):
    # Same features: reorder by name into the order seen at fit time.
    input_frame = input_frame[expected_features]
else:
    # Different features: keep the earlier positional behaviour, but say so.
    print("Warning: model_features does not match the RFE-selected features "
          f"{expected_features}; input values are being used positionally.")
    input_frame.columns = expected_features

# Scale the input data using the scaler fitted on the RFE selected features
std_data = scaler.transform(input_frame)

# Load the trained SVM model.
# NOTE: unpickling runs code embedded in the file; only load pickles that
# were produced by this notebook or another trusted source.
with open('parkinson_pred_top_features.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Make a prediction using the loaded model
prediction = loaded_model.predict(std_data)

print("\nPrediction for sample input:")
print(prediction)

if prediction[0] == 0:
    print("The Person does not have Parkinsons Disease")
else:
    print("The Person has Parkinsons")