                                                             DIABETES PREDICTION MODEL

In [1]:
import numpy as np #imports the NumPy library 
import pandas as pd #imports the pandas library
from sklearn.model_selection import train_test_split # imports the train_test_split function from the model_selection
from sklearn import svm #imports the svm module from scikit-learn (sklearn)
from sklearn.metrics import accuracy_score #imports the accuracy_score function from the metrics

import warnings #imports the built-in warnings module in Python. 
warnings.filterwarnings("ignore") # filter for warnings so that warnings generated during program execution are ignored and not displayed.

In [2]:
diabetes_dataset = pd.read_csv('diabetes.csv') #reading data from the data


In [3]:
diabetes_dataset.head() # displays the top rows of the DataFrame


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
diabetes_dataset.shape # used to retrieve the dimensions (i.e., the number of rows and columns)

(768, 9)

In [5]:
diabetes_dataset.describe() #used to generate descriptive statistics for the numeric columns of the DataFrame 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
diabetes_dataset['Outcome'].value_counts() #used to count the occurrences of each unique value in the column

Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:
diabetes_dataset.groupby('Outcome').mean() #used to calculate the mean (average) values of all numeric columns

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
X = diabetes_dataset.drop(columns = 'Outcome', axis=1) #X is a DataFrame containing the independent variables
Y = diabetes_dataset['Outcome'] #Y is a Series containing the target variable

In [9]:
print(X) #Printing the DataFrame 

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [10]:
print(Y) #printing the dataframe

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2,
stratify=Y, random_state=2) #This function splits the dataset X (features) and Y (target) into training and testing sets.
print(X.shape, X_train.shape, X_test.shape) #This line prints the shapes (dimensions) of the original features (X)


(768, 8) (614, 8) (154, 8)


In [12]:
classifier = svm.SVC(kernel='linear') #This line of code initializes a Support Vector Classifier (SVC) from scikit-learn with a linear kernel.

In [13]:
classifier.fit(X_train, Y_train) #The line fits (trains) the SVM classifier (classifier) on the training data (X_train and Y_train).


In [14]:
X_train_prediction = classifier.predict(X_train) #This line calculates the accuracy of the predictions made by the classifier against the actual labels 
training_data_accuracy = accuracy_score(X_train_prediction, Y_train) #the accuracy score 

In [15]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7833876221498371


In [16]:
X_test_prediction = classifier.predict(X_test) #This line predicts the labels (Y_test) for the test input features (X_test) using the trained SVM classifier 
test_data_accuracy = accuracy_score(X_test_prediction, Y_test) #This line calculates the accuracy score of the test predictions (X_test_prediction) compared to the actual test labels (Y_test).


In [17]:
print('Accuracy score of the test data : ', test_data_accuracy) # Calculate accuracy score of the test data


Accuracy score of the test data :  0.7727272727272727


In [18]:
input_data = (5,166,72,19,175,25.8,0.587,51) # Define input data as a tuple

In [19]:
input_data_as_numpy_array = np.asarray(input_data) # Convert the input_data tuple into a NumPy array


In [20]:
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)# Reshape the input_data_as_numpy_array to be a row vector

In [21]:
prediction = classifier.predict(input_data_reshaped) # Use the trained classifier to make predictions on the reshaped input data
print(prediction) # Print the predicted outcome


[1]


In [22]:
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')
# Check the predicted value and print the corresponding message

The person is diabetic


In [23]:
import pickle # Import the pickle module for object serialization and deserialization


In [24]:
filename = 'diabetes_model.sav'
pickle.dump(classifier, open(filename, 'wb')) # Serialize the classifier object to a binary file named 'diabetes_model.sav'


In [25]:

# Load the saved model from the file 'diabetes_model.sav'
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))
# Define input data for prediction
input_data = (5,166,72,19,175,25.8,0.587,51)


In [26]:
input_data_as_numpy_array = np.asarray(input_data) # Convert the input_data tuple to a numpy array

In [27]:
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1) # Reshape the numpy array to have one row and multiple columns (-1)


In [28]:
prediction = loaded_model.predict(input_data_reshaped) # Use the loaded model to make a prediction on the reshaped input data
print(prediction) # Print the prediction result


[1]


In [29]:
# Check the prediction result
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

# Display the column names in X
for column in X.columns:
    print(column)


The person is diabetic
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age


                                                        HEART DISEASE PREDICTION MODEL	

In [30]:
import numpy as np # Importing NumPy library for numerical computations
import pandas as pd # Importing Pandas library for data manipulation
from sklearn.model_selection import train_test_split # Importing train_test_split for splitting data
from sklearn.linear_model import LogisticRegression # Importing LogisticRegression model from sklearn
from sklearn.metrics import accuracy_score# Importing accuracy_score for evaluating model performance

import warnings # Importing warnings module to suppress unnecessary warnings
warnings.filterwarnings("ignore")  # Ignoring warnings to avoid clutter in the output

In [31]:
heart_data = pd.read_csv('heart.csv') # Reading a CSV file named 'heart.csv' into a Pandas DataFrame called heart_data

In [32]:
heart_data.head() # Displaying the first 5 rows of the heart_data DataFrame

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [33]:
heart_data.tail()# Displaying the last 5 rows of the heart_data DataFrame

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
298,299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes
302,303,38,1,nonanginal,138,175,0,0,173,0,0.0,1,,normal,No


In [34]:
heart_data.shape # Getting the dimensions (number of rows and columns)

(303, 15)

In [35]:
heart_data.info() # Displaying clear information including datatype about the heart_data DataFrame


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  303 non-null    int64  
 1   Age         303 non-null    int64  
 2   Sex         303 non-null    int64  
 3   ChestPain   303 non-null    object 
 4   RestBP      303 non-null    int64  
 5   Chol        303 non-null    int64  
 6   Fbs         303 non-null    int64  
 7   RestECG     303 non-null    int64  
 8   MaxHR       303 non-null    int64  
 9   ExAng       303 non-null    int64  
 10  Oldpeak     303 non-null    float64
 11  Slope       303 non-null    int64  
 12  Ca          299 non-null    float64
 13  Thal        301 non-null    object 
 14  AHD         303 non-null    object 
dtypes: float64(2), int64(10), object(3)
memory usage: 35.6+ KB


In [36]:
heart_data.isnull().sum()# Counting the number of missing (null) values in each column of heart_data


Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64

In [37]:
heart_data.describe()# Generating descriptive statistics for numerical columns in heart_data


Unnamed: 0.1,Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0
mean,152.0,54.438944,0.679868,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241
std,87.612784,9.038662,0.467299,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438
min,1.0,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,76.5,48.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,152.0,56.0,1.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,227.5,61.0,1.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0
max,303.0,77.0,1.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0


In [38]:
print(heart_data.columns) # Printing the column names of the heart_data DataFrame


Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')


In [39]:
heart_data['Unnamed: 0'].value_counts() # Counting the occurrences of each value in the 'Unnamed: 0' column

Unnamed: 0
1      1
209    1
207    1
206    1
205    1
      ..
101    1
100    1
99     1
98     1
303    1
Name: count, Length: 303, dtype: int64

In [40]:
print(X) # Print the features (input data)
print(Y) # Print the target variable


     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
stratify=Y, random_state=2) # Splitting the data into training and testing sets
print(X.shape, X_train.shape, X_test.shape) # Printing the shapes (dimensions) of the original dataset, training set, and testing set


(768, 8) (614, 8) (154, 8)


In [42]:
model = LogisticRegression() # Creating an instance of LogisticRegression model

In [43]:
model.fit(X_train, Y_train) # Training the logistic regression model using the training data


In [44]:
X_train_prediction = model.predict(X_train) # Making predictions on the training data using the trained model
training_data_accuracy = accuracy_score(X_train_prediction, Y_train) # Calculating accuracy on the training data
print('Accuracy on Training data : ', training_data_accuracy) # Printing the accuracy on the training data

Accuracy on Training data :  0.7850162866449512


In [45]:
X_test_prediction = model.predict(X_test) # Making predictions on the test data using the trained model
test_data_accuracy = accuracy_score(X_test_prediction, Y_test) # Calculating accuracy on the test data
print('Accuracy on Test data : ', test_data_accuracy) 
# Printing the accuracy on the test data

Accuracy on Test data :  0.7532467532467533


In [46]:
input_data = (62,0,0,140,268,0,0,160) # Define input data as a tuple representing features

In [47]:
input_data_as_numpy_array= np.asarray(input_data) # Convert input_data to a NumPy array

In [48]:
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1) # Reshape input data to a 2D array with a single sample

In [49]:

prediction = model.predict(input_data_reshaped) # Making predictions using the logistic regression model
print(prediction)

if prediction[0] == 0:
    print('The Person does not have a Heart Disease')
else:
    print('The Person has Heart Disease')



[1]
The Person has Heart Disease


In [50]:
import pickle # Importing the pickle module for saving and loading model


In [51]:
filename = 'heart_disease_model.sav'
pickle.dump(model, open(filename, 'wb')) # Save the trained model to a file using pickle

In [52]:
loaded_model = pickle.load(open('heart_disease_model.sav', 'rb'))  # Load the saved model from file


In [53]:
for column in X.columns:
    print(column) # Print each column name in the DataFrame X


Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age


                                                     PARKINSONS DISEASE PREDICTION

In [54]:

import numpy as np # Importing NumPy library and aliasing it as np
import pandas as pd # Importing pandas library and aliasing it as pd
from sklearn.model_selection import train_test_split # Importing train_test_split function from sklearn.model_selection
from sklearn import svm # Importing Support Vector Machine (SVM) classifier from sklearn
from sklearn.metrics import accuracy_score # Importing accuracy_score metric from sklearn.metrics


In [55]:
parkinson_df = pd.read_csv('parkinsons2.csv') #parkinson_df = pd.read_csv('parkinsons2.csv')


In [56]:
parkinson_df.head() # Display the first 5 rows of the DataFrame `parkinson_df`


Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,1
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,1
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,1
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,1
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,1


In [57]:
parkinson_df.shape  # Retrieve the dimensions (number of rows, number of columns) of the DataFrame parkinson_df

(195, 23)

In [58]:
parkinson_df.info() # Display clear summary information including datatype about the DataFrame parkinson_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MDVP:Fo(Hz)       195 non-null    float64
 1   MDVP:Fhi(Hz)      195 non-null    float64
 2   MDVP:Flo(Hz)      195 non-null    float64
 3   MDVP:Jitter(%)    195 non-null    float64
 4   MDVP:Jitter(Abs)  195 non-null    float64
 5   MDVP:RAP          195 non-null    float64
 6   MDVP:PPQ          195 non-null    float64
 7   Jitter:DDP        195 non-null    float64
 8   MDVP:Shimmer      195 non-null    float64
 9   MDVP:Shimmer(dB)  195 non-null    float64
 10  Shimmer:APQ3      195 non-null    float64
 11  Shimmer:APQ5      195 non-null    float64
 12  MDVP:APQ          195 non-null    float64
 13  Shimmer:DDA       195 non-null    float64
 14  NHR               195 non-null    float64
 15  HNR               195 non-null    float64
 16  RPDE              195 non-null    float64
 1

In [59]:
parkinson_df.isnull().sum() # Calculate the number of missing values (NaN) in each column of parkinson_df

MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
status              0
dtype: int64

In [60]:
parkinson_df.describe # Generate descriptive statistics of the numerical columns in parkinson_df

<bound method NDFrame.describe of      MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0        119.992       157.302        74.997         0.00784   
1        122.400       148.650       113.819         0.00968   
2        116.682       131.111       111.555         0.01050   
3        116.676       137.871       111.366         0.00997   
4        116.014       141.781       110.655         0.01284   
..           ...           ...           ...             ...   
190      174.188       230.978        94.261         0.00459   
191      209.516       253.017        89.488         0.00564   
192      174.688       240.005        74.287         0.01360   
193      198.764       396.961        74.904         0.00740   
194      214.289       260.277        77.973         0.00567   

     MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
0             0.00007   0.00370   0.00554     0.01109       0.04374   
1             0.00008   0.00465   0.00696     0.01394  

In [61]:
parkinson_df['status'].value_counts() # Count the number of occurrences of each unique value in the 'status' column of parkinson_df


status
1    147
0     48
Name: count, dtype: int64

In [62]:
parkinson_df.groupby('status').mean() # Calculate the mean values of numerical columns grouped by the 'status' column


Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


In [63]:
X = parkinson_df.drop(columns=['status'], axis=1) # Extract features (independent variables) by dropping the 'status' column
Y = parkinson_df['status'] # Extract target variable (dependent variable) 'status'

In [64]:
print(X)  # Display the DataFrame containing features (independent variables) X
print(Y) # Display the Series containing the target variable (dependent variable) Y

     MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0        119.992       157.302        74.997         0.00784   
1        122.400       148.650       113.819         0.00968   
2        116.682       131.111       111.555         0.01050   
3        116.676       137.871       111.366         0.00997   
4        116.014       141.781       110.655         0.01284   
..           ...           ...           ...             ...   
190      174.188       230.978        94.261         0.00459   
191      209.516       253.017        89.488         0.00564   
192      174.688       240.005        74.287         0.01360   
193      198.764       396.961        74.904         0.00740   
194      214.289       260.277        77.973         0.00567   

     MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
0             0.00007   0.00370   0.00554     0.01109       0.04374   
1             0.00008   0.00465   0.00696     0.01394       0.06134   
2             0.00

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
random_state=2) # Split X (features) and Y (target) into train and test sets
print(X.shape, X_train.shape, X_test.shape) # Print the shapes of X (original), X_train, and X_test

(195, 22) (156, 22) (39, 22)


In [66]:
model = svm.SVC(kernel='linear') # Create a Support Vector Machine (SVM) classifier with a linear kernel


In [67]:
model.fit(X_train, Y_train) # Train (fit) the SVM classifier `model` using the training data X_train and corresponding labels Y_train

In [68]:
X_train_prediction = model.predict(X_train) # Use the trained SVM classifier `model` to predict labels for the training data X_train
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)  # Calculate the accuracy of the training data predictions

In [69]:
print('Accuracy score of training data : ', training_data_accuracy)# Print the accuracy score of the training data predictions

Accuracy score of training data :  0.8717948717948718


In [70]:
X_test_prediction = model.predict(X_test) # Use the trained SVM classifier `model` to predict labels for the testing data X_test
test_data_accuracy = accuracy_score(Y_test, X_test_prediction) # Calculate the accuracy of the testing data predictions


In [71]:
print('Accuracy score of test data : ', test_data_accuracy) # Print the accuracy score of the testing data predictions


Accuracy score of test data :  0.8717948717948718


In [72]:
input_data = (
    197.07600, 206.89600, 192.05500, 0.00289, 0.00001, 0.00166, 0.00168, 
    0.00498, 0.01098, 0.09700, 0.00563, 0.00680, 0.00802, 0.01689, 0.00339, 
    26.77500, 0.422229, 0.741367, -7.348300, 0.177551, 1.743867, 0.085569
)# Define input data as a tuple of feature values for prediction



In [73]:
# changing input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

In [74]:
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1) # Reshape input data into a 2D array (row) for prediction

In [75]:
prediction = model.predict(input_data_reshaped) # Use the trained model to predict the label for the input data
print(prediction) # Print the predicted label


[0]


In [76]:
result = "The Person does not have Parkinson's Disease" if prediction[0] == 0 else "The Person has Parkinson's" # Determine the result based on the predicted label
print(result) # Print the result


The Person does not have Parkinson's Disease


In [77]:

import pickle # Import the pickle module for saving/loading the trained model

In [78]:
filename = 'parkinsons_model.sav'
pickle.dump(model, open(filename, 'wb')) # Save the trained model to a file using pickle

In [79]:
loaded_model = pickle.load(open('parkinsons_model.sav', 'rb')) # Load the saved model from the file using pickle


In [80]:
for column in X.columns:
    print(column) # Print each column name in the DataFrame X



MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE
