In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [35]:
# Load the dataset from the Excel workbook 'copd-dataset.xlsx'.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_excel('copd-dataset.xlsx')

In [36]:
# Remove the 'Sr. No.' and 'Name' columns; every other column is kept in `data`
data = df.drop(columns=['Sr. No.', 'Name'])

In [37]:
# Count the missing values in each column
data.isna().sum()

SHORTNESS OF BREATH          0
EXPECTORATION                0
DIABETES                     0
TYPE OF SMOKER               0
CIGARETTE/BIDI/GANJA         0
ALCOHOL USE                  0
mMRC GRADE                   0
(FEV1 PRE BD) %PRED          0
(FEV1/FVC POST BD ) L/SEC    0
Gold Grade                   0
dtype: int64

In [38]:
# Feature columns holding text categories (encoded to integers further down)
categorical_features = [
    'SHORTNESS OF BREATH',
    'EXPECTORATION',
    'DIABETES',
    'TYPE OF SMOKER',
    'CIGARETTE/BIDI/GANJA',
    'ALCOHOL USE',
]

# Feature columns that already hold numeric values
numerical_features = [
    'mMRC GRADE',
    '(FEV1 PRE BD) %PRED',
    '(FEV1/FVC POST BD ) L/SEC',
]

In [39]:
# Integer code assigned to each text category, keyed by column name
categorical_mapping = {
    'SHORTNESS OF BREATH': {
        'NO': 0,
        'YES': 1,
    },
    'EXPECTORATION': {
        'NO': 0,
        'YES': 1,
    },
    'DIABETES': {
        'NO': 0,
        'YES': 1,
    },
    'TYPE OF SMOKER': {
        'NON SMOKER': 0,
        'EX SMOKER': 1,
        'CURRENT SMOKER': 2,
        'PASSIVE SMOKER': 3,
    },
    'CIGARETTE/BIDI/GANJA': {
        'CIGARETTE': 0,
        'BIDI': 1,
        'GANJA': 2,
        'BIOMASS FUEL': 3,
        'NONE': 4,
    },
    'ALCOHOL USE': {
        'NO': 0,
        'YES': 1,
    },
}

In [40]:
def map_categorical_values(data, mapping_dict):
    """Return a copy of ``data`` with categorical values replaced by their codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``mapping_dict``. It is not modified.
    mapping_dict : dict
        ``{column name: {original value: replacement}}``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each listed column has had its values
        substituted via ``Series.replace``. Values without an entry in the
        column's mapping are left as they are.
    """
    result = data.copy()
    for col_name in mapping_dict:
        value_codes = mapping_dict[col_name]
        result[col_name] = result[col_name].replace(value_codes)
    return result

In [41]:
# Replace the text categories in `data` with their integer codes
data = map_categorical_values(data=data, mapping_dict=categorical_mapping)

In [42]:
# data

Unnamed: 0,SHORTNESS OF BREATH,EXPECTORATION,DIABETES,TYPE OF SMOKER,CIGARETTE/BIDI/GANJA,ALCOHOL USE,mMRC GRADE,(FEV1 PRE BD) %PRED,(FEV1/FVC POST BD ) L/SEC,Gold Grade
0,1,1,0,1,1,0,3,27.00,55.00,3 SEVERE
1,1,1,1,1,1,1,3,54.00,61.58,3 SEVERE
2,1,1,0,1,1,1,2,56.00,61.80,2 MODERATE
3,1,1,0,1,1,0,2,1.23,61.00,2 MODERATE
4,1,1,1,1,3,0,3,26.00,49.00,4 VERY SEVERE
...,...,...,...,...,...,...,...,...,...,...
95,1,1,0,0,4,0,3,39.00,44.00,3 SEVERE
96,1,1,0,2,1,0,3,15.00,37.00,3 SEVERE
97,1,1,1,1,1,0,3,53.00,61.00,2 MODERATE
98,1,1,0,2,1,0,4,38.00,49.00,2 MODERATE


In [45]:
# Features are every column except the target; the target is the 'Gold Grade' column
X = data.drop(columns=['Gold Grade'])
y = data['Gold Grade']

In [46]:
# Map the text target labels to integer grades
target_mapping = {
    '1 MILD': 1,
    '2 MODERATE': 2,
    '3 SEVERE': 3,
    '4 VERY SEVERE': 4
}
y_encoded = y.map(target_mapping)

# Series.map turns every label that is missing from target_mapping into NaN.
# Stop here and name the offending labels rather than handing NaN targets
# to the classifiers further down.
unmapped_labels = y[y_encoded.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped 'Gold Grade' labels: {list(unmapped_labels)}")

y_encoded

0     3
1     3
2     2
3     2
4     4
     ..
95    3
96    3
97    2
98    2
99    2
Name: Gold Grade, Length: 100, dtype: int64

In [47]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [48]:
# X_test

Unnamed: 0,SHORTNESS OF BREATH,EXPECTORATION,DIABETES,TYPE OF SMOKER,CIGARETTE/BIDI/GANJA,ALCOHOL USE,mMRC GRADE,(FEV1 PRE BD) %PRED,(FEV1/FVC POST BD ) L/SEC
83,1,1,0,2,1,0,2,57.0,59.41
53,1,1,0,2,1,0,2,48.0,54.2
70,1,1,0,2,1,0,2,48.0,53.52
45,1,1,0,1,1,0,3,23.0,34.0
44,1,1,0,1,1,0,4,29.0,34.0
39,1,1,1,0,4,0,4,24.0,46.0
22,1,1,0,1,2,0,2,23.0,46.0
80,1,0,0,1,1,0,1,67.0,60.8
10,1,1,0,2,1,0,2,18.0,56.0
0,1,1,0,1,1,0,3,27.0,55.0


In [49]:
# Learn the scaling statistics from the training features only, then apply
# that same transformation to the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Approach 1 - SVM

In [50]:
# Initialize and train an SVM model:
# a linear-kernel SVC with C=1.0, fitted on the scaled training features.
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_scaled, y_train)

In [51]:
# Predict the grade of every row in the scaled test set with the SVM
y_pred_level = svm_model.predict(X_test_scaled)

In [52]:
# Show the SVM's predicted grades for the test rows
y_pred_level

array([2, 2, 2, 4, 4, 3, 3, 2, 3, 3, 4, 2, 3, 4, 2, 3, 3, 4, 2, 3],
      dtype=int64)

In [53]:
# Print classification report and accuracy score for the SVM.
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value the default 'warn' setting produces) without emitting the
# undefined-metric warning for it.
print("Level Classification Report:")
print(classification_report(y_test, y_pred_level, zero_division=0))
print("Level Accuracy:", accuracy_score(y_test, y_pred_level))

Level Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.57      0.80      0.67         5
           3       0.75      0.67      0.71         9
           4       0.80      0.80      0.80         5

    accuracy                           0.70        20
   macro avg       0.53      0.57      0.54        20
weighted avg       0.68      0.70      0.68        20

Level Accuracy: 0.7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Approach 2 - Random Forests

In [54]:
# Initialize and train a Random Forest Classifier:
# 100 trees seeded with random_state=43, fitted on the same scaled training
# features that the SVM was trained on.
rf_model = RandomForestClassifier(n_estimators=100, random_state=43)
rf_model.fit(X_train_scaled, y_train)

In [55]:
# Predict the grade of every row in the scaled test set with the Random Forest
y_pred_rf = rf_model.predict(X_test_scaled)

In [56]:
# Print Random Forest classification report and accuracy score.
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value the default 'warn' setting produces) without emitting the
# undefined-metric warning for it.
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=0))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.50      0.80      0.62         5
           3       0.56      0.56      0.56         9
           4       0.67      0.40      0.50         5

    accuracy                           0.55        20
   macro avg       0.43      0.44      0.42        20
weighted avg       0.54      0.55      0.53        20

Random Forest Accuracy: 0.55


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Extract Model

In [57]:
import pickle

# Persist the trained SVM to 'svm_classifier.pkl'.
# The context manager closes the file even if pickle.dump raises.
with open('svm_classifier.pkl', "wb") as pickle_out:
    pickle.dump(svm_model, pickle_out)

In [58]:
# Reload the pickled SVM from 'svm_classifier.pkl', rebinding `svm_model` to
# the deserialized object.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('svm_classifier.pkl', 'rb') as svm_file:
    svm_model = pickle.load(svm_file)

In [59]:
# One raw (un-encoded) sample, listed in the column order used by array_to_dataframe
input_array = ['YES', 'YES', 'YES', 'CURRENT SMOKER', 'BIDI', 'YES', 2, 7.0, 3.0]

def array_to_dataframe(input_array):
    """Wrap one sample's feature values in a single-row DataFrame.

    Parameters
    ----------
    input_array : iterable
        Exactly nine values, ordered as: 'SHORTNESS OF BREATH', 'EXPECTORATION',
        'DIABETES', 'TYPE OF SMOKER', 'CIGARETTE/BIDI/GANJA', 'ALCOHOL USE',
        'mMRC GRADE', '(FEV1 PRE BD) %PRED', '(FEV1/FVC POST BD ) L/SEC'.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the nine feature columns in the order above.

    Raises
    ------
    ValueError
        If ``input_array`` does not contain exactly nine values. Without this
        check ``zip`` would silently drop the surplus names or values and
        return a frame with the wrong columns.
    """
    feature_names = ['SHORTNESS OF BREATH', 'EXPECTORATION', 'DIABETES', 'TYPE OF SMOKER',
                     'CIGARETTE/BIDI/GANJA', 'ALCOHOL USE', 'mMRC GRADE',
                     '(FEV1 PRE BD) %PRED', '(FEV1/FVC POST BD ) L/SEC']
    # Materialize first so any iterable (not only sized sequences) is accepted
    values = list(input_array)
    if len(values) != len(feature_names):
        raise ValueError(
            f"Expected {len(feature_names)} feature values, got {len(values)}"
        )
    data_dict = dict(zip(feature_names, values))
    data_df = pd.DataFrame(data_dict, index=[0])
    return data_df

input_df = array_to_dataframe(input_array)

In [60]:
# Encode the sample's text categories with the mapping that was applied to `data`
input_df = map_categorical_values(data=input_df, mapping_dict=categorical_mapping)

In [61]:
# Show the single-row input after categorical encoding
input_df

Unnamed: 0,SHORTNESS OF BREATH,EXPECTORATION,DIABETES,TYPE OF SMOKER,CIGARETTE/BIDI/GANJA,ALCOHOL USE,mMRC GRADE,(FEV1 PRE BD) %PRED,(FEV1/FVC POST BD ) L/SEC
0,1,1,1,2,1,1,2,7.0,3.0


In [64]:
# Scale the encoded input with the scaler fitted on the training features,
# then predict its grade with `svm_model`.
input_df_scaled = scaler.transform(input_df)
predicted_grade = svm_model.predict(input_df_scaled)

print("Predicted Gold Grade:", predicted_grade)

Predicted Gold Grade: [4]


In [65]:
# Feature columns holding categories (same lists as defined near the top of the notebook)
categorical_features = [
    'SHORTNESS OF BREATH',
    'EXPECTORATION',
    'DIABETES',
    'TYPE OF SMOKER',
    'CIGARETTE/BIDI/GANJA',
    'ALCOHOL USE',
]

# Feature columns holding numeric values
numerical_features = [
    'mMRC GRADE',
    '(FEV1 PRE BD) %PRED',
    '(FEV1/FVC POST BD ) L/SEC',
]


In [67]:
# Save the fitted scaler to 'scaler.pkl' so the same scaling can be reapplied
# to new inputs without refitting.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Encode the raw input with the categorical mapping that was applied to the
# training data. This notebook never creates LabelEncoder objects
# (`label_encoders` is undefined), so the mapping dictionary is the encoder to reuse.
input_df = map_categorical_values(array_to_dataframe(input_array), categorical_mapping)

In [None]:
# Flatten the single-row input DataFrame into a plain list so it can be inspected
# (the name was previously displayed without ever being assigned)
input_array_encoded = input_df.iloc[0].tolist()
input_array_encoded

In [None]:
# Standardize the input with the scaler that was already fitted on the training data.
# Creating a new StandardScaler and fitting it on this single row would centre
# every feature to 0 and overwrite the fitted `scaler`.
input_array_scaled = scaler.transform(input_df)

In [None]:
# Make predictions: predict the grade of the scaled input with `svm_model`
predicted_grade = svm_model.predict(input_array_scaled)

# Print the predicted grade
print("Predicted Gold Grade:", predicted_grade)