In [1]:
# Step 1: Data Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset
data = pd.read_csv("/content/drive/MyDrive/cancer patient data sets.csv")
data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [4]:
numeric_data = data.drop(['index', 'Patient Id', 'Level'], axis=1)

# Calculate minimum and maximum values for each column
min_values = numeric_data.min()
max_values = numeric_data.max()

# Display min and max values for each column
print("Minimum values:")
print(min_values)
print("\nMaximum values:")
print(max_values)

Minimum values:
Age                         14
Gender                       1
Air Pollution                1
Alcohol use                  1
Dust Allergy                 1
OccuPational Hazards         1
Genetic Risk                 1
chronic Lung Disease         1
Balanced Diet                1
Obesity                      1
Smoking                      1
Passive Smoker               1
Chest Pain                   1
Coughing of Blood            1
Fatigue                      1
Weight Loss                  1
Shortness of Breath          1
Wheezing                     1
Swallowing Difficulty        1
Clubbing of Finger Nails     1
Frequent Cold                1
Dry Cough                    1
Snoring                      1
dtype: int64

Maximum values:
Age                         73
Gender                       2
Air Pollution                8
Alcohol use                  8
Dust Allergy                 8
OccuPational Hazards         8
Genetic Risk                 7
chronic Lung Disease    

In [5]:
data.isna()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
data.fillna(method='ffill', inplace=True)
data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [7]:
label_encoder = LabelEncoder()
data['Level'] = label_encoder.fit_transform(data['Level'])
data['Level'].head()
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(class_mapping)

{'High': 0, 'Low': 1, 'Medium': 2}


In [8]:
X = data.drop(['index', 'Patient Id', 'Level'], axis=1)
y = data['Level']

In [9]:
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [10]:
# Initialize and train KNN model
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [11]:
y_pred = knn.predict(X_test)



In [12]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.995

Confusion Matrix:
[[82  0  0]
 [ 0 54  1]
 [ 0  0 63]]


In [14]:
# Sample new data
new_data_point = [[30, 1, 3, 2, 3, 4, 2, 3, 2, 3, 2, 1, 3, 2, 4, 2, 3, 2, 3, 1, 2, 3, 4]]
predicted_level = knn.predict(new_data_point)
predicted_level_category = label_encoder.inverse_transform(predicted_level)

print("Predicted health risk level for the new data point:", predicted_level_category[0])


Predicted health risk level for the new data point: Low




In [15]:
# Step 3: Save Model
import joblib
joblib.dump(knn, 'knn_model.pkl')

['knn_model.pkl']

In [17]:
import joblib
loaded_model = joblib.load('knn_model.pkl')
new_data_point = [[30, 1, 3, 2, 3, 4, 2, 3, 2, 3, 2, 1, 3, 2, 4, 2, 3, 2, 3, 1, 2, 3, 4]]

# Use the loaded model to make predictions
predicted_level = loaded_model.predict(new_data_point)

# Convert the predicted encoded level back to its original category
predicted_level_category = label_encoder.inverse_transform(predicted_level)

print("Predicted health risk level for the new data point:", predicted_level_category[0])


Predicted health risk level for the new data point: Low


