In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
!pip install --upgrade gspread



In [None]:
# Everything needed for Google sign-in and Sheets access.
import gspread
from google.auth import default
from google.colab import auth

# Interactive step: this prompts the user to authenticate.
auth.authenticate_user()

# Pick up the credentials and hand them to gspread.
creds, _ = default()
gc = gspread.authorize(creds)

# Replace with your spreadsheet and sheet names
spreadsheet = gc.open('cancer patient data sets')
worksheet = spreadsheet.worksheet('Sheet')

# Raw cell values of the worksheet, one entry per sheet row.
data = worksheet.get_all_values()

In [None]:
# Build a DataFrame directly from the raw sheet values. No column names are passed,
# so the columns are numbered 0, 1, 2, ... and the sheet's header row ends up as
# data row 0 (visible in the printed output).
df = pd.DataFrame.from_records(data)
print(df)

              0    1       2              3            4             5   \
0     Patient Id  Age  Gender  Air Pollution  Alcohol use  Dust Allergy   
1             P1   33       1              2            4             5   
2            P10   17       1              3            1             5   
3           P100   35       1              4            5             6   
4          P1000   37       1              7            7             7   
...          ...  ...     ...            ...          ...           ...   
996         P995   44       1              6            7             7   
997         P996   37       2              6            8             7   
998         P997   25       2              4            5             6   
999         P998   18       2              6            8             7   
1000        P999   47       1              6            5             6   

                        6             7                     8              9   \
0     OccuPational

In [None]:
# Preview the first rows; at this point the header labels still sit in row 0.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
1,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
2,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
3,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
4,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High


**DATA PREPROCESSING**

In [None]:
# `data` is the list of rows fetched from the Google Sheet: the first row carries
# the column labels, every row after it is a record.
header_row, remaining_data = data[0], data[1:]

# Rebuild the DataFrame so the labels become the column names.
df = pd.DataFrame(data=remaining_data, columns=header_row)

In [None]:
# Preview again to confirm the sheet's header row now provides the column names.
df.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [None]:
# Check column dtypes and non-null counts before any type conversion is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   object
 2   Gender                    1000 non-null   object
 3   Air Pollution             1000 non-null   object
 4   Alcohol use               1000 non-null   object
 5   Dust Allergy              1000 non-null   object
 6   OccuPational Hazards      1000 non-null   object
 7   Genetic Risk              1000 non-null   object
 8   chronic Lung Disease      1000 non-null   object
 9   Balanced Diet             1000 non-null   object
 10  Obesity                   1000 non-null   object
 11  Smoking                   1000 non-null   object
 12  Passive Smoker            1000 non-null   object
 13  Chest Pain                1000 non-null   object
 14  Coughing of Blood        

In [None]:
# Clean the Age strings ahead of numeric conversion: drop leading/trailing whitespace.
df['Age'] = df['Age'].str.strip() #Remove leading/trailing spaces

In [None]:
# Remove any commas embedded in the Age strings; regex=False treats ',' as a literal character.
df['Age'] = df['Age'].str.replace(',', '', regex=False)  # Replace commas with empty string

In [None]:
# Parse Age as numbers. errors='coerce' turns unparseable entries into missing values,
# and the nullable 'Int64' dtype can store integers alongside such missing values.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce').astype('Int64')

In [None]:
# Same conversion for Gender: unparseable entries become missing, dtype becomes nullable 'Int64'.
df['Gender'] = pd.to_numeric(df['Gender'], errors='coerce').astype('Int64')

In [None]:
# Confirm that Age and Gender now report the Int64 dtype while the other columns are unchanged.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   Int64 
 2   Gender                    1000 non-null   Int64 
 3   Air Pollution             1000 non-null   object
 4   Alcohol use               1000 non-null   object
 5   Dust Allergy              1000 non-null   object
 6   OccuPational Hazards      1000 non-null   object
 7   Genetic Risk              1000 non-null   object
 8   chronic Lung Disease      1000 non-null   object
 9   Balanced Diet             1000 non-null   object
 10  Obesity                   1000 non-null   object
 11  Smoking                   1000 non-null   object
 12  Passive Smoker            1000 non-null   object
 13  Chest Pain                1000 non-null   object
 14  Coughing of Blood        

In [None]:
# Spot-check the first rows after converting Age and Gender.
df.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


**CHANGING DATATYPES**

In [None]:
# Columns at positions 1 through 23 are the ones to turn into integers.
columns_to_convert = df.columns[1:24]

# Convert each selected column (unparseable values become missing) and store it
# back as nullable Int64.
converted = {
    name: pd.to_numeric(df[name], errors='coerce').astype('Int64')
    for name in columns_to_convert
}
for name, values in converted.items():
    df[name] = values

# After this cell, columns 1 to 23 should have a dtype of 'Int64'

In [None]:
# Verify the dtypes after the bulk conversion of columns 1 to 23.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   Int64 
 2   Gender                    1000 non-null   Int64 
 3   Air Pollution             1000 non-null   Int64 
 4   Alcohol use               1000 non-null   Int64 
 5   Dust Allergy              1000 non-null   Int64 
 6   OccuPational Hazards      1000 non-null   Int64 
 7   Genetic Risk              1000 non-null   Int64 
 8   chronic Lung Disease      1000 non-null   Int64 
 9   Balanced Diet             1000 non-null   Int64 
 10  Obesity                   1000 non-null   Int64 
 11  Smoking                   1000 non-null   Int64 
 12  Passive Smoker            1000 non-null   Int64 
 13  Chest Pain                1000 non-null   Int64 
 14  Coughing of Blood        

**MISSING VALUES**


In [None]:
# Count missing values per column. The numeric conversions use errors='coerce',
# so any value that failed to parse would show up here as a null.
df.isnull().sum()

Unnamed: 0,0
Patient Id,0
Age,0
Gender,0
Air Pollution,0
Alcohol use,0
Dust Allergy,0
OccuPational Hazards,0
Genetic Risk,0
chronic Lung Disease,0
Balanced Diet,0


**Drop non-essential columns**

In [None]:
# Drop the 'Patient Id' column so it is not part of the model inputs.
# errors='ignore' makes this a no-op when the column is already gone, so the cell can be re-run.
df = df.drop(columns=['Patient Id'], errors='ignore')

In [None]:
# Confirm that 'Patient Id' is no longer among the columns.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


**ENCODING THE TARGET VARIABLE**

In [None]:
# Encode the text labels in 'Level' as integers. LabelEncoder numbers the classes in
# sorted order, which for this data gives High -> 0, Low -> 1, Medium -> 2.
# NOTE: 'Level' is overwritten in place, so re-running this cell would refit the
# encoder on the integer codes instead of the original text labels.
label_encoder = LabelEncoder()
df['Level'] = label_encoder.fit_transform(df['Level'])

In [None]:
# Check that 'Level' now holds integer codes instead of text labels.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,0
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,0


**SPLITTING THE DATA**

In [None]:
# Separate the encoded target column from the predictor columns.
y = df['Level']
x = df.drop('Level', axis=1)

# Show both shapes as a sanity check.
x.shape, y.shape

((1000, 23), (1000,))

**SCALING THE FEATURES**

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of `x`, i.e. before any train/test
# split, so rows that end up in the test set contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

In [None]:
# Note: `df` itself is not modified by scaling; the scaled values live in `X_scaled`.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,0
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,0


**SPLITTING THE DATA INTO TRAIN AND TEST**

In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training rows
# only, so the held-out test rows do not influence the scaling statistics
# (avoids data leakage). random_state is fixed to keep the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Refit `scaler` on the training split and apply the same transform to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

**IMPLEMENTING KNN CLASSIFIER**

In [None]:
# Create a k-nearest-neighbours classifier that uses the 5 nearest training samples,
# then fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

**MAKING PREDICTIONS**

In [None]:
# Predict the encoded class for every row of the held-out test split.
y_pred = knn.predict(X_test)

**EVALUATING THE MODEL**

In [None]:
from sklearn.metrics import f1_score

# Accuracy is the fraction of test rows predicted correctly. It is computed with
# accuracy_score so that the value printed as "Accuracy" really is accuracy.
accuracy = accuracy_score(y_test, y_pred)

# The weighted F1-score averages the per-class F1-scores, weighting each class by
# its number of test rows; it is reported under its own label.
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

# target_names follows label_encoder.classes_, i.e. the order of the encoded labels.
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy:.2f}")
print(f"Weighted F1-score: {weighted_f1:.2f}")
print("Classification Report:")
print(report)

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        82
         Low       1.00      1.00      1.00        55
      Medium       1.00      1.00      1.00        63

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



**EXPLANATION**

**Approach:**

1. **Data Loading and Preprocessing**: The code starts by loading a dataset from a Google Sheet. It then cleans and preprocesses the data: it converts the columns to numeric data types, checks for missing values, drops the non-essential `Patient Id` column, and encodes the target variable using Label Encoding.

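2. **Feature Scaling:** It applies StandardScaler to scale the features so that each has zero mean and unit variance. Because KNN classifies a sample by its distance to other samples, scaling keeps features with larger numeric ranges (such as Age) from dominating the distance calculation.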

3. **Data Splitting:** The dataset is split into training and testing sets using train_test_split. This is done to evaluate the model's performance on unseen data.

4. **Model Training:** A K-Nearest Neighbors (KNN) classifier is used for the classification task. The model is trained using the training data.

5. **Model Evaluation:** After training, the model's performance is evaluated using metrics such as accuracy and classification report. These metrics provide insights into how well the model is performing on the test data.




**Results:**

The code evaluates the model using the F1-score (weighted average) and generates a classification report.

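1. **Overall score:** The weighted-average F1-score summarizes the model's performance across all classes in a single number. It balances precision and recall, so it is not the same as plain accuracy, although the two coincide when every prediction is correct. To see the score, execute the code.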

2. **Classification Report:** The classification report provides detailed information about the model's performance for each class, including precision, recall, and F1-score. To see the classification report, execute the code.

In essence, the code implements a machine learning pipeline for classification using the KNN algorithm. It preprocesses the data, trains a model, and evaluates its performance to provide insights into its effectiveness in predicting the target variable.

**Disease Risk**

In [None]:
import pandas as pd
import numpy as np

def predict_disease_risk_for_features(patient_data, feature_names):
    """Predict a risk level after substituting one patient feature at a time.

    For each requested feature, a copy of the first row of the notebook-level
    feature frame ``x`` serves as the baseline. That single feature is overwritten
    with the patient's value, and the row is scaled with ``scaler`` and classified
    with ``knn``. Each prediction therefore describes the baseline row with one
    value changed, not the patient's complete profile.

    Relies on the notebook globals ``x``, ``scaler``, ``knn`` and ``label_encoder``.

    Args:
        patient_data: A dictionary or pandas Series containing patient data.
        feature_names: A list of feature names to predict risk for.

    Returns:
        A dictionary mapping each processed feature name to ``"<Level> Risk"``,
        where ``<Level>`` is the class label looked up in
        ``label_encoder.classes_``. Features that are missing from
        ``patient_data`` or that are not columns of ``x`` are reported with a
        printed message and left out of the result.
    """
    risk_predictions = {}
    for feature_name in feature_names:
        # Keep the try block narrow: only a missing patient value counts as "not found".
        try:
            feature_value = patient_data[feature_name]
        except KeyError:
            print(f"Feature '{feature_name}' not found in patient data.")
            continue

        # Assigning an unknown name would add an extra column and make the scaler
        # reject the sample, so report it instead.
        if feature_name not in x.columns:
            print(f"Feature '{feature_name}' is not one of the model's feature columns.")
            continue

        # Use a one-row DataFrame (not a bare array) so the column names and order
        # match what the scaler was fitted with.
        sample_row = x.iloc[[0]].copy()
        sample_row[feature_name] = feature_value

        prediction = knn.predict(scaler.transform(sample_row))[0]

        # LabelEncoder numbers the classes in sorted order (High=0, Low=1, Medium=2),
        # so the label is read from the fitted encoder instead of a hand-written
        # code-to-name table.
        risk_predictions[feature_name] = f"{label_encoder.classes_[prediction]} Risk"

    return risk_predictions

# Example usage: a hypothetical patient record. Replace the values with your patient's data.
patient_data = {'Age': 35, 'Alcohol use': 2, 'Dust Allergy': 1, 'Genetic Risk': 0, 'Balanced Diet': 1}

# Evaluate every feature supplied for the patient, in the order listed above.
feature_names = list(patient_data)

risk_predictions = predict_disease_risk_for_features(patient_data, feature_names)
print(risk_predictions)

{'Age': 'Medium Risk', 'Alcohol use': 'Medium Risk', 'Dust Allergy': 'Medium Risk', 'Genetic Risk': 'Medium Risk', 'Balanced Diet': 'Medium Risk'}


