In [None]:
import gspread
from google.auth import default
from google.colab import auth

# Sign the Colab user in so that application-default credentials are available.
auth.authenticate_user()

# default() returns a tuple; only its first element (the credentials) is used.
creds, _ = default()
gc = gspread.authorize(creds)

# Open the spreadsheet named 'gpt-4' and select its tab named 'Sheet'.
# Change these two strings to point at a different spreadsheet / tab.
worksheet = gc.open('gpt-4').worksheet('Sheet')

# Fetch every cell of the tab as a list of rows; later cells treat data[0]
# as the header row and data[1:] as the records.
data = worksheet.get_all_values()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import make_classification, make_regression


In [None]:
# Build a cleaned DataFrame from the sheet values already held in `data`
# (first row = header, remaining rows = records).
#
# The sheet is not fetched a second time here: `data` is reused as-is, which
# avoids a redundant network round-trip. The frame is built in a single chain
# instead of a series of in-place mutations, so re-running the cell always
# starts from `data` and yields the same result.
import pandas as pd

df = (
    pd.DataFrame(data[1:], columns=data[0])
    # Drop the 'Patient Id' column (raises KeyError if it is absent).
    .drop(columns="Patient Id")
    # Clean column names: lower-case them, then turn spaces into underscores.
    .rename(columns=lambda col: col.lower().replace(" ", "_"))
)

# Display data after cleaning
display(df)

Unnamed: 0,age,gender,air_pollution,alcohol_use,dust_allergy,occupational_hazards,genetic_risk,chronic_lung_disease,balanced_diet,obesity,...,fatigue,weight_loss,shortness_of_breath,wheezing,swallowing_difficulty,clubbing_of_finger_nails,frequent_cold,dry_cough,snoring,level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,5,3,2,7,8,2,4,5,3,High
996,37,2,6,8,7,7,7,6,7,7,...,9,6,5,7,2,4,3,1,4,High
997,25,2,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
998,18,2,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [None]:
# Convert every column except 'level' to a numeric dtype.
# 'level' is skipped because it holds the text class labels.
for col in df.columns:
    if col == 'level':
        continue
    try:
        # errors='coerce' replaces values that cannot be parsed with NaN
        # instead of raising, so check the null counts afterwards.
        df[col] = pd.to_numeric(df[col], errors='coerce')
    except (TypeError, ValueError):
        # Only conversion failures are reported; anything else (for example
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        print(f"Could not convert column '{col}' to numeric.")

# Confirm the dtype changes. df.info() prints its report itself and returns
# None, so it is called directly rather than wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   age                       1000 non-null   int64 
 1   gender                    1000 non-null   int64 
 2   air_pollution             1000 non-null   int64 
 3   alcohol_use               1000 non-null   int64 
 4   dust_allergy              1000 non-null   int64 
 5   occupational_hazards      1000 non-null   int64 
 6   genetic_risk              1000 non-null   int64 
 7   chronic_lung_disease      1000 non-null   int64 
 8   balanced_diet             1000 non-null   int64 
 9   obesity                   1000 non-null   int64 
 10  smoking                   1000 non-null   int64 
 11  passive_smoker            1000 non-null   int64 
 12  chest_pain                1000 non-null   int64 
 13  coughing_of_blood         1000 non-null   int64 
 14  fatigue                  

In [None]:
#data processing

In [None]:
# Spare copy: rebuild `df` from the raw sheet values held in `data`,
# keeping the original (uncleaned) column names.
# data[0] is the header row, data[1:] are the record rows.
header_row, remaining_data = data[0], data[1:]

# One DataFrame row per record, labelled with the sheet's header names.
df = pd.DataFrame(data=remaining_data, columns=header_row)

In [None]:
# Preview the first rows of the rebuilt frame (original column names, all values still strings).
df.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [None]:
# Trim leading/trailing whitespace from the 'Age' strings.
df = df.assign(Age=df['Age'].str.strip())

In [None]:
# Remove literal commas from the 'Age' strings (regex=False: plain-text match).
df = df.assign(Age=df['Age'].str.replace(',', '', regex=False))

In [None]:
# Parse 'Age' as numbers (unparseable values become NaN) and store it as nullable Int64.
df = df.assign(Age=pd.to_numeric(df['Age'], errors='coerce').astype('Int64'))

In [None]:
# Parse 'Gender' as numbers (unparseable values become NaN) and store it as nullable Int64.
df = df.assign(Gender=pd.to_numeric(df['Gender'], errors='coerce').astype('Int64'))

In [None]:
# Inspect dtypes and non-null counts now that 'Age' and 'Gender' have been converted to Int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   Int64 
 2   Gender                    1000 non-null   Int64 
 3   Air Pollution             1000 non-null   object
 4   Alcohol use               1000 non-null   object
 5   Dust Allergy              1000 non-null   object
 6   OccuPational Hazards      1000 non-null   object
 7   Genetic Risk              1000 non-null   object
 8   chronic Lung Disease      1000 non-null   object
 9   Balanced Diet             1000 non-null   object
 10  Obesity                   1000 non-null   object
 11  Smoking                   1000 non-null   object
 12  Passive Smoker            1000 non-null   object
 13  Chest Pain                1000 non-null   object
 14  Coughing of Blood        

In [None]:
# Preview the first rows again after the 'Age' / 'Gender' conversions.
df.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [None]:
#changing datatype

In [None]:
# Convert every feature column to the nullable integer dtype 'Int64'.
#
# The columns are selected by name (everything except the identifier and the
# target) rather than by position. A positional slice such as columns[1:24]
# only hits the right columns while 'Patient Id' is still the first column;
# run after that column has been dropped, it would skip 'Age' and coerce the
# text labels in 'Level' to missing values.
columns_to_convert = [
    col for col in df.columns if col not in ('Patient Id', 'Level')
]

for column in columns_to_convert:
    # errors='coerce' turns unparseable cells into NaN, which 'Int64' keeps as <NA>.
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')

# Every column except 'Patient Id' and 'Level' should now have dtype 'Int64'

In [None]:
# Confirm the dtype of each column after the bulk numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   Int64 
 2   Gender                    1000 non-null   Int64 
 3   Air Pollution             1000 non-null   Int64 
 4   Alcohol use               1000 non-null   Int64 
 5   Dust Allergy              1000 non-null   Int64 
 6   OccuPational Hazards      1000 non-null   Int64 
 7   Genetic Risk              1000 non-null   Int64 
 8   chronic Lung Disease      1000 non-null   Int64 
 9   Balanced Diet             1000 non-null   Int64 
 10  Obesity                   1000 non-null   Int64 
 11  Smoking                   1000 non-null   Int64 
 12  Passive Smoker            1000 non-null   Int64 
 13  Chest Pain                1000 non-null   Int64 
 14  Coughing of Blood        

In [None]:
#missing values

In [None]:
# Count missing values per column; any cell that pd.to_numeric(errors='coerce') could not parse shows up here.
df.isnull().sum()

Unnamed: 0,0
Patient Id,0
Age,0
Gender,0
Air Pollution,0
Alcohol use,0
Dust Allergy,0
OccuPational Hazards,0
Genetic Risk,0
chronic Lung Disease,0
Balanced Diet,0


In [None]:
# Dropping non-essential columns

In [None]:
# Remove the 'Patient Id' column; errors='ignore' makes this a no-op if the
# column is already gone, so the cell can be re-run safely.
df = df.drop('Patient Id', axis='columns', errors='ignore')

In [None]:
# Preview the frame after dropping 'Patient Id'.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [None]:
#encoding target variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct 'Level' labels, then replace the text labels with their
# integer codes. Comparing the previews before and after this cell shows the
# mapping High -> 0, Low -> 1, Medium -> 2.
# `label_encoder` is kept so the codes can be mapped back to labels later.
label_encoder = LabelEncoder().fit(df['Level'])
df['Level'] = label_encoder.transform(df['Level'])


In [None]:
# Preview the frame after encoding; 'Level' now holds integer codes instead of text labels.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,0
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,0


In [None]:
#splitting the data

In [None]:
# Target vector: the encoded 'Level' column.
y = df['Level']
# Feature matrix: every remaining column.
x = df.drop('Level', axis=1)
# Show both shapes as the cell output.
(x.shape, y.shape)


((1000, 23), (1000,))

In [None]:
#scaling the features


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix: learn the scaling parameters from `x`,
# then apply them to `x` to obtain `X_scaled`.
# NOTE(review): the scaler is fitted on every row of `x`, i.e. before any
# train/test split, so anything evaluated on X_scaled has seen test-set statistics.
scaler = StandardScaler()
scaler.fit(x)
X_scaled = scaler.transform(x)


In [None]:
# Preview `df` again; it is unchanged by scaling, because the scaled values live in the separate X_scaled array.
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,0
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,0


In [None]:
#SPLITTING THE DATA INTO TRAIN AND TEST

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first (70% train / 30% test). The row
# partition depends only on the number of rows and random_state, so it is the
# same partition as splitting a pre-scaled copy of the data would give.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=55
)

# Fit the scaler on the training rows only and apply it to both parts, so that
# no statistics from the test rows leak into the preprocessing.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Check shapes
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (700, 23)
X_test shape: (300, 23)
y_train shape: (700,)
y_test shape: (300,)


In [None]:
# Classifier and the metric helpers used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the forest and train it on the training split in one step.
# random_state=42 fixes the model's randomness for reproducibility; all other
# hyperparameters keep their defaults (n_estimators=100, max_depth=None).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class codes for the held-out test rows.
y_pred = rf_model.predict(X_test)

# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_test, y_pred)              # share of exact matches
conf_matrix = confusion_matrix(y_test, y_pred)         # rows: actual, columns: predicted
class_report = classification_report(y_test, y_pred)   # precision / recall / F1 per class

# Print the three evaluation results, each under its own heading.
for heading, result in (
    ("Accuracy of the Random Forest Classifier:", accuracy),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
):
    print(heading, result)

Accuracy of the Random Forest Classifier: 1.0

Confusion Matrix:
 [[ 96   0   0]
 [  0  97   0]
 [  0   0 107]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00        97
           2       1.00      1.00      1.00       107

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [None]:
#EXPLANATION

#Approach:

#Data Preprocessing: The dataset is preprocessed by encoding the target variable Level using LabelEncoder(). The features are scaled with StandardScaler() to normalize the data, ensuring better performance for the machine learning model.

#Feature Scaling: It applies StandardScaler to scale the features, ensuring that they have zero mean and unit variance.

#Data Splitting: The dataset is split into training and testing sets using train_test_split, which helps evaluate the model’s performance on unseen data.

#Model Training: A Random Forest Classifier is used for the classification task. The model is trained on the training data (X_train and y_train) and then used to make predictions on the test data (X_test).

#Model Evaluation: After training, the model's performance is evaluated using accuracy, confusion matrix, and a classification report, providing insights into how well the model is classifying the target variable.

#Results:

#Accuracy: The model achieved 1.0 accuracy, meaning it predicted all the test set values correctly, with no errors in classification.

#Confusion Matrix: The matrix shows that all predictions were correctly classified, with no false positives or false negatives. The rows correspond to actual labels, and the columns represent predicted labels, showing perfect alignment.

#Classification Report: The report indicates a precision, recall, and F1-score of 1.00 for all classes (0, 1, and 2), suggesting the model is highly accurate and balanced in predicting each class without bias.

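#In essence, the approach involves training a Random Forest classifier, evaluating its performance with various metrics, and observing that it predicts the target variable with 100% accuracy on this test split. Random Forests can model complex relationships in the data, which is consistent with the strong results across all three classes. A perfect score should still be treated with caution: the data previews show rows whose displayed symptom values are identical (e.g. rows 2 and 997), so near-duplicate records may appear in both the training and the test set and inflate the result.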








In [None]:
#Random Forest Classifier:


#Random Forest is a versatile ensemble machine learning technique that enhances prediction accuracy by combining multiple decision trees.
#Each decision tree is trained on a random subset of the data, which helps the model learn various patterns and relationships within the dataset.
#The individual trees vote on the predicted class (for classification tasks), and the class with the majority of votes becomes the final prediction.
#For regression tasks, the predictions from all trees are averaged to produce a final result.
#This ensemble approach reduces the risk of overfitting, which is common in single decision trees, and improves the model's ability to generalize to unseen data.

#ADVANTAGE

#Random Forest improves prediction accuracy by combining multiple decision trees, reducing overfitting, and handling noise effectively.
#It works well with complex datasets, identifying the most influential features, like age or health factors, to predict the "Level" variable.
#This makes it a reliable and insightful choice for health-related predictions.

#working(short note)
#A Random Forest model (RandomForestClassifier()) is created and trained using the training data (X_train and y_train).
#After training, the model is used to predict the values for the test set (X_test).
#The Random Forest algorithm is applied here to predict the "Level" (cancer severity in patients)
#based on multiple factors; combining many decision trees helps produce a more accurate and reliable model.