In [2]:
import pandas as pd
import numpy as np

In [3]:
#Data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the lung-cancer survey CSV into a DataFrame.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload
# location; adjust it when running in another environment.
dataset = pd.read_csv('/content/survey lung cancer.csv')

In [5]:
# Preview the first rows of the raw data (GENDER and LUNG_CANCER are still text here).
dataset.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [6]:
# List the column labels. In this file 'FATIGUE ' and 'ALLERGY ' carry a
# trailing space, which matters for any lookup by column name.
dataset.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [7]:
# Encode the two text columns as integers.
# GENDER gets its own encoder so its fitted class mapping stays available
# (gender_encoder.classes_ / inverse_transform) instead of being overwritten
# when a single shared encoder is refit on the target column.
gender_encoder = LabelEncoder()
dataset['GENDER'] = gender_encoder.fit_transform(dataset['GENDER'])

# `encoder` holds the LUNG_CANCER mapping (NO -> 0, YES -> 1).
encoder = LabelEncoder()
dataset['LUNG_CANCER'] = encoder.fit_transform(dataset['LUNG_CANCER'])

In [8]:
# Separate features from the label: every column except the last one is a
# feature, the last column (LUNG_CANCER) is the target.
feature_columns = dataset.iloc[:, :-1]
label_column = dataset.iloc[:, -1]
x = feature_columns.values
y = label_column.values

In [9]:
# Preview the data again now that GENDER and LUNG_CANCER are integer-coded.
dataset.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [10]:
# Normalize features
# StandardScaler standardizes each feature column. It is fit here on every
# row of x, i.e. before any train/test split has been made.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

In [11]:
from sklearn.model_selection import train_test_split

# Split the raw features first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the full dataset lets test-set statistics leak
# into the features the models are trained on and inflates the test metrics.
# random_state is unchanged, so the same rows land in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# `scaler` now holds the train-only fit; any new record must be transformed
# with it before being passed to a model trained on X_train.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [12]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
# Instantiate the four classifiers that will be compared
svm_model = SVC(
    kernel='poly',
    probability=True,
)
naive_bayes_model = GaussianNB()
knn_model = KNeighborsClassifier()
logreg_model = LogisticRegression()

In [14]:
# Train models
# Fit the SVM classifier on the training split.
svm_model.fit(X_train, y_train)

In [15]:
# Fit the Gaussian Naive Bayes classifier on the training split.
naive_bayes_model.fit(X_train, y_train)

In [16]:
# Fit the k-nearest-neighbours classifier on the training split.
knn_model.fit(X_train, y_train)

In [17]:
# Fit the logistic regression classifier on the training split.
logreg_model.fit(X_train, y_train)

In [18]:
#Model Evaluation
from sklearn.metrics import accuracy_score, classification_report,precision_score,recall_score

In [19]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature rows to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, report, precision, recall)`` where ``report`` is the
        text produced by ``classification_report`` and precision/recall are
        computed for the positive label ``1``.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        precision_score(y_test, predictions, pos_label=1),
        recall_score(y_test, predictions, pos_label=1),
    )

In [20]:
# Evaluate the SVM on the test split and unpack its four metrics
svm_metrics = evaluate_model(svm_model, X_test, y_test)
svm_accuracy, svm_report, svm_p, svm_r = svm_metrics
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.978494623655914


In [21]:
# Evaluate Naive Bayes on the test split and unpack its four metrics
naive_bayes_metrics = evaluate_model(naive_bayes_model, X_test, y_test)
naive_bayes_accuracy, naive_bayes_report, nvp, nvr = naive_bayes_metrics
print("Naive Bayes Accuracy:", naive_bayes_accuracy)

Naive Bayes Accuracy: 0.956989247311828


In [22]:
# Evaluate KNN on the test split and unpack its four metrics
knn_metrics = evaluate_model(knn_model, X_test, y_test)
knn_accuracy, knn_report, knn_p, knn_r = knn_metrics
print("KNN Accuracy:", knn_accuracy)

KNN Accuracy: 0.9354838709677419


In [23]:
# Evaluate logistic regression on the test split and unpack its four metrics
logreg_metrics = evaluate_model(logreg_model, X_test, y_test)
logreg_accuracy, logreg_report, log_reg_p, lgr = logreg_metrics
print("logistic regression Accuracy:", logreg_accuracy)

logistic regression Accuracy: 0.978494623655914


In [24]:
# Test accuracies in the order: SVM, Naive Bayes, KNN, logistic regression
accuracy = [
    svm_accuracy,
    naive_bayes_accuracy,
    knn_accuracy,
    logreg_accuracy,
]
print(accuracy)

[0.978494623655914, 0.956989247311828, 0.9354838709677419, 0.978494623655914]


In [25]:
# Precision for label 1 in the order: SVM, Naive Bayes, KNN, logistic regression
precision = [
    svm_p,
    nvp,
    knn_p,
    log_reg_p,
]
print(precision)

[0.9883720930232558, 0.9659090909090909, 0.9651162790697675, 0.9772727272727273]


In [26]:
# Recall for label 1 in the order: SVM, Naive Bayes, KNN, logistic regression
recall = [
    svm_r,
    nvr,
    knn_r,
    lgr,
]
print(recall)

[0.9883720930232558, 0.9883720930232558, 0.9651162790697675, 1.0]


In [27]:
# Choose the classifier with the highest test accuracy.
# `best_model` holds the winning accuracy value (not a model object);
# `selected_model` is the first classifier in the table whose accuracy equals
# it, so ties resolve in the order SVM, Naive Bayes, KNN, logistic regression.
scored_models = [
    (svm_accuracy, svm_model),
    (naive_bayes_accuracy, naive_bayes_model),
    (knn_accuracy, knn_model),
    (logreg_accuracy, logreg_model),
]
best_model = max(score for score, _ in scored_models)
selected_model = next(
    (candidate for score, candidate in scored_models[:-1] if score == best_model),
    logreg_model,  # fallback when none of the first three matches
)

In [28]:
# Show which estimator won the accuracy comparison.
print("Selected Model:", selected_model)

Selected Model: SVC(kernel='poly', probability=True)


In [29]:
# Per-class precision / recall / F1 of the selected model on the test split
selected_predictions = selected_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, selected_predictions))

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.99      0.99      0.99        86

    accuracy                           0.98        93
   macro avg       0.92      0.92      0.92        93
weighted avg       0.98      0.98      0.98        93



In [30]:
# Predict lung cancer for one manually entered record.
# Values follow the training column order: GENDER (label-encoded, 1 = M),
# AGE, then the 13 survey answers.
input_data = (1,69,1,2,2,1,1,2,1,2,2,2,2,2,2)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The models were trained on StandardScaler output, so the raw record must go
# through the same fitted scaler; feeding unscaled values (e.g. AGE = 69)
# puts the sample far outside the feature range the model has seen.
input_data_scaled = scaler.transform(input_data_reshaped)

prediction = selected_model.predict(input_data_scaled)
print(prediction)

# Label 0 = no lung cancer, anything else (1) = lung cancer
if prediction[0] == 0:
    print('The person is not having lung cancer')
else:
    print('The person is having lung cancer')

[1]
The person is having lung cancer


**Saving the trained model**

In [31]:
import pickle

In [32]:
# Persist the selected classifier to disk.
# The context manager closes the file handle even if pickling fails; passing
# a bare open(...) to pickle.dump leaves the handle open.
# NOTE(review): only the model is written; the fitted `scaler` used to
# prepare its training features is not part of this file.
filename="trainedmodel2.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(selected_model, model_file)

In [33]:
#loading the saved model
# Use a context manager so the file handle is closed after reading.
# pickle.load can execute arbitrary code: only load files from a trusted source.
with open('trainedmodel2.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [34]:
# Predict lung cancer for one manually entered record using the model
# restored from disk.
# Values follow the training column order: GENDER (label-encoded, 1 = M),
# AGE, then the 13 survey answers.
input_data = (1,69,1,2,2,1,1,2,1,2,2,2,2,2,2)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The pickled model was trained on StandardScaler output, so the raw record
# must be transformed with the same fitted scaler before prediction.
input_data_scaled = scaler.transform(input_data_reshaped)

prediction = loaded_model.predict(input_data_scaled)
print(prediction)

# Label 0 = no lung cancer, anything else (1) = lung cancer
if prediction[0] == 0:
    print('The person is not having lung cancer')
else:
    print('The person is having lung cancer')

[1]
The person is having lung cancer


In [35]:
#ensemble learning
#from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Ensemble model: a random forest is built here. The soft-voting ensemble over
# the four earlier classifiers is kept below but commented out (disabled).
# ensemble = VotingClassifier(
#     estimators=[
#         ('svm', svm_model),
#         ('knn', knn_model),
#         ('nb', naive_bayes_model),
#         ('lr', logreg_model)
#     ],
#     voting='soft'  # Use soft voting for weighted averaging of probabilities
# )
random_forest_ensemble = RandomForestClassifier(
    n_estimators=100,      # Number of decision trees in the ensemble
    random_state=42        # Fixed seed so the forest is reproducible across runs
)

In [37]:
# Train the random forest on the same training split used for the other models
random_forest_ensemble.fit(X_train, y_train)


In [38]:
# Make predictions on the test data with the random forest
y_pred = random_forest_ensemble.predict(X_test)

In [39]:
# Test accuracy of the random forest, printed rounded to two decimals.
# NOTE: this rebinds `accuracy`, which an earlier cell used for the list of
# per-model accuracies; after this line it is a single float.
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble Accuracy: {accuracy:.2f}')

Ensemble Accuracy: 0.98
