**Importing Libraries**

In [3]:
#Importing the Libraries
#for data manipulation, analysis, and handling. 
import numpy as np
import pandas as pd

#for machine learning and model evaluation
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


#for data visualization
import matplotlib as plt
import seaborn as sns

**Data Collection and Analysis**

In [4]:
# Loading the data
lung_cancer_data = pd.read_excel('lung_cancer.xlsx') 


In [5]:
# Displaying the first 5 rows of the dataset
print(lung_cancer_data.head())

  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC_DISEASE  FATIGUE  ALLERGY  WHEEZING  ALCOHOL_CONSUMING  COUGHING  \
0                1        2        1         2                  2         2   
1                2        2        2         1                  1         1   
2                1        2        1         2                  1         2   
3                1        1        1         1                  2         1   
4                1        1        1         2                  1         2   

   SHORTNESS_OF_BREATH  SWALLOWING_DIFFICULTY  CHEST_PAIN LUNG_CANCER  
0                    2                      2           

In [6]:
# Displaying information about the dataset
print(lung_cancer_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC_DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL_CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS_OF_BREATH    309 non-null    int64 
 13  SWALLOWING_DIFFICULTY  309 non-null    int64 
 14  CHEST_PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [7]:
# Displaying the number of missing values in each column
print(lung_cancer_data.isnull().sum())

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC_DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL_CONSUMING        0
COUGHING                 0
SHORTNESS_OF_BREATH      0
SWALLOWING_DIFFICULTY    0
CHEST_PAIN               0
LUNG_CANCER              0
dtype: int64


In [8]:
# Displaying some statistical measures about the data
print(lung_cancer_data.describe())

              AGE     SMOKING  YELLOW_FINGERS     ANXIETY  PEER_PRESSURE  \
count  309.000000  309.000000      309.000000  309.000000     309.000000   
mean    62.673139    1.563107        1.569579    1.498382       1.501618   
std      8.210301    0.496806        0.495938    0.500808       0.500808   
min     21.000000    1.000000        1.000000    1.000000       1.000000   
25%     57.000000    1.000000        1.000000    1.000000       1.000000   
50%     62.000000    2.000000        2.000000    1.000000       2.000000   
75%     69.000000    2.000000        2.000000    2.000000       2.000000   
max     87.000000    2.000000        2.000000    2.000000       2.000000   

       CHRONIC_DISEASE     FATIGUE     ALLERGY    WHEEZING  ALCOHOL_CONSUMING  \
count       309.000000  309.000000  309.000000  309.000000         309.000000   
mean          1.504854    1.673139    1.556634    1.556634           1.556634   
std           0.500787    0.469827    0.497588    0.497588           0.4

In [9]:
# Displaying the distribution of the target variable
print(lung_cancer_data['LUNG_CANCER'].value_counts())

LUNG_CANCER
YES    270
NO      39
Name: count, dtype: int64


In [10]:
# Data Pre-Processing
# Separating the features & Target
X = lung_cancer_data.drop(columns=['GENDER', 'LUNG_CANCER'], axis=1)
Y = lung_cancer_data['LUNG_CANCER']


In [11]:
# Displaying the features
print(X)

     AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  CHRONIC_DISEASE  \
0     69        1               2        2              1                1   
1     74        2               1        1              1                2   
2     59        1               1        1              2                1   
3     63        2               2        2              1                1   
4     63        1               2        1              1                1   
..   ...      ...             ...      ...            ...              ...   
304   56        1               1        1              2                2   
305   70        2               1        1              1                1   
306   58        2               1        1              1                1   
307   67        2               1        2              1                1   
308   62        1               1        1              2                1   

     FATIGUE  ALLERGY  WHEEZING  ALCOHOL_CONSUMING  COUGHING  \

In [12]:
# Displaying the target variable
print(Y)

0      YES
1      YES
2       NO
3       NO
4       NO
      ... 
304    YES
305    YES
306    YES
307    YES
308    YES
Name: LUNG_CANCER, Length: 309, dtype: object


**Data Splitting**

In [13]:
# Splitting the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

**Model Training**

In [14]:
# Model Training
# Support Vector Machine Model
lung_cancer_model = svm.SVC(kernel='linear')

In [15]:
# Training the SVM model with training data
lung_cancer_model.fit(X_train, Y_train)

**Model Evaluation**

In [16]:
# Model Evaluation
# Accuracy Score on training data
X_train_prediction = lung_cancer_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score on training data:', training_data_accuracy)

Accuracy score on training data: 0.951417004048583


In [17]:

# Accuracy Score on test data
X_test_prediction = lung_cancer_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score on test data:', test_data_accuracy)


Accuracy score on test data: 0.8870967741935484


In [18]:
# Precision on test data
precision = precision_score(Y_test, X_test_prediction, pos_label='YES')
print('Precision on test data:', precision)

# Recall on test data
recall = recall_score(Y_test, X_test_prediction, pos_label='YES')
print('Recall on test data:', recall)

# F1-score on test data
f1 = f1_score(Y_test, X_test_prediction, pos_label='YES')
print('F1 score on test data:', f1)

# Confusion Matrix on test data
conf_matrix = confusion_matrix(Y_test, X_test_prediction)
print('Confusion Matrix on test data:\n', conf_matrix)


Precision on test data: 0.896551724137931
Recall on test data: 0.9811320754716981
F1 score on test data: 0.9369369369369369
Confusion Matrix on test data:
 [[ 3  6]
 [ 1 52]]


**Building a Predictive system**

In [19]:
# Building a Predictive System
# Providing input data for prediction
input_data = (48,1,2,2,2,2,2,2,2,1,2,2,2,1)

# Converting input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshaping the numpy array
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Making predictions
prediction = lung_cancer_model.predict(input_data_reshaped)
print(prediction)

# Displaying the result
if prediction[0] == 'NO':
    print("The person does not have lung cancer.")
else:
    print("The person has lung cancer.")

['YES']
The person has lung cancer.




**Saving the model**

In [20]:
import pickle
filename = 'lung_cancer_model.sav'
pickle.dump(lung_cancer_model, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('lung_cancer_model.sav', 'rb'))

In [21]:
#printing the names of the columns
for column in X.columns:
  print(column)

AGE
SMOKING
YELLOW_FINGERS
ANXIETY
PEER_PRESSURE
CHRONIC_DISEASE
FATIGUE
ALLERGY
WHEEZING
ALCOHOL_CONSUMING
COUGHING
SHORTNESS_OF_BREATH
SWALLOWING_DIFFICULTY
CHEST_PAIN
