## Data Pre-Processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
df=pd.read_csv("data.csv")

In [5]:
df.head(6)

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
5,5,P102,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     1000 non-null   int64 
 1   Patient Id                1000 non-null   object
 2   Age                       1000 non-null   int64 
 3   Gender                    1000 non-null   int64 
 4   Air Pollution             1000 non-null   int64 
 5   Alcohol use               1000 non-null   int64 
 6   Dust Allergy              1000 non-null   int64 
 7   OccuPational Hazards      1000 non-null   int64 
 8   Genetic Risk              1000 non-null   int64 
 9   chronic Lung Disease      1000 non-null   int64 
 10  Balanced Diet             1000 non-null   int64 
 11  Obesity                   1000 non-null   int64 
 12  Smoking                   1000 non-null   int64 
 13  Passive Smoker            1000 non-null   int64 
 14  Chest Pain               

In [7]:
df.columns

Index(['index', 'Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking',
       'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
       'Weight Loss', 'Shortness of Breath', 'Wheezing',
       'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold',
       'Dry Cough', 'Snoring', 'Level'],
      dtype='object')

In [8]:
df.shape

(1000, 26)

In [8]:
df.isnull().sum()

index                       0
Patient Id                  0
Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swallowing Difficulty       0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64

## Dropping data fields which are not required

In [9]:
df.drop(df.columns[[0, 1, 7,9,13,15,19,20,21,24]], axis=1, inplace=True)

In [10]:
df.head(10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Genetic Risk,Balanced Diet,Obesity,Smoking,Chest Pain,Fatigue,Weight Loss,Shortness of Breath,Frequent Cold,Dry Cough,Level
0,33,1,2,4,5,3,2,4,3,2,3,4,2,2,3,Low
1,17,1,3,1,5,4,2,2,2,2,1,3,7,1,7,Medium
2,35,1,4,5,6,5,6,7,2,4,8,7,9,6,7,High
3,37,1,7,7,7,6,7,7,7,7,4,2,3,6,7,High
4,46,1,6,8,7,7,7,7,8,7,3,2,4,4,2,High
5,35,1,4,5,6,5,6,7,2,4,8,7,9,6,7,High
6,52,2,2,4,5,3,2,4,3,2,3,4,2,2,3,Low
7,28,2,3,1,4,2,4,3,1,3,3,2,2,3,4,Low
8,35,2,4,5,6,6,5,5,6,6,1,4,3,2,4,Medium
9,46,1,2,3,4,4,3,3,2,4,1,2,4,2,1,Medium


In [11]:
df['Gender'].value_counts()

1    598
2    402
Name: Gender, dtype: int64

In [12]:
df.isnull().sum()

Age                    0
Gender                 0
Air Pollution          0
Alcohol use            0
Dust Allergy           0
Genetic Risk           0
Balanced Diet          0
Obesity                0
Smoking                0
Chest Pain             0
Fatigue                0
Weight Loss            0
Shortness of Breath    0
Frequent Cold          0
Dry Cough              0
Level                  0
dtype: int64

## Dealing with Level Column

In [13]:
df['Level'].value_counts()

High      365
Medium    332
Low       303
Name: Level, dtype: int64

In [14]:
df['Level'] = df['Level'].map({'High':1 ,'Low':2})

### Fixing some minor personal errors

In [15]:
df['Level'] = df['Level'].replace([2], 0)

In [16]:
df = df.replace(np.nan, 0)

In [29]:
df.dtypes

Age                    int64
Gender                 int64
Air Pollution          int64
Alcohol use            int64
Dust Allergy           int64
Genetic Risk           int64
Balanced Diet          int64
Obesity                int64
Smoking                int64
Chest Pain             int64
Fatigue                int64
Weight Loss            int64
Shortness of Breath    int64
Frequent Cold          int64
Dry Cough              int64
Level                  int64
dtype: object

In [17]:
df['Level'] = df['Level'].apply(np.int64)

## Final Cleaned Data

In [18]:
df.head(6)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Genetic Risk,Balanced Diet,Obesity,Smoking,Chest Pain,Fatigue,Weight Loss,Shortness of Breath,Frequent Cold,Dry Cough,Level
0,33,1,2,4,5,3,2,4,3,2,3,4,2,2,3,0
1,17,1,3,1,5,4,2,2,2,2,1,3,7,1,7,0
2,35,1,4,5,6,5,6,7,2,4,8,7,9,6,7,1
3,37,1,7,7,7,6,7,7,7,7,4,2,3,6,7,1
4,46,1,6,8,7,7,7,7,8,7,3,2,4,4,2,1
5,35,1,4,5,6,5,6,7,2,4,8,7,9,6,7,1


## Setting Up scikit learn

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [19]:
X = df.drop(columns='Level', axis =1)
Y = df['Level']

In [21]:
# training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(1000, 15) (800, 15) (200, 15)


## Building Model

In [23]:
model = LogisticRegression()

In [46]:
model.fit(X_train, Y_train)
     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [49]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy*100)

Accuracy score of the training data :  100.0


In [50]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy*100)

Accuracy score of the test data :  100.0


In [52]:

input_data = (33,1,2,4,5,3,2,4,3,2,3,4,2,2,3)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not LunG Cancer')
else:
  print('The Person has Lung Cancer Disease')

[0]
The Person does not LunG Cancer




In [51]:
df

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Genetic Risk,Balanced Diet,Obesity,Smoking,Chest Pain,Fatigue,Weight Loss,Shortness of Breath,Frequent Cold,Dry Cough,Level
0,33,1,2,4,5,3,2,4,3,2,3,4,2,2,3,0
1,17,1,3,1,5,4,2,2,2,2,1,3,7,1,7,0
2,35,1,4,5,6,5,6,7,2,4,8,7,9,6,7,1
3,37,1,7,7,7,6,7,7,7,7,4,2,3,6,7,1
4,46,1,6,8,7,7,7,7,8,7,3,2,4,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,7,7,7,5,3,2,4,5,1
996,37,2,6,8,7,7,7,7,7,7,9,6,5,3,1,1
997,25,2,4,5,6,5,6,7,2,4,8,7,9,6,7,1
998,18,2,6,8,7,7,7,7,8,7,3,2,4,4,2,1
