In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **BREAST CANCER DETECTION**

Breast cancer is the most common cancer amongst women in the world. It starts when cells in the breast begin to grow abnormally. These cells usually form tumors that can be seen via X-ray or felt as lumps in the breast area.



# **DATA PREPARATION**

**Attribute Information:**

1)ID number 2) Diagnosis (M = malignant, B = benign) 3–32)

**Ten real-valued features are computed for each cell nucleus:** 

*   radius (mean of distances from center to points on the perimeter)
*   texture (standard deviation of gray-scale values)
*   perimeter
*   area
*   smoothness (local variation in radius lengths)
*   compactness (perimeter² / area − 1.0)
*   concavity (severity of concave portions of the contour)
*   concave points (number of concave portions of the contour)
*   symmetry
*   fractal dimension (“coastline approximation” − 1)

The **mean, standard error and “worst”** or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.

In [None]:
#Description : This program detects breast cancer,based off of data

In [None]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **Loading the data**

In [None]:
#Load the data
data = "../input/breast-cancer-dataset/breast-cancer.csv"
df=pd.read_csv(data)
print(df)

In [None]:
#To display first five records
df.head()

In [None]:
#To display last five records
df.tail()

### **Describing the data**

In [None]:
#describe() method returns description of the data in the DataFrame.
df.describe()

In [None]:
#To display number of columns, column labels, column data types, memory usage, range index, and the number of cells in each column (non-null values).
df.info()

In [None]:
#To show top 30 rows and bottom 30 rows
#`df.info` without parentheses never calls the method; it only echoes the bound method together with
#the frame's (truncated) repr. Build the 30 + 30 row view explicitly so the output matches the intent.
#If df has fewer than 60 rows, some rows will appear in both halves.
pd.concat([df.head(30), df.tail(30)])

In [None]:
type(df)

In [None]:
#To display number of rows and columns
df.shape

In [None]:
#To return data type of each column. 
df.dtypes

In [None]:
#To return the number of unique values for each column
df.nunique()

In [None]:
#To count the number of non-empty (non-null) values in each column
df.count()

# **DATA WRANGLING**

It is the process of converting or mapping data from one “**raw**” form into another
format to make it ready for further analysis. **Data pre-processing** is also often called “**data cleaning**” or “**data wrangling**”.

In [None]:
#Count the number of empty values (NaN,NAN,na) in each column
df.isna().sum()

In [None]:
#Dropping unnecessary column
#Reassign instead of using inplace=True, and ignore a missing "id", so that re-running this cell
#after the column has already been removed does not raise a KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
#Checking whether the column has been removed or not
df.head()

In [None]:
#Checking number of rows and columns again
df.shape

# **EXPLORATORY DATA ANALYSIS**

### **Classification of patients into malignant(cancerous) or benign(non cancerous) groups and checking size**

In [None]:
#To count the number of malignant(M) or benign(B) cells
df['diagnosis'].value_counts()

In [None]:
#Another method
df.groupby('diagnosis').size()

### **Visualizing the data**

In [None]:
#Visualization of data using matplotlib.pyplot library
import matplotlib.pyplot as plt
#diagnosis is categorical, so draw one bar per class from its counts
#instead of binning the labels with a numeric histogram.
diagnosis_counts = df['diagnosis'].value_counts()
plt.bar(diagnosis_counts.index.astype(str), diagnosis_counts.values)
plt.title("Number of records per diagnosis")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()

In [None]:
#Getting all values of column diagnosis
df["diagnosis"].values

In [None]:
#Putting the count of individual values in a list
#The labels are read from the DataFrame instead of a pasted copy of an earlier cell's output,
#so the counts always match the data that was actually loaded.
#This cell expects the raw 'M'/'B' labels, i.e. it must run before diagnosis is label-encoded.
data = df['diagnosis'].tolist()
Mcount = data.count('M')
Bcount = len(data) - Mcount   #every label other than 'M' is counted as benign
List = [Mcount, Bcount]       #order matches the pie-chart labels: malignant first, benign second
List

In [None]:
#Visualizing it using piechart
plt.pie(List,labels=["Malignant","Benign"])
plt.show()

In [None]:
#Seeing relationship between radius_mean and diagnosis using scatter plot
import matplotlib.pyplot as plt
x=df['diagnosis']
y=df['radius_mean']
plt.scatter(x,y)
plt.title("Scatterplot of diagnosis and radius_mean")
plt.xlabel("Diagnosis")
plt.ylabel("Radius_mean")
plt.show()

In [None]:
#Seeing relationship between radius_se and diagnosis using a bar chart
#Calling plt.bar with one value per record draws every record's bar at the same two x positions,
#so the bars overlap. Aggregate to the mean radius_se per class so each class gets exactly one bar.
radius_se_by_diagnosis = df.groupby("diagnosis")["radius_se"].mean()
x = radius_se_by_diagnosis.index.astype(str)
y = radius_se_by_diagnosis.values
plt.bar(x, y)
plt.title("Barchart of diagnosis and radius_se")
plt.xlabel("Diagnosis")
plt.ylabel("Mean radius_se")
plt.show()

In [None]:
#Visualization of data using seaborn library
import seaborn as sns
#Visualizing diagnosis using countplot
#A count plot is helpful when dealing with categorical values. It is used to plot the frequency of the different categories.
#The column is passed by keyword because recent seaborn releases treat the first positional
#argument as `data` rather than `x`; the keyword form works across versions.
sns.countplot(x="diagnosis", data=df)

In [None]:
#Visualizing data using boxplot
sns.boxplot(data=df)
#Here we got boxplot of all the columns on a one go,but its not that much clear

In [None]:
#We will divide different columns for a boxplot which will make it easier to understand
sns.boxplot(data=df.iloc[:,1:6])

In [None]:
sns.boxplot(data=df.iloc[:,6:11])

In [None]:
sns.boxplot(data=df.iloc[:,11:16])

In [None]:
sns.boxplot(data=df.iloc[:,16:21])

In [None]:
sns.boxplot(data=df.iloc[:,21:26])

In [None]:
sns.boxplot(data=df.iloc[:,26:31])

### **Encoding categorical data values**

In [None]:
#To display the values of column diagnosis
df.iloc[:,0].values

In [None]:
#Encoding using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
#Replace the whole column by label instead of assigning through .iloc[:,0].
#NOTE(review): on pandas 2.x an .iloc column assignment writes into the existing object-dtype
#column, so the encoded integers would stay dtype object and scikit-learn would later reject the
#target as an unknown label type. Assigning by column label gives the column the integer dtype.
target_column=df.columns[0]   #first column holds the diagnosis labels
df[target_column]=label_encoder.fit_transform(df[target_column].values)
df.iloc[:,0].values

In [None]:
#Now checking data type of column diagnosis
df['diagnosis'].dtype

In [None]:
#To display first 5 rows of new data
df.head(5)

In [None]:
#Seeing relationship between radius_mean and diagnosis using regression plot
sns.regplot(x='diagnosis',y='radius_mean',data=df)

In [None]:
#Seaborn lets us plot multiple scatter plots.It pairs all the continuous data and plots their correlation. It also plots the distribution of the data.
sns.pairplot(df.iloc[:,0:6], hue='diagnosis')   
#The hue parameter determines which column in the data frame should be used for colour encoding.Basically used to split the bars.

### **Correlation of columns**

In [None]:
#Getting the correlation of columns by using corr() function
#corr() function tells us how one column can influence the other
df.corr()

In [None]:
#Finding the correlation of diagnosis and first 10 columns(1-10)
data1=df.iloc[:,0:11].corr()
data1

In [None]:
#Visualize the correlation of above data using heatmap
sns.heatmap(data1,annot=True,fmt='.0%')

In [None]:
#Finding the correlation of diagnosis and next 10 columns(11-20)
data2=df.iloc[:,[0,11,12,13,14,15,16,17,18,19,20]].corr()
data2

In [None]:
#Visualize the correlation of above data using heatmap
sns.heatmap(data2,annot=True,fmt='.0%')

In [None]:
#Finding the correlation of diagnosis and next 10 columns(21-30)
data3=df.iloc[:,[0,21,22,23,24,25,26,27,28,29,30]].corr()
data3

In [None]:
#Visualize the correlation of above data using heatmap
sns.heatmap(data3,annot=True,fmt='.0%')

# **MACHINE LEARNING**

**Basic Steps:-**
* Split the dataset into independent(X) and dependent(Y) datasets
* Split the dataset into training and testing
* Model Building
* Prediction
* Testing model accuracy


### **1.Split the dataset into independent(X) and dependent(Y) datasets**

In [None]:
#Dividing data into X and Y(converting into numpy)
X=df.iloc[:,1:31].values  #Independent dataset tells us features that detect if the patient has cancer or not
Y=df.iloc[:,0].values     #Dependent dataset tells us if the patient has cancer or not

In [None]:
print(X)

In [None]:
print(Y)

### **2.Split the dataset into training and testing**

In [None]:
#Splitting dataset into 80% training and 20% testing
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
Y_train.shape

In [None]:
Y_test.shape

#### **Model 1:- Logistic Regression**

### **3.Model Building**

In [None]:
#Fitting the Logistic Regression model on X and Y
from sklearn.linear_model import LogisticRegression
#The features are not scaled, so the default limit of 100 solver iterations is usually too low
#for the optimiser to converge (scikit-learn emits a ConvergenceWarning). Raise the limit;
#standardising the features first would be an alternative.
model1=LogisticRegression(max_iter=10000)
model1.fit(X_train,Y_train)

### **4.Prediction**

In [None]:
#To print the prediction of this model 
pred1=model1.predict(X_test)
pred1

In [None]:
Y_test

### **5.Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test,pred1))

#### **Model 2:- Decision Tree Classifier**

### **Model Building**

In [None]:
#Fitting the Decision Tree Classifier model on X and Y
from sklearn.tree import DecisionTreeClassifier
model2=DecisionTreeClassifier(criterion='entropy',random_state=0)
model2.fit(X_train,Y_train)

### **Prediction**

In [None]:
#To print the prediction of this model
pred2=model2.predict(X_test)
pred2

In [None]:
Y_test

### **Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test,pred2))

#### **Model 3:- Random Forest Classifier**

### **Model Building**

In [None]:
#Fitting the Random Forest Classifier model on X and Y
from sklearn.ensemble import RandomForestClassifier
#Fix the seed so the forest (and the accuracy reported below) is the same on every run,
#matching the random_state=0 used for the train/test split and the decision tree.
model3=RandomForestClassifier(random_state=0)
model3.fit(X_train,Y_train)

### **Prediction**

In [None]:
#To print the prediction of this model
pred3=model3.predict(X_test)
pred3

In [None]:
Y_test

### **Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,pred3)

## **Machine Learning on data with customized columns**

In [None]:
df1=df[['diagnosis','radius_mean','perimeter_mean','area_mean','concavity_mean','concave points_mean','radius_worst','perimeter_worst','area_worst','concave points_worst']]

In [None]:
print(df1)

### **1.Split the dataset into independent(X) and dependent(Y) datasets**

In [None]:
#Dividing data into X1 and Y1(converting into numpy)
X1=df1.iloc[:,1:10].values  #Independent dataset tells us features that detect if the patient has cancer or not
Y1=df1.iloc[:,0].values    #Dependent dataset tells us if the patient has cancer or not

In [None]:
print(X1)

In [None]:
print(Y1)

### **2.Split the dataset into training and testing**

In [None]:
#Splitting dataset into 80% training and 20% testing
from sklearn.model_selection import train_test_split
X_train1,X_test1,Y_train1,Y_test1=train_test_split(X1,Y1,test_size=0.20,random_state=0)

In [None]:
X_train1.shape

In [None]:
X_test1.shape

In [None]:
Y_train1.shape

In [None]:
Y_test1.shape

#### **Model 1:- Logistic Regression**

### **3.Model Building**

In [None]:
#Fitting the Logistic Regression model on X1 and Y1
from sklearn.linear_model import LogisticRegression
#The features are not scaled, so the default limit of 100 solver iterations is usually too low
#for the optimiser to converge. Raise the limit; standardising the features would be an alternative.
model11=LogisticRegression(max_iter=10000)
model11.fit(X_train1,Y_train1)

### **4.Prediction**

In [None]:
#To print the prediction of this model 
pred11=model11.predict(X_test1)
pred11

In [None]:
Y_test1

### **5.Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
accuracy_score(Y_test1,pred11)

#### **Model 2:- Decision Tree Classifier**

### **Model Building**

In [None]:
#Fitting the Decision Tree Classifier model on X1 and Y1
from sklearn.tree import DecisionTreeClassifier
model22=DecisionTreeClassifier(random_state=0)
model22.fit(X_train1,Y_train1)

### **Prediction**

In [None]:
#To print the prediction of this model
pred22=model22.predict(X_test1)
pred22

In [None]:
Y_test1

### **Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
accuracy_score(Y_test1,pred22)

#### **Model 3:- Random Forest Classifier**

### **Model Building**

In [None]:
#Fitting the Random Forest Classifier model on X1 and Y1
from sklearn.ensemble import RandomForestClassifier
#Fix the seed so the forest (and the accuracy reported below) is the same on every run.
model33=RandomForestClassifier(random_state=0)
model33.fit(X_train1,Y_train1)

### **Prediction**

In [None]:
#To print the prediction of this model
pred33=model33.predict(X_test1)
pred33

In [None]:
Y_test1

### **Testing model accuracy**

In [None]:
#Testing model accuracy on test data
from sklearn.metrics import accuracy_score
accuracy_score(Y_test1,pred33)

# Predicting cancer using the 9 features whose correlation lies between 42% and 70%

In [None]:
df2=df[['texture_mean','compactness_mean','radius_se','perimeter_se','area_se','texture_worst','smoothness_worst','compactness_worst','concavity_worst']]
df2

In [None]:
#Independent dataset: the 9 hand-picked feature columns held in df2.
#(A positional slice df.iloc[:,1:9] would instead take the first 8 feature columns of df
#and leave the df2 selection unused.)
X2=df2
#Dependent dataset: the diagnosis column, which is the first column of df
Y2=df.iloc[:,0]

In [None]:
X2.shape

In [None]:
Y2.shape

In [None]:
#splitting into train test
from sklearn.model_selection import train_test_split
X_train2,X_test2,Y_train2,Y_test2=train_test_split(X2,Y2,test_size=0.20,random_state=0)

In [None]:
X_train2.shape


In [None]:
X_test2.shape

In [None]:
Y_train2.shape

In [None]:
Y_test2.shape

In [None]:
#Fitting the Logistic Regression model on X2 and Y2
from sklearn.linear_model import LogisticRegression
#The features are not scaled, so the default limit of 100 solver iterations is usually too low
#for the optimiser to converge. Raise the limit; standardising the features would be an alternative.
modelLR=LogisticRegression(max_iter=10000)
modelLR.fit(X_train2,Y_train2)

In [None]:
#Predicting
predLR= modelLR.predict(X_test2)
predLR

In [None]:
#Accuracy 
from sklearn.metrics import accuracy_score
accuracy_score(Y_test2,predLR)

In [None]:
#Fitting Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
#Fix the seed, as the other decision-tree cells do, so tie-breaking between equally good
#splits (and therefore the reported accuracy) is the same on every run.
modeltree=DecisionTreeClassifier(random_state=0)
modeltree.fit(X_train2,Y_train2)

In [None]:
#Predict
predtree=modeltree.predict(X_test2)
predtree

In [None]:
#accuracy
from sklearn.metrics import accuracy_score
accuracy_score(Y_test2,predtree)

In [None]:
#Fitting the Random Forest Classifier model on X2 and Y2
from sklearn.ensemble import RandomForestClassifier
#Fix the seed so the forest (and the accuracy reported below) is the same on every run.
modelRFC=RandomForestClassifier(random_state=0)
modelRFC.fit(X_train2,Y_train2)

In [None]:
#predicting
predRFC= modelRFC.predict(X_test2)
predRFC

In [None]:
#accuracy
from sklearn.metrics import accuracy_score
accuracy_score(Y_test2,predRFC)