## The breast cancer Wisconsin dataset (classification problem)

Let us build two predictive models on the following dataset:

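scikit-learn's `load_breast_cancer` provides the Breast Cancer Wisconsin (Diagnostic) dataset, with 30 numeric features per sample:
(http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)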



In [None]:
# Fetch the breast cancer Wisconsin data that ships with scikit-learn
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# Last expression of the cell: display the container type that was returned
type(cancer)

sklearn.utils.Bunch

`Bunch` is a dictionary-like object in scikit-learn. Its most useful attributes are:
- `data`, the feature matrix to learn from,
- `target`, the classification labels,
- `target_names`, the meaning of the labels,
- `feature_names`, the meaning of the features,
- `DESCR`, the full description of the dataset, and
- `filename`, the physical location of the breast cancer CSV dataset (added in version 0.20).

In [None]:
# Display every feature name as a plain Python list
[feature for feature in cancer.feature_names]

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry',
 'worst fractal dimension']

In [None]:
# Display the class labels as a plain Python list
[label for label in cancer.target_names]

['malignant', 'benign']

## GaussianNB method of Naive Bayes

In [None]:
# Hold out 25% of the samples as a test set; random_state fixes the shuffle
# so the same partition is produced on every run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Feature scaling: learn the scaling parameters from the training split only,
# then apply that same fitted scaler to both the training and the test split
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Task: build a prediction model on the training data with the GaussianNB
# (Gaussian Naive Bayes) classifier, obtain the accuracy of its predictions
# on the test data, and create a confusion matrix as part of the output.

# ANSWER:
# Workflow: import -> instantiate -> fit on training data -> evaluate on test data

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so creation and training are chained
gnb = GaussianNB().fit(X_train, y_train)

# Predicted class labels for the held-out test samples
y_pred = gnb.predict(X_test)

# Accuracy of the predictions on the test set
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

# Confusion matrix of true vs. predicted labels; left as the last expression
# so the notebook displays it
confusion_matrix(y_test, y_pred)





0.916083916083916


array([[47,  6],
       [ 6, 84]])

## SVM


In [None]:
# Start again from the raw data: hold out 25% of the samples for testing.
# random_state=0 keeps the partition reproducible.
from sklearn.model_selection import train_test_split

svm_split = train_test_split(cancer.data, cancer.target, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = svm_split

In [None]:
# Feature scaling for the SVM: fit the scaler on the training split only,
# then transform both splits with that fitted scaler
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Task: build a prediction model on the training data with scikit-learn's
# Support Vector Machine (SVM) classifier, obtain the accuracy of its
# predictions on the test data, and create a confusion matrix as part of the output.

# ANSWER:

from sklearn import metrics, svm
from sklearn.metrics import confusion_matrix

# Support vector classifier with a linear kernel, trained on the training split
svmclf = svm.SVC(kernel='linear')
svmclf.fit(X_train, y_train)

# Predicted class labels for the held-out test samples
y_pred = svmclf.predict(X_test)

# Accuracy of the predictions on the test set
svm_accuracy = metrics.accuracy_score(y_test, y_pred)
print(svm_accuracy)

# Confusion matrix of true vs. predicted labels; left as the last expression
# so the notebook displays it
confusion_matrix(y_test, y_pred)







0.972027972027972


array([[51,  2],
       [ 2, 88]])

## Diabetes dataset

https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

The dataset contains 10 features (that have already been mean centered and scaled) and a target value: a measure of disease progression one year after baseline.

In [None]:
# Load the diabetes features and target as arrays, then wrap each in a
# DataFrame so the first rows can be inspected
import pandas as pd
from sklearn.datasets import load_diabetes

features, target = load_diabetes(return_X_y=True)
X, y = pd.DataFrame(features), pd.DataFrame(target)

# Preview the first five rows: features first, then the target
for frame in (X, y):
    print(frame.head())

          0         1         2         3         4         5         6  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

          7         8         9  
0 -0.002592  0.019908 -0.017646  
1 -0.039493 -0.068330 -0.092204  
2 -0.002592  0.002864 -0.025930  
3  0.034309  0.022692 -0.009362  
4 -0.002592 -0.031991 -0.046641  
       0
0  151.0
1   75.0
2  141.0
3  206.0
4  135.0


In [None]:
# Build the training and test sets: 25% of the rows are held out for testing,
# with a fixed random_state so the partition is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [None]:
# Task: fit a linear regression model on the training data and determine
# the training set score and the test set score
# ANSWER:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so creation and training are chained
model = LinearRegression().fit(X_train, y_train)

# score() reports R^2; print the training score first, then the test score
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(model.score(X_part, y_part))


0.555437148935302
0.35940090989715534
