Import all the libraries you require in the cell below.


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Enter your code here
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (RandomForestClassifier, VotingClassifier,
                              BaggingClassifier, AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Implement the Ensemble methods learnt in class and compare their accuracies.

The dataset you are going to be using for homework is the **Wisconsin Breast Cancer dataset (cancer.csv)**

The dataset contains a total of 10 features, and each sample is labeled as either benign or malignant. There are 699 instances, among which 16 feature values are missing. The dataset only contains numeric values.

Attribute Information:

1. Sample code number: id number
2. Clump Thickness: 1 - 10
3. Uniformity of Cell Size: 1 - 10
4. Uniformity of Cell Shape: 1 - 10
5. Marginal Adhesion: 1 - 10
6. Single Epithelial Cell Size: 1 - 10
7. Bare Nuclei: 1 - 10
8. Bland Chromatin: 1 - 10
9. Normal Nucleoli: 1 - 10
10. Mitoses: 1 - 10
11. Class: (2 for benign, 4 for malignant) (**target variable**)

For more information: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

### 1. Read the dataset into variable called '**data**' (1 mark)

In [None]:
# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Enter your code here

# Load the dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='cancer.csv')

### **Preprocessing**: Data needs to be preprocessed before implementing ensemble methods. It is done for you here. 
### Run the below code first and then answer the questions from 2 - 7.

#### Deleting unnecessary columns: The column "Sample code number" is just an indicator and it's of no use in the modeling. So, let's drop it:


In [None]:
# Remove the identifier column in place; it is not used for modeling.
data.drop(columns=['Sample Code Number'], inplace=True)

#### Handling missing values : 
As mentioned earlier, the dataset contains missing values. The column named "Bland Chromatin" contains them. The missing values are represented as "?".

Replace those "?"s with 0's and impute them with Mean Imputation

In [None]:
# Display the column that holds the "?" placeholders.
data.loc[:, 'Bland Chromatin']

0       1
1      10
2       2
3       4
4       1
       ..
694     2
695     1
696     3
697     4
698     5
Name: Bland Chromatin, Length: 699, dtype: object

In [None]:
# Swap every "?" placeholder for 0, modifying the DataFrame in place.
data.replace(to_replace='?', value=0, inplace=True)

In [None]:
# Convert the DataFrame object into NumPy array otherwise you will not be able to impute
values = data.values
# Now impute it.
# The "?" placeholders were replaced with 0 earlier, so 0 is the marker for a
# missing value. Using missing_values=np.nan here would leave the data
# unchanged, because the array contains no NaN entries, and the zeros would
# flow into the scaler as if they were real measurements.
imputer = SimpleImputer(missing_values=0, strategy='mean')
imputedData = imputer.fit_transform(values)

#### Normalizing the data:
Ranges of the features of the dataset are not the same. This may cause a problem: a feature with a larger range can dominate one with a smaller range, so a small change in one feature is not comparable to the same change in another. To address this problem, normalize the ranges of the features to a uniform range, in this case, 0 - 1.

In [None]:
# Rescale every column of the imputed array to the 0 - 1 interval.
scaler = MinMaxScaler(feature_range=(0, 1))
normalizedData = scaler.fit(imputedData).transform(imputedData)

# Column labels to attach to the scaled array, in positional order.
cols = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bland Chromatin',
    'Bare Nuclei',
    'Normal Nucleoli',
    'Mitosis',
    'Class',
]

# Wrap the scaled array back into a labeled DataFrame and preview it.
normalizedData = pd.DataFrame(data=normalizedData, columns=cols)
print(normalizedData.head(n=5))

   Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  \
0         0.444444                 0.000000                  0.000000   
1         0.444444                 0.333333                  0.333333   
2         0.222222                 0.000000                  0.000000   
3         0.555556                 0.777778                  0.777778   
4         0.333333                 0.000000                  0.000000   

   Marginal Adhesion  Single Epithelial Cell Size  Bland Chromatin  \
0           0.000000                     0.111111              0.1   
1           0.444444                     0.666667              1.0   
2           0.000000                     0.111111              0.2   
3           0.000000                     0.222222              0.4   
4           0.222222                     0.111111              0.1   

   Bare Nuclei  Normal Nucleoli  Mitosis  Class  
0     0.222222         0.000000      0.0    0.0  
1     0.222222         0.111111      0.0

### Data preprocessing is done and now you will answer the below questions using the **normalizedData**: 

### 2. Split the data into test and training data with test size - 30%. Compute the baseline classification accuracy for X_train. (3 marks)

In [None]:
# Enter your code here
y = normalizedData['Class'] # goal is to predict class, assign Y
X = normalizedData.iloc[:, :-1] # all other variables are features, assign X
# Fix the seed so the split, and every accuracy computed from it, is the same
# on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Baseline: always predict the most frequent class seen in the training set.
dc = DummyClassifier(strategy='most_frequent')
dc.fit(X_train, y_train)
# The task asks for the baseline on X_train, and the label below says
# "Training", so score on the training split rather than the test split.
baseline = dc.score(X_train, y_train)
print('Training Accuracy =', baseline)

Training Accuracy = 0.638095238095238


### 3.  Bagging : Build a generic Bagging ensemble and print the accuracy (4 marks)
---


Hyperparameters:

Base estimator = DecisionTreeClassifier

n_estimators = 10

random_state = 42

---


In [None]:
# Generic Bagging model
# Enter your code here
# Hyperparameters are spelled out to mirror the task statement:
# decision-tree base estimator, 10 estimators, random_state 42.
bagg_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42,
)
bagg_model.fit(X_train, y_train)

# Evaluate on the held-out test split.
bagg_pred = bagg_model.predict(X_test)
bagg_acc = accuracy_score(y_true=y_test, y_pred=bagg_pred)
print('Bagging Model Accuracy =', bagg_acc)

Bagging Model Accuracy = 0.9523809523809523


### 4. RandomForest : (7 marks)
#### a) Build a Random Forest model and print the accuracy (4 marks)
---

Constructor arguments: 


n_estimators = 100, max_features = 7 and random_state = 42 


---




In [None]:
# Random Forest model
# Enter your code here
# Build the forest with the requested constructor arguments and fit it on the
# training split (fit returns the estimator itself).
rf = RandomForestClassifier(
    n_estimators=100,
    max_features=7,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out test split.
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)

print('Random Forest Accuracy =', rf_acc)

Random Forest Accuracy = 0.9523809523809523


####  b) Calculate the top 3 important features for the above **RandomForest** model and print them (3 marks)

In [None]:
# Top 3 features for RandomForest
# Enter your code here

# Pair each training column name with the importance the fitted forest
# assigned to it.
features = X_train.columns
rf_importances = rf.feature_importances_
rf_rawImportanceValues = pd.DataFrame(
    {'Features': features, 'Importance': rf_importances}
)

# Rank from most to least important and show the first three rows.
rf_importanceValues = rf_rawImportanceValues.sort_values(
    by='Importance', ascending=False
)
print(rf_importanceValues.head(3))

                   Features  Importance
1   Uniformity of Cell Size    0.389888
2  Uniformity of Cell Shape    0.230522
5           Bland Chromatin    0.195402


### 5. Boosting: (7 marks)
#### a) Build an AdaBoost model with training data and print the accuracy (4 marks)
---

Hyperparameters:

Base estimator = DecisionTreeClassifier, max_depth = 4

n_estimators = 200

random_state = 42

learning_rate = 0.05


---









In [None]:
# AdaBoost Classification
# Enter your code here
# Base learner: a decision tree limited to depth 4.
base_est = DecisionTreeClassifier(max_depth=4)

# Boosted ensemble with the requested hyperparameters.
boost = AdaBoostClassifier(
    base_est,
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)
boost.fit(X_train, y_train)

# Evaluate on the held-out test split.
boost_pred = boost.predict(X_test)
boost_acc = accuracy_score(y_true=y_test, y_pred=boost_pred)

print('AdaBoost Accuracy =', boost_acc)

AdaBoost Accuracy = 0.9380952380952381


#### b) Calculate the top 3 important features for the above **AdaBoost** model and print them (3 marks)

In [None]:
# Top 3 features for AdaBoost
# Enter your code here

# Pair each training column name with the importance the fitted AdaBoost
# model assigned to it.
features = X_train.columns
boost_importances = boost.feature_importances_
boost_rawImportanceValues = pd.DataFrame(
    {'Features': features, 'Importance': boost_importances}
)

# Rank from most to least important and show the first three rows.
boost_importanceValues = boost_rawImportanceValues.sort_values(
    by='Importance', ascending=False
)
print(boost_importanceValues.head(3))

                  Features  Importance
3        Marginal Adhesion    0.307381
1  Uniformity of Cell Size    0.169565
7          Normal Nucleoli    0.126940


### 6. Voting : Using a voting classifier, build an ensemble of RandomForestClassifier, DecisionTreeClassifier, Support Vector Machine and Logistic Regression. (7 marks)


---


Use max_depth = 4, n_estimators = 200, voting = soft

In [None]:
# Voting Ensemble for Classification
# Enter your code here
# Every member is seeded so the ensemble (and the per-model comparison that
# reuses these objects) gives the same result on each run, consistent with
# the random_state=42 used for the other models in this notebook.
rfClf = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=42) # Random Forest
dtcClf = DecisionTreeClassifier(max_depth=4, random_state=42) # Decision Tree
# probability=True is required so the SVM can supply class probabilities for
# soft voting.
svmClf = SVC(probability=True, random_state=42) # Support Vector Machine
logClf = LogisticRegression(random_state=42) # Logistic Regression

# Soft voting combines the members' predicted class probabilities.
voting = VotingClassifier(
    estimators=[('rf', rfClf), ('dtc', dtcClf), ('svm', svmClf), ('log', logClf)],
    voting='soft',
)
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=4,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
           

### 7. Mention the best model among the above 4 models and its accuracy (1 mark)

In [None]:
# Write your answer here

# Each entry pairs the label to print with the estimator it describes.
individual_models = [
    ('Random Forest Accuracy =', rfClf),
    ('Decision Tree Accuracy =', dtcClf),
    ('Support Vector Machine Accuracy =', svmClf),
    ('Logistic Regression Accuracy =', logClf),
]

# fit individual models
for _, estimator in individual_models:
    estimator.fit(X_train, y_train)

# print accuracy scores of individual estimators; the final line carries an
# extra '\n' argument so a blank line separates it from the voting result
final_position = len(individual_models) - 1
for position, (label, estimator) in enumerate(individual_models):
    score = accuracy_score(y_test, estimator.predict(X_test))
    if position == final_position:
        print(label, score, '\n')
    else:
        print(label, score)

# predict and print accuracy of voting classifier
voting_pred = voting.predict(X_test)
print('Voting Accuracy =', accuracy_score(y_test, voting_pred))

# In the recorded run, Random Forest has the best accuracy of all the models.
# Both Random Forest and the Support Vector Machine score higher than the
# VotingClassifier; the Decision Tree and Logistic Regression score lower.

Random Forest Accuracy = 0.9619047619047619
Decision Tree Accuracy = 0.9476190476190476
Support Vector Machine Accuracy = 0.9571428571428572
Logistic Regression Accuracy = 0.9476190476190476 

Voting Accuracy = 0.9523809523809523
