In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../datasets/Cryotherapy.csv')
df.head(5)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
0,1,35,12.0,5,1,100,0
1,1,29,7.0,5,1,96,1
2,1,50,8.0,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [4]:
from sklearn.model_selection import train_test_split

feature_columns = ['sex', 'age', 'Time', 'Number_of_Warts', 'Type', 'Area']
X = df[feature_columns]
y = df['Result_of_Treatment']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# HARD VOTING

In [6]:
estimators = []


dt_model = DecisionTreeClassifier(random_state=1)
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(random_state=1)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression(random_state=1)
estimators.append(('LogisticRegression', logit_model))

In [7]:
# dt_model.fit(X_train, y_train)
# svm_model.fit(X_train, y_train)
# logit_model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score

for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, y_train)
    y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(y_test, y_pred))

DecisionTreeClassifier 0.8333333333333334
SVC 0.4444444444444444
LogisticRegression 0.9444444444444444


In [9]:
ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
print("Classifier Accuracy using Hard Voting ", accuracy_score(y_test, y_pred))

Classifier Accuracy using Hard Voting  0.8333333333333334


In [10]:
print(len(estimators))

3


# SOFT VOTING

In [11]:
estimators = []

dt_model = DecisionTreeClassifier(random_state=1)
estimators.append(("DecisionTree", dt_model))

svm_model = SVC(random_state=1, probability=True)
estimators.append(("supportVector", svm_model))

logit_model = LogisticRegression(random_state=1)
estimators.append(("LogisticRegression", logit_model))



In [12]:
for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, y_train)
    y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(y_test, y_pred))

DecisionTreeClassifier 0.8333333333333334
SVC 0.4444444444444444
LogisticRegression 0.9444444444444444


In [13]:
ensemble_model = VotingClassifier(estimators=estimators, voting='soft')
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
print("Classifier Accuracy using Soft Voting: ", accuracy_score(y_test, y_pred))

Classifier Accuracy using Soft Voting:  0.8888888888888888


# AVERAGING

In [14]:
df = pd.read_csv('../datasets/whitewines.csv')
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.62,0.24,1.1,0.039,6.0,62.0,0.9934,3.41,0.32,10.4,5
1,5.7,0.22,0.2,16.0,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6
2,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6
3,5.3,0.47,0.1,1.3,0.036,11.0,74.0,0.99082,3.48,0.54,11.2,4
4,6.4,0.29,0.21,9.65,0.041,36.0,119.0,0.99334,2.99,0.34,10.933333,6


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [16]:
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',\
                   'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',\
                   'density', 'pH', 'sulphates', 'alcohol']
X = df[feature_columns]
y = df['quality']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

In [18]:
linreg_model = LinearRegression()
svr_model = SVR()
regression_tree = DecisionTreeRegressor()

In [19]:
linreg_model.fit(X_train, y_train)
svr_model.fit(X_train, y_train)
regression_tree.fit(X_train, y_train)

DecisionTreeRegressor()

In [20]:
linreg_pred = linreg_model.predict(X_test)
svr_model_pred = svr_model.predict(X_test)
regression_tree_pred = regression_tree.predict(X_test)

In [21]:
average_pred = (linreg_pred + svr_model_pred + regression_tree_pred) / 3

In [22]:
print("Classifier Accuracy using Averaging: ", accuracy_score(y_test, [int(round(i)) for i in  average_pred]))

Classifier Accuracy using Averaging:  0.5741496598639456


# WEIGHTED AVERAGING

In [23]:
df = pd.read_csv('../datasets/wisc_bc_data.csv')
df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [24]:
X = df.iloc[:, 2: 32]
y = df['diagnosis']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

In [26]:
estimators = []

dt_model = DecisionTreeClassifier()
estimators.append(("DecisionTree", dt_model))

svm_model = SVC(probability=True)
estimators.append(("SupportVector", svm_model))

logit_model = LogisticRegression()
estimators.append(("LogisticRegression", logit_model))

In [27]:
dt_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
logit_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [28]:
dt_model_pred = dt_model.predict_proba(X_test)
svm_model_pred = svm_model.predict_proba(X_test)
logit_model_pred = logit_model.predict_proba(X_test)

In [29]:
weighted_avg_pred = (dt_model_pred * .3 + svm_model_pred * .4 + logit_model_pred * .3)

In [30]:
y_pred = pd.Series(['B' if np.argmax(i) == 0 else 'M' for i in weighted_avg_pred])

In [31]:
print("Classifier Accuracy using Weighted average: ", accuracy_score(y_test, y_pred))

Classifier Accuracy using Weighted average:  0.9473684210526315


In [32]:
dt_model_pred = pd.Series(['B' if np.argmax(i) == 0 else 'M' for i in dt_model_pred])
svm_model_pred = pd.Series(['B' if np.argmax(i) == 0 else 'M' for i in svm_model_pred])
logit_model_pred = pd.Series(['B' if np.argmax(i) == 0 else 'M' for i in logit_model_pred])

In [33]:
print("Individual Accuracy of Decision Tree: ", accuracy_score(y_test, dt_model_pred))
print("Individual Accuracy of Support Vector: ", accuracy_score(y_test, svm_model_pred))
print("Individual Accuracy of Logistic Regression : ", accuracy_score(y_test, logit_model_pred))


Individual Accuracy of Decision Tree:  0.9415204678362573
Individual Accuracy of Support Vector:  0.9239766081871345
Individual Accuracy of Logistic Regression :  0.9532163742690059
