In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import sklearn
import seaborn
# Accuracy is the single evaluation metric used for the ensemble in this notebook.
# A bare `import sklearn` does not by itself load the `metrics` submodule, so it is
# imported explicitly here instead of relying on another import pulling it in.
import sklearn.metrics
metric = sklearn.metrics.accuracy_score

In [None]:

#Load data

data = pd.read_csv('spam.csv',encoding='latin-1')
# Encode labels as ham = +1, spam = -1. The ensemble code further down scales
# predictions towards [-1/N, 1/N] and thresholds with np.sign, so the weak
# learners must be trained on +/-1 labels; with a 0/1 encoding a -1 prediction
# could never match a spam label.
data.loc[data['v1'].eq('ham'),'v1'] = 1
data.loc[data['v1'].eq('spam'),'v1'] = -1
data['v1']=data['v1'].astype('float64')


# Balance dataset
# Draw the same number of rows from each class without replacement.
# NOTE(review): 747 is presumably the number of spam rows in spam.csv - confirm;
# sampling raises if either class has fewer rows than this.
n = 747
# `.loc` replaces `.ix`, which was removed from pandas.
sample_yes = data.loc[data.v1 == 1].sample(n=n, replace=False, random_state=0)
sample_no = data.loc[data.v1 == -1].sample(n=n, replace=False, random_state=0)
data = pd.concat([sample_yes, sample_no])
# Shuffle with a fixed seed so the positional train/test split is reproducible.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)



In [None]:

# Positional split of the shuffled frame: the first 500 rows are used for
# training, every remaining row is held out for testing.
train_data = data.iloc[:500]
test_data = data.iloc[500:]


In [None]:
# Vectorize the text

# The TF-IDF vocabulary is fitted on the training rows only; the test rows are
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['v2'])
X_test = vectorizer.transform(test_data['v2'])

# Matching label columns for each split.
y_train = train_data['v1']
y_test = test_data['v1']


In [None]:
from sklearn.linear_model import Perceptron

# Three weak learners are fitted on the same TF-IDF training matrix. Each one is
# given a fixed seed so that a fresh "Restart & Run All" reproduces the same
# models (and therefore the same QUBO coefficients downstream).
model_1 = Perceptron(max_iter=1000, tol=1e-3, random_state=0)
model_1.fit(X_train, y_train)


model_2 = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=0)
model_2.fit(X_train, y_train)


# AdaBoostClassifier is already imported in the notebook's first cell.
model_3 = AdaBoostClassifier(n_estimators=3, random_state=0)
model_3.fit(X_train, y_train)


In [None]:
# Define Ensemble

#ensemble = RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(vectorized_text, train_data.v1)
ensemble = [model_1,model_2,model_3]
n_models = len(ensemble)

# One row of training-set predictions per weak learner, divided by the number of
# learners N. The scaled values lie in [-1/N, 1/N] when the learners emit +/-1.
predictions = np.array([learner.predict(X_train) for learner in ensemble]) / n_models

# Constant added to every diagonal QUBO coefficient further down.
λ = 1


In [None]:
# Define coefficients

# Off-diagonal entries: pairwise inner products of the scaled prediction rows.
w = predictions @ predictions.T
# Diagonal entries are overwritten with: (number of training rows) / N^2 + λ
# minus twice the inner product of each prediction row with the labels.
wii = X_train.shape[0] / (n_models ** 2) + λ - 2 * predictions @ y_train
w[np.diag_indices_from(w)] = wii
# Keep the upper triangle (diagonal included) as an {(i, j): value} mapping.
W = {(i, j): w[i, j] for i in range(n_models) for j in range(i, n_models)}

In [None]:
import dimod
#sampler = dimod.SimulatedAnnealingSampler()
#response = sampler.sample_qubo(W, num_reads=100)
#weights = list(response.first.sample.values())

In [None]:
# Create Ising model

from qiskit.quantum_info import Pauli
from qiskit.aqua import Operator

# Convert the QUBO mapping into linear terms h, pairwise terms J and an offset.
h, J, offset = dimod.qubo_to_ising(W)

num_nodes = len(w)
pauli_list = []
for i in range(num_nodes):
    # Single-node term weighted by h[i]: first Pauli vector is one-hot at i,
    # second is all zeros.
    # NOTE(review): presumably the first argument is the Z mask and the second
    # the X mask, making this a Z operator on node i - confirm for the
    # installed qiskit version.
    vp = np.zeros(num_nodes)
    vp[i] = 1
    pauli_list.append([h[i], Pauli(vp, np.zeros(num_nodes))])
    for j in range(i+1, num_nodes):
        # Pairs with a zero coefficient contribute no coupling term.
        if w[i, j] == 0:
            continue
        # Two-node term weighted by J[i, j]: first vector marks both i and j.
        vp = np.zeros(num_nodes)
        vp[[i, j]] = 1
        pauli_list.append([J[i, j], Pauli(vp, np.zeros(num_nodes))])
ising_model = Operator(paulis=pauli_list)

In [None]:
# Optimize Ising model
# Runs QAOA on `ising_model` with a COBYLA classical optimizer and stores the
# outcome in `result`, which the weight-decoding cell reads via result['eigvecs'].

from qiskit.aqua import get_aer_backend, QuantumInstance
from qiskit.aqua.algorithms import QAOA
from qiskit.aqua.components.optimizers import COBYLA
# Third positional argument to QAOA.
# NOTE(review): presumably the number of QAOA layers - confirm against the
# installed qiskit.aqua version.
p = 1
optimizer = COBYLA()
qaoa = QAOA(ising_model, optimizer, p, operator_mode='matrix')
# NOTE(review): no random seed is passed anywhere in this cell; confirm whether
# the run is deterministic if reproducibility matters.
backend = get_aer_backend('statevector_simulator')
# NOTE(review): `shots` is presumably irrelevant for a statevector backend - verify.
quantum_instance = QuantumInstance(backend, shots=100)
result = qaoa.run(quantum_instance)

In [None]:
def predict(models, weights, X):
    """Combine the weighted predictions of several models into one sign vote.

    Parameters
    ----------
    models : sequence of objects exposing ``predict(X)``
    weights : indexable of numbers, one weight per model (same order)
    X : array-like with a ``shape`` attribute; ``X.shape[0]`` is the row count

    Returns
    -------
    numpy.ndarray
        ``np.sign`` of each row's weighted vote after subtracting a common
        threshold, so entries are -1, 0 or +1.
    """
    sample_count = X.shape[0]
    votes = np.zeros(sample_count)
    vote_total = 0
    for position, model in enumerate(models):
        # Weighted prediction of one weak classifier.
        contribution = weights[position] * model.predict(X)
        votes += contribution
        vote_total += np.sum(contribution)

    # Threshold: the summed weighted votes averaged over rows and models.
    threshold = vote_total / (sample_count * len(models))
    return np.sign(votes - threshold)

In [None]:
# Get optimized weights of estimators
# The candidate solution is the basis state with the highest probability
# |amplitude|^2 in the optimized state vector. Taking argmin over the raw
# (complex) amplitudes would select an arbitrary low-amplitude state instead.
k = int(np.argmax(np.abs(result['eigvecs'][0]) ** 2))
# Unpack the basis-state index bit by bit, least-significant bit first, into
# one 0/1 weight per estimator.
# NOTE(review): a measured bit of 1 usually corresponds to spin -1, which the
# QUBO->Ising convention may map to binary 0; the weights might need to be
# `1 - bit` - confirm against the dimod/qiskit conventions in use.
weights = np.zeros(num_nodes)
for i in range(num_nodes):
    weights[i] = k % 2
    k >>= 1

In [None]:
#vectorized_test = vectorizer.transform(test_data.v2)
# Score the weighted ensemble on the held-out rows, then on the training rows.
test_accuracy = metric(y_test, predict(ensemble, weights, X_test))
print('accuracy (test): %5.2f'%(test_accuracy))
train_accuracy = metric(y_train, predict(ensemble, weights, X_train))
print('accuracy (train): %5.2f'%(train_accuracy))
#print('accuracy (test): %5.2f'%(metric( test_data.v1, predict(ensemble, weights, vectorized_text))))