In [None]:
# 變數前處理

%run preprocessing.py
print ("shape of usersXprod :", usersXprod.shape)

p = pd.read_csv("products.csv")
p = p.drop(["product_name"], axis=1)
usersXprod = pd.merge(usersXprod, p, how='left', on=['product_id'])


In [None]:
# Preview the first five rows of usersXprod.
usersXprod.head(n=5)

In [None]:
# Partition the rows according to their eval_set label.
is_train = usersXprod["eval_set"] == 'train'
is_test = usersXprod["eval_set"] == 'test'
train = usersXprod[is_train]
test = usersXprod[is_test]

# Report how many rows ended up in each partition.
print("size of train :", len(train))
print("size of test :", len(test))

In [None]:
# Feature matrix: every training column except the identifiers, the split
# marker and the target.
X = train.drop(['user_id','product_id', 'order_id', 'eval_set', 'reordered'], axis=1)
# Target, kept as a single-column DataFrame.
y = train[["reordered"]]

print ("size of features :", X.shape[1])
# Count the distinct label values. Iterating a DataFrame yields its column
# labels, so len(set(y)) would always report 1 here no matter what the
# labels are. nunique() does not count missing values.
print ("size of class :", y["reordered"].nunique())

In [None]:
# inline plotting instead of popping out
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')

sns.pairplot(train, hue="reordered", size=1)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardize each feature column (z-scores) with a scaler fitted on X itself.
sc = StandardScaler()
Z = sc.fit(X).transform(X)

# Correlation matrix estimate: Z^T Z divided by the number of training rows.
n_rows = train.shape[0]
P = Z.T.dot(Z) / n_rows

# Temporarily enlarge fonts for the heatmap of P.
sns.set(font_scale=1.5)
hm = sns.heatmap(data=P, square=True, cbar=True)

plt.tight_layout()
plt.show()

# Undo the seaborn styling applied above.
sns.reset_orig()

In [None]:
# Eigen-decomposition of P (eigh treats its input as symmetric).
eigen_vals, eigen_vecs = np.linalg.eigh(P)

# Fraction of the total absolute eigenvalue mass carried by each component,
# ordered from largest to smallest, together with its running total.
abs_vals = np.abs(eigen_vals)
tot = sum(abs_vals)
var_exp = [val / tot for val in sorted(abs_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Bars show the individual ratios; the step line shows the cumulative ratio.
component_idx = range(1, eigen_vals.size + 1)
plt.bar(component_idx, var_exp, alpha=0.5, align='center', label='Individual')
plt.step(component_idx, cum_var_exp, where='mid', label='Cumulative')
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.tight_layout()
plt.show()


In [None]:
# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]

# Sort the tuples from high to low on the eigenvalue only. Sorting the bare
# tuples falls back to comparing the eigenvector arrays whenever two
# eigenvalues are equal, which raises ValueError (ambiguous truth value).
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Projection matrix: the two leading eigenvectors stacked as columns.
W = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis]))


In [None]:
# Project the standardized data onto the columns of W.
Z_pca = Z.dot(W)

# Flatten the labels to 1-D before building masks. When y is a single-column
# DataFrame, y.values has shape (n, 1); a 2-D boolean mask followed by a
# column index is one index too many for the 2-D Z_pca array (IndexError).
labels = y.values.ravel()

colors = ['r', 'b']
markers = ['s', 'x']
# One scatter series per distinct label value, each with its own colour/marker.
for l, c, m in zip(np.unique(labels), colors, markers):
    mask = labels == l
    plt.scatter(Z_pca[mask, 0], Z_pca[mask, 1], c=c, label=l, marker=m)

plt.title('Z_pca')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()
