In [1]:
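# Step 1: load the data
# Step 2: merge the tables
# Step 3: find the relationship between user_id and aisle
# Step 4: reduce dimensionality with PCA
# Step 5: cluster the users with KMeans
# Step 6: evaluate the clustering with the silhouette coefficient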

In [2]:
import pandas as pd

In [3]:
# Load the four Instacart CSV tables, listed in the order they are joined below.
aisles = pd.read_csv("./instacart/aisles.csv")                          # joined on aisle_id
products = pd.read_csv("./instacart/products.csv")                      # joined on aisle_id / product_id
order_products = pd.read_csv("./instacart/order_products__prior.csv")   # joined on product_id / order_id
orders = pd.read_csv("./instacart/orders.csv")                          # joined on order_id

In [4]:
# Merge the tables, starting with aisles and products.
# Inner join (the pandas default) on the shared aisle_id column. The key is
# named once: repeating it in the `on` list only adds a redundant second join key.
tab1 = pd.merge(aisles, products, on="aisle_id")

In [5]:
# Join order_products onto tab1 (inner join on product_id).
# The key is named once; a duplicated entry in `on` is just a redundant join key.
tab2 = pd.merge(tab1, order_products, on="product_id")

In [6]:
# Join orders onto tab2 (inner join on order_id); this brings in user_id.
# The key is named once; a duplicated entry in `on` is just a redundant join key.
tab3 = pd.merge(tab2, orders, on="order_id")

In [7]:
# Preview the first five rows of the fully merged table to check its columns.
tab3.head(n=5)

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0,114082,prior,26,0,20,1.0
1,1,prepared soups salads,209,Italian Pasta Salad,20,192465,2,1,119977,prior,2,0,16,3.0
2,1,prepared soups salads,209,Italian Pasta Salad,20,195206,18,1,1519,prior,7,2,9,5.0
3,1,prepared soups salads,209,Italian Pasta Salad,20,227717,1,1,161125,prior,7,2,11,11.0
4,1,prepared soups salads,209,Italian Pasta Salad,20,260072,13,0,12012,prior,5,5,11,11.0


In [8]:
# Cross-tabulate users against aisles: one row per user_id, one column per
# aisle, each cell holding how often that user/aisle pair occurs in tab3.
table = pd.crosstab(index=tab3["user_id"], columns=tab3["aisle"])

In [9]:
# The full table is too large to work with, so keep only its first 10000 rows
# (explicit positional slice).
data = table.iloc[:10000]

In [10]:
# Use PCA to reduce redundancy in the data: simpler data, less noise, better efficiency
from sklearn.decomposition import PCA

In [11]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (95% here).
transfer = PCA(n_components=0.95)  # a fractional value is the common choice
# Fit on the user-by-aisle counts and project them onto the kept components.
data_new = transfer.fit_transform(data)

In [12]:
data_new.shape  # far fewer columns after PCA, yet little information is lost

(10000, 42)

In [13]:
# There is no target value (unsupervised), so go straight to the estimator workflow
from sklearn.cluster import KMeans

In [14]:
# Partition the users into three clusters.
# KMeans starts from randomly chosen centroids, so pin random_state to make the
# cluster labels and any score computed from them reproducible across reruns.
estimator = KMeans(n_clusters=3, random_state=42)

In [15]:
estimator.fit(data_new) # train the model on the PCA-reduced data

In [18]:
# Cluster label for every row of data_new, as assigned by the fitted estimator.
y_predict = estimator.predict(data_new)

In [22]:
# Peek at the cluster labels of the first 300 rows.
y_predict[:300]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2,
       2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0,
       2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0,
       0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2], d

In [23]:
# Model evaluation - silhouette coefficient
from sklearn.metrics import silhouette_score

In [24]:
# The silhouette coefficient ranges from -1 to 1; the recorded run scored
# 0.5396819903993837, which is acceptable.
silhouette_score(X=data_new, labels=y_predict)

np.float64(0.5396819903993837)