In [6]:
install.packages(c("tidyverse", "tm", "cluster", "factoextra"))

还安装依赖关系‘rbibutils’, ‘Deriv’, ‘microbenchmark’, ‘Rdpack’, ‘doBy’, ‘SparseM’, ‘MatrixModels’, ‘minqa’, ‘nloptr’, ‘reformulas’, ‘RcppEigen’, ‘later’, ‘carData’, ‘Formula’, ‘pbkrtest’, ‘quantreg’, ‘lme4’, ‘httpuv’, ‘promises’, ‘estimability’, ‘numDeriv’, ‘corrplot’, ‘viridis’, ‘car’, ‘DT’, ‘ellipse’, ‘emmeans’, ‘flashClust’, ‘leaps’, ‘multcompView’, ‘scatterplot3d’, ‘ggsci’, ‘cowplot’, ‘ggsignif’, ‘gridExtra’, ‘polynom’, ‘rstatix’, ‘plyr’, ‘NLP’, ‘slam’, ‘BH’, ‘dendextend’, ‘FactoMineR’, ‘ggpubr’, ‘reshape2’, ‘ggrepel’





下载的二进制程序包在
	/var/folders/ty/cy038g3n0b76wc7182ktv28r0000gn/T//Rtmp3YVqLY/downloaded_packages里


In [1]:
# 安装并加载必要的包
library(tidyverse)
library(tm)
library(cluster)
library(factoextra)


── [1mAttaching core tidyverse packages[22m ───────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
载入需要的程序包：NLP


载入程序包：‘NLP’


The following object is masked fro

In [2]:
# 加载数据
citibike_1 <- read.csv("./data/2019-citibike-tripdata/10_October/201910-citibike-tripdata_1.csv")
citibike_2 <- read.csv("./data/2019-citibike-tripdata/10_October/201910-citibike-tripdata_2.csv")
citibike_3 <- read.csv("./data/2019-citibike-tripdata/10_October/201910-citibike-tripdata_3.csv")

# 合并数据
citibike_oct <- bind_rows(citibike_1, citibike_2, citibike_3)

In [3]:
# 加载 janitor 包（如果尚未安装）
# install.packages("janitor")
library(janitor)

# 清理列名
citibike_oct <- citibike_oct %>%
  clean_names()  # 将列名转换为 snake_case 格式

# 查看清理后的列名
colnames(citibike_oct)



载入程序包：‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test




In [4]:
library(dplyr)
library(tidyr)

# 转换时间格式
citibike_oct <- citibike_oct %>%
  mutate(starttime = as.POSIXct(starttime, format="%Y-%m-%d %H:%M:%S"),
         stoptime = as.POSIXct(stoptime, format="%Y-%m-%d %H:%M:%S"),
         start_hour = as.POSIXct(format(starttime, "%Y-%m-%d %H:00:00"), format="%Y-%m-%d %H:%M:%S"),
         stop_hour = as.POSIXct(format(stoptime, "%Y-%m-%d %H:00:00"), format="%Y-%m-%d %H:%M:%S"))

# 计算每个站点的每小时进站和出站频率
station_hourly <- citibike_oct %>%
  group_by(start_station_id, start_hour) %>%
  summarise(egress_count = n(), .groups = "drop") %>%
  rename(station_id = start_station_id, time_hour = start_hour) %>%
  bind_rows(
    citibike_oct %>%
      group_by(end_station_id, stop_hour) %>%
      summarise(ingress_count = n(), .groups = "drop") %>%
      rename(station_id = end_station_id, time_hour = stop_hour)
  ) %>%
  group_by(station_id, time_hour) %>%
  summarise(egress = sum(egress_count, na.rm = TRUE),
            ingress = sum(ingress_count, na.rm = TRUE), .groups = "drop")

# 填补时间序列的空缺值
station_hourly <- station_hourly %>%
  complete(station_id, time_hour = seq(min(time_hour), max(time_hour), by = "hour"),
           fill = list(egress = 0, ingress = 0))


In [5]:
# 添加星期和小时信息
station_hourly <- station_hourly %>%
  mutate(weekday = weekdays(as.Date(time_hour)),
         hour = as.numeric(format(as.POSIXct(time_hour), "%H"))) %>%
  group_by(station_id, weekday, hour) %>%
  summarise(egress = sum(egress, na.rm = TRUE),
            ingress = sum(ingress, na.rm = TRUE)) %>%
  ungroup()

# 转换为宽表格式
weekly_profile <- station_hourly %>%
  pivot_wider(names_from = c(weekday, hour), values_from = c(egress, ingress), values_fill = 0)


[1m[22m`summarise()` has grouped output by 'station_id', 'weekday'. You can override using the `.groups` argument.


In [6]:
write.csv(weekly_profile, "weekly_profile.csv", row.names = FALSE)

In [None]:
# 计算TF-IDF权重
# TF-IDF 函数实现
tf_idf <- function(x, n_docs) {
  if (sum(x) == 0) {
    # 如果列全为零，返回全零向量
    return(rep(0, length(x)))
  }
  tf <- x / sum(x)  # 计算词频
  idf <- log(n_docs / (1 + sum(x > 0)))  # 计算逆文档频率
  tf * idf  # 计算TF-IDF
}

n_docs <- nrow(weekly_profile)

# 提取特征矩阵
tfidf_matrix <- weekly_profile %>%
  select(-station_id) %>%
  as.matrix()

# 应用TF-IDF
tfidf_result <- apply(tfidf_matrix, 1, function(row) {
  result <- tf_idf(row, n_docs)
  if (length(result) != ncol(tfidf_matrix)) {
    stop("The length of the result from tf_idf does not match the number of features.")
  }
  return(result)
})

# 转置矩阵，使行对应站点，列对应特征
tfidf_result <- t(tfidf_result)

# 将结果转换为数据框
weekly_profile_tfidf <- as.data.frame(tfidf_result)

# 添加站点ID
weekly_profile_tfidf$station_id <- weekly_profile$station_id

# 提取 station_id 列
station_id <- weekly_profile_tfidf$station_id

# 计算余弦相似性
cosine_similarity <- function(x) {
  x <- as.matrix(x)  # 确保输入是矩阵
  sim <- x %*% t(x) / (sqrt(rowSums(x^2) %*% t(rowSums(x^2))))
  return(sim)
}

distance_matrix <- as.dist(1 - cosine_similarity(weekly_profile_tfidf[,-ncol(weekly_profile_tfidf)]))

# 分层k均值聚类
set.seed(123)
hkmeans_result <- hkmeans(weekly_profile_tfidf[,-ncol(weekly_profile_tfidf)], k = 4)

# 将聚类结果添加到数据中
weekly_profile_tfidf$cluster <- hkmeans_result$cluster

# 创建最终数据框，只保留 station_id 和 cluster 列
result <- data.frame(station_id = station_id, cluster = weekly_profile_tfidf$cluster)

# 查看结果
print(result)


In [None]:
hkmeans_result

In [13]:
str(hkmeans_result)


List of 11
 $ cluster     : int [1:848] 1 1 1 1 1 1 1 1 1 1 ...
 $ centers     : num [1:4, 1:336] 0.000609 0 0 0 0.000314 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:4] "1" "2" "3" "4"
  .. ..$ : chr [1:336] "egress_星期一_0" "egress_星期一_1" "egress_星期一_2" "egress_星期一_3" ...
 $ totss       : num 320
 $ withinss    : num [1:4] 210 0 0 0
 $ tot.withinss: num 210
 $ betweenss   : num 110
 $ size        : int [1:4] 845 1 1 1
 $ iter        : int 1
 $ ifault      : int 0
 $ data        :'data.frame':	848 obs. of  336 variables:
  ..$ egress_星期一_0  : num [1:848] 0 0.000191 0.000395 0.00182 0.000305 ...
  ..$ egress_星期一_1  : num [1:848] 0.000419 0.000381 0.000395 0 0.000122 ...
  ..$ egress_星期一_2  : num [1:848] 1.4e-04 0.0 0.0 0.0 6.1e-05 ...
  ..$ egress_星期一_3  : num [1:848] 0.000279 0 0.000395 0 0.000122 ...
  ..$ egress_星期一_4  : num [1:848] 0.000419 0 0 0.00026 0 ...
  ..$ egress_星期一_5  : num [1:848] 0.000698 0.000191 0 0 0.000366 ...
  ..$ egress_星期一_6  : num [1:848] 0.001116 0.

In [10]:
# 提取 station_id 列
station_id <- weekly_profile_tfidf$station_id

# 计算余弦相似性
cosine_similarity <- function(x) {
  x <- as.matrix(x)  # 确保输入是矩阵
  sim <- x %*% t(x) / (sqrt(rowSums(x^2) %*% t(rowSums(x^2))))
  return(sim)
}

distance_matrix <- as.dist(1 - cosine_similarity(weekly_profile_tfidf[,-1]))

# 分层k均值聚类
set.seed(123)
hkmeans_result <- hkmeans(weekly_profile_tfidf[,-1], k = 4)

# 将聚类结果添加到数据中
weekly_profile_tfidf$cluster <- hkmeans_result$cluster

# 确保 cbind 的结果是数据框
weekly_profile_tfidf <- data.frame(station_id = station_id, cluster = weekly_profile_tfidf$cluster)


ERROR: Error in weekly_profile_tfidf$station_id: $ operator is invalid for atomic vectors


In [8]:
weekly_profile_tfidf <- as.data.frame(do.call(cbind, weekly_profile_tfidf))

ERROR: Error in do.call(cbind, weekly_profile_tfidf): 第二个参数必需为列表


In [9]:
# 提取 cluster 列
cluster <- weekly_profile_tfidf$cluster

# 组合 station_id 和 cluster 列
result <- data.frame(station_id = station_id, cluster = cluster)


ERROR: Error in weekly_profile_tfidf$cluster: $ operator is invalid for atomic vectors


In [10]:
colnames(weekly_profile_tfidf)

In [None]:
# 绘制热力图
heatmap_data <- weekly_profile_tfidf %>%
  group_by(cluster) %>%
  summarise(across(starts_with("V"), mean))

In [23]:
# 将聚类结果与特征数据合并
heatmap_data <- weekly_profile_tfidf %>%
  select(-cluster) %>%  # 去掉原来的 cluster 列，避免冲突
  inner_join(result, by = "station_id")  # 合并聚类结果

# 将热力图数据转换为矩阵
heatmap_matrix <- as.matrix(heatmap_data[,-c(1, ncol(heatmap_data))])  # 去掉 station_id 和 cluster 列

# 绘制热力图，按 cluster 给行分组着色
heatmap(heatmap_matrix, Rowv = as.dendrogram(hclust(dist(heatmap_matrix))), 
        Colv = NA, scale = "column", 
        col = heat.colors(256), margins = c(5, 10))


ERROR: Error: 找不到对象'result'


In [14]:
nrow(hkmeans_result)  # 查看行数
ncol(hkmeans_result)  # 查看列数


NULL

NULL

In [15]:
str(hkmeans_result)

List of 11
 $ cluster     : Named int [1:336] 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "names")= chr [1:336] "egress_星期一_0" "egress_星期一_1" "egress_星期一_2" "egress_星期一_3" ...
 $ centers     : num [1:4, 1:848] 0.00285 0.00558 0.00684 0.00656 0.00306 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:4] "1" "2" "3" "4"
  .. ..$ : chr [1:848] "" "" "" "" ...
 $ totss       : num 321
 $ withinss    : num [1:4] 211 0 0 0
 $ tot.withinss: num 211
 $ betweenss   : num 110
 $ size        : int [1:4] 333 1 1 1
 $ iter        : int 1
 $ ifault      : int 0
 $ data        : num [1:336, 1:848] 0 0.000419 0.00014 0.000279 0.000419 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:336] "egress_星期一_0" "egress_星期一_1" "egress_星期一_2" "egress_星期一_3" ...
  .. ..$ : chr [1:848] "" "" "" "" ...
 $ hclust      :List of 7
  ..$ merge      : int [1:335, 1:2] -28 -76 -173 -196 -244 -29 -4 -3 -75 -122 ...
  ..$ height     : num [1:335] 0.0127 0.015 0.0154 0.017 0.0182 ...
  ..$ order      : int [1:336]

In [17]:
cluster_assignments <- hkmeans_result$cluster
head(cluster_assignments)

In [19]:
head(weekly_profile)

station_id,egress_星期一_0,egress_星期一_1,egress_星期一_2,egress_星期一_3,egress_星期一_4,egress_星期一_5,egress_星期一_6,egress_星期一_7,egress_星期一_8,⋯,ingress_星期日_14,ingress_星期日_15,ingress_星期日_16,ingress_星期日_17,ingress_星期日_18,ingress_星期日_19,ingress_星期日_20,ingress_星期日_21,ingress_星期日_22,ingress_星期日_23
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
72,0,3,1,2,3,5,8,29,89,⋯,33,32,26,36,19,17,11,8,8,5
79,1,2,0,0,0,1,5,16,28,⋯,23,12,14,22,12,24,11,3,3,2
82,1,1,0,1,0,0,3,15,16,⋯,8,3,19,14,4,6,6,1,2,2
83,7,0,0,0,1,0,0,1,7,⋯,14,15,23,19,10,8,5,3,1,3
116,5,2,1,2,0,6,24,60,120,⋯,17,37,44,36,39,36,27,34,20,14
119,0,1,2,0,0,3,5,6,5,⋯,1,3,7,4,11,2,0,4,0,0


In [20]:
head(hkmeans_result)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,⋯,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
1,0.002850902,0.003061431,0.003386343,0.003141341,0.002803691,0.003400709,0.003156891,0.002929704,0.002853337,0.003053237,⋯,0.00704999,0.007054338,0.01365062,0.0,0.006523785,0.004741602,0.01116869,0.005446673,0.009742396,0.0070155
2,0.005581952,0.004004599,0.003945599,0.007800646,0.003540635,0.005837299,0.004454427,0.002339548,0.003080373,0.001713105,⋯,0.0237136,0.0,0.0,0.0,0.0,0.028708248,0.0,0.013238994,0.0,0.0
3,0.006837891,0.008009199,0.010653117,0.004940409,0.006409771,0.009728832,0.005939236,0.007320521,0.006964322,0.004711039,⋯,0.0711408,0.0,0.0,0.0,0.016583362,0.00574165,0.41324146,0.033097486,0.202763625,0.0
4,0.006558793,0.007627808,0.005918398,0.003900323,0.00628768,0.003891533,0.007918981,0.005056442,0.005022348,0.005995868,⋯,0.0237136,0.049980734,0.0,6.049733,0.0,0.011483299,0.41324146,0.0,0.40552725,0.0


In [21]:
write.csv(weekly_profile, "weekly_profile.csv", row.names = FALSE)

In [15]:
head(tfidf_matrix)

egress_星期一_0,egress_星期一_1,egress_星期一_2,egress_星期一_3,egress_星期一_4,egress_星期一_5,egress_星期一_6,egress_星期一_7,egress_星期一_8,egress_星期一_9,⋯,ingress_星期日_14,ingress_星期日_15,ingress_星期日_16,ingress_星期日_17,ingress_星期日_18,ingress_星期日_19,ingress_星期日_20,ingress_星期日_21,ingress_星期日_22,ingress_星期日_23
0,3,1,2,3,5,8,29,89,67,⋯,33,32,26,36,19,17,11,8,8,5
1,2,0,0,0,1,5,16,28,17,⋯,23,12,14,22,12,24,11,3,3,2
1,1,0,1,0,0,3,15,16,18,⋯,8,3,19,14,4,6,6,1,2,2
7,0,0,0,1,0,0,1,7,20,⋯,14,15,23,19,10,8,5,3,1,3
5,2,1,2,0,6,24,60,120,119,⋯,17,37,44,36,39,36,27,34,20,14
0,1,2,0,0,3,5,6,5,4,⋯,1,3,7,4,11,2,0,4,0,0


In [16]:
head(distance_matrix)