-
Notifications
You must be signed in to change notification settings - Fork 22
/
shanghai.R
272 lines (219 loc) · 9.5 KB
/
shanghai.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# Install twitteR only if it is missing, instead of reinstalling on every run.
if (!requireNamespace("twitteR", quietly = TRUE)) install.packages('twitteR')
library(twitteR)
library(ggplot2)
# Search tweets for a keyword (#moyan): which users tweet it, what they say ----
# Observe which users used this keyword.
moyanTweets <- searchTwitter('#moyan', n = 1000)
# BUG FIX: the original converted `rdmTweets`, which is never defined in this
# script; the object returned by searchTwitter above is `moyanTweets`.
moyandf <- twListToDF(moyanTweets)
# Build a frequency table of screen names and keep users with > 5 tweets.
counts <- table(moyandf$screenName)
cc <- data.frame(subset(counts, counts > 5))
names(cc) <- 'value'
cc <- data.frame(value = cc$value, name = rownames(cc))
# Reorder user names by tweet count so the bar chart comes out sorted.
newname <- with(cc, reorder(name, value))
data <- data.frame(cc, newname)
# Horizontal bar chart of the most active users.
p <- ggplot(data, aes(y = value, x = newname))
p + geom_bar(stat = 'identity', aes(fill = value)) + coord_flip()
ggsave('1.png')
# simopieranni is an Italy-based journalist posted in Beijing; SCMP_news is the
# South China Morning Post.
# Study the content of the tweets with a word cloud.
library(tm)
library(wordcloud)
# Strip URLs, the "RT " prefix and @mentions from the tweet text with gsub.
moyantext <- moyandf$text
pattern <- "http[^ ]+|RT |@[^ ]+"
text <- gsub(pattern, "", moyantext)
# Build a corpus and a term-document matrix with the tm package.
# NOTE(review): with tm >= 0.6, plain functions like tolower must be wrapped in
# content_transformer() inside tm_map — confirm against the installed version.
tw.corpus <- Corpus(VectorSource(text))
tw.corpus <- tm_map(tw.corpus, stripWhitespace)
tw.corpus <- tm_map(tw.corpus, removePunctuation)
tw.corpus <- tm_map(tw.corpus,tolower)
tw.corpus <- tm_map(tw.corpus,removeWords,stopwords('english'))
doc.matrix <- TermDocumentMatrix(tw.corpus,control = list(minWordLength = 1))
dm <- as.matrix(doc.matrix)
# Per-term totals, sorted descending, as a word/frequency data frame.
v <- sort(rowSums(dm),decreasing=T)
d <- data.frame(word=names(v),freq=v)
# Drop the two most frequent words ("moyan" and "nobel", per the original
# author) and draw the final word cloud from the next 48 terms.
worddata <- d[3:50,]
worddata$word <- factor(worddata$word)
mycolors <- colorRampPalette(c("white","red4"))(200)
wc <-wordcloud(worddata$word,worddata$freq,min.freq=13,colors=mycolors[100:200])
# Sentiment analysis of these tweets ----
# Download the opinion lexicon (positive and negative word lists) from
# http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar and unpack it
# into the working directory first; lines starting with ';' are comments.
pos <- scan('positive-words.txt',what='character',comment.char=';')
neg <- scan('negative-words.txt',what='character',comment.char=';')
# score.sentiment: naive lexicon-based sentiment scoring.
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive sentiment terms.
#   neg.words: character vector of negative sentiment terms.
#   .progress: kept for backward compatibility with the original plyr-based
#     implementation (which forwarded it to laply); ignored by this base-R
#     rewrite, which needs neither plyr nor stringr.
#
# Returns:
#   A data.frame with columns `score` (count of positive-lexicon words minus
#   count of negative-lexicon words per sentence; positive => positive mood)
#   and `text` (the input sentences).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none')
{
  score_one <- function(sentence) {
    # Strip punctuation, control characters and digits before matching,
    # then lower-case so matching against the lexicon is case-insensitive.
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)
    # Split on runs of whitespace; unlist collapses the one-element list.
    words <- unlist(strsplit(sentence, '\\s+'))
    # %in% yields TRUE/FALSE which sum() treats as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }
  # vapply instead of laply: type-stable, no external dependency.
  scores <- vapply(sentences, score_one, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
# Finally, compare the tweets against the sentiment lexicons with
# score.sentiment and keep the result in `result`: `score` holds, per tweet,
# the count of positive-lexicon words minus negative-lexicon words. A positive
# score indicates positive mood, a negative score negative mood.
result <- score.sentiment(text,pos,neg)
sentscore <- result$score
# Frequency table of the discrete sentiment scores, for plotting.
scoretab <- as.data.frame(table(factor(sentscore)))
# Bar chart of score frequencies (axis labels are Chinese: "tweet sentiment
# score" / "frequency").
p <- ggplot(scoretab,aes(y=Freq,x=Var1))
p+geom_bar(stat='identity',aes(fill=Freq))+
labs(x='推文情绪值',y='频数')+
theme(legend.position='top')
ggsave('6.png')
# Analyze a user's tweets: extract and tabulate PM2.5 air-quality data ----
library(plyr)
# Fetch the latest air-quality tweets for Beijing and Shanghai.
airb <- userTimeline("beijingair", n=1000)
airs <- userTimeline("CGShanghaiAir", n=1000)
# Split each tweet's text on "; ", " (" or ")".
pattern <- '; | \\(|\\)'
extract <- function(x) {
  strsplit(x$text, pattern)
}
textb <- sapply(airb, extract)
texts <- sapply(airs, extract)
# One row per tweet; keep fields 1 (time), 4 (AQI value) and 5 (type).
datab <- ldply(textb, .fun=function(x) as.data.frame(t(x), stringsAsFactors=FALSE))
datab <- datab[, c(1, 4, 5)]
datas <- ldply(texts, .fun=function(x) as.data.frame(t(x), stringsAsFactors=FALSE))
datas <- datas[, c(1, 4, 5)]
# Combine both cities and coerce AQI to numeric.
data <- rbind(datab, datas)
names(data) <- c('time', 'AQI', 'type')
data$AQI <- as.numeric(data$AQI)
# City label per row (Chinese: Beijing / Shanghai). BUG FIX: the original used
# rep(1:2, each=1000), assuming exactly 1000 tweets per account; userTimeline
# may return fewer, which would misalign the labels. Use actual row counts.
city <- factor(rep(1:2, times = c(nrow(datab), nrow(datas))),
               labels = c('北京', '上海'))
data$city <- city
# Day-of-week variable (0 = Sunday) from the tweet date.
time.date <- as.Date(data$time, "%m-%d-%Y")
data$week <- as.POSIXlt(time.date)$wday
# Hour-of-day variable from characters 12-13 of the time string.
data$clock <- factor(substr(data$time, start=12, stop=13))
# Violin plot comparing air quality between the two cities ----
p1 <- ggplot(data, aes(city, AQI, fill=city))
p1 + geom_violin(alpha=0.3, width=0.3) +
  geom_jitter(alpha=0.3, shape=21) +
  # BUG FIX: geom_hline's parameter is `yintercept`, not `y`; draw reference
  # lines at the AQI category boundaries 100/200/300.
  geom_hline(yintercept=c(100,200,300), color='grey20', linetype=2) +
  theme(legend.position='none')
ggsave('3.png')
# How do the AQI distributions of Shanghai and Beijing compare?
p <- ggplot(data, aes(x=AQI, group=city))
p + geom_histogram(aes(fill=city, y=..density..),
                   alpha=0.5, color='black') +
  stat_density(geom='line', position='identity',
               size=0.9, alpha=0.5, color='black') +
  scale_fill_brewer(palette='Set3') +
  facet_wrap(~city, ncol=1) +
  theme(legend.position='none')
ggsave('4.png')
# Median air quality by day of week (Chinese labels: "weekday" / "AQI").
# aggregate(data$AQI,list(data$week),mean,na.rm=T)
p3 <- ggplot(data, aes(factor(week), AQI, colour=city, group=city))
p3 + stat_summary(fun.y = median, geom='line', size=1.2, aes(linetype=city)) +
  stat_summary(fun.y = median, geom='point', size=4, aes(shape=city)) +
  coord_cartesian(xlim=c(1,7)) +
  theme_bw() +
  theme(legend.position=c(0.9,0.9)) +
  labs(x='星期', y='AQI')
ggsave('5.png')
# Median air quality by hour of day.
p2 <- ggplot(data, aes(clock, AQI, colour=city, group=city))
p2 + stat_summary(fun.y = median, geom='line', size=1.2) +
  stat_summary(fun.y = median, geom='point', size=3) +
  coord_cartesian(xlim=c(3,26))
# Study a single user ----------------------------------------------------
# Fetch the user object and print its profile fields (these are fields of the
# twitteR user reference object, echoed interactively one per line).
myid <- getUser('xccds')
myid$name
myid$lastStatus
myid$description
myid$statusesCount
myid$followersCount
myid$friendsCount
myid$created
# Profile lady gaga's followers ----
ladygaga <- getUser('ladygaga')
follow.lady <- ladygaga$getFollowers(n=5000)
# One row per follower.
df.lady <- do.call('rbind', lapply(follow.lady, as.data.frame))
# Friends vs followers on log-log axes; point size = tweet count. Chinese
# labels: size legend "tweet count", x "friends", y "followers".
p <- ggplot(data=df.lady, aes(x=friendsCount, y=followersCount))
p + geom_point(aes(size=statusesCount), color='red4', alpha=0.5) +
  scale_x_log10(breaks=c(10,100,1000,5000)) +
  # BUG FIX: the original breaks were c(10,100,1000,1000,20000) — the
  # duplicated 1000 was almost certainly a typo for 10000.
  scale_y_log10(breaks=c(10,100,1000,10000,20000)) +
  scale_size('发推数', range=c(1,12)) +
  labs(x='关注对象', y='粉丝数') +
  theme_bw() +
  theme(legend.position=c(0.9,0.2))
ggsave('8.png')
# Account age as a Date and as a numeric for modelling.
df.lady$time <- as.Date(df.lady$created)
df.lady$ntime <- as.numeric(df.lady$time)
library(mgcv)
# GAM: smooth effects of friend count and tweet count on follower count.
model <- gam(followersCount~s(friendsCount)+s(statusesCount), data=df.lady)
par(mfrow=c(1,2))
plot(model, se=T)
# Twitter trends ----------------------------------------------------------
# Fetch yesterday's daily trends and echo them.
# NOTE(review): this is the old getTrends() signature; newer twitteR versions
# take a WOEID instead of 'daily'/date — confirm against installed version.
yesterdayTrends <- getTrends('daily', date=as.character(Sys.Date()-1))
yesterdayTrends
# Which devices/clients were the tweets posted from? ----------------------
# Extract the client string from each status object's source HTML.
# NOTE(review): depends on moyanTweets from the search section, and on stringr
# being attached (it is only loaded via require() inside score.sentiment) —
# confirm load order before running this section standalone.
sources <- sapply(moyanTweets, function(x) x$getStatusSource())
# Keep the text between the anchor tags, then drop the brackets themselves.
sources <- str_extract(sources, ">.+<")
sources <- str_replace_all(sources, ">|<", "")
# Collapse all BlackBerry client variants into one label.
sources <- str_replace(sources,'BlackBerry.+','Blackberry')
counts <- as.data.frame(table(sources))
counts <- subset(counts,Freq>20)
counts$sources <- factor(counts$sources)
# Bar chart of client frequencies (Chinese labels: y "frequency", x "device").
# NOTE(review): Freq/10 as a percentage assumes exactly 1000 tweets total;
# verify, or compute against the actual tweet count.
p <- ggplot(counts,aes(y=Freq,x=sources))
p + geom_bar(aes(fill=Freq),color='black',stat='identity') +
scale_x_discrete(limits = levels(counts$sources)[order(counts$Freq)])+
geom_text(aes(y=Freq+40,label=paste(Freq/10,'%',sep='')))+
coord_flip() + theme(legend.position='none')+
labs(y='频数',x='设备')
ggsave('7.png')
# Collect 1000 tweets from user xccds ----
library(twitteR)
xccds <- userTimeline("xccds", n=1000)
xccds[[1]]$getCreated()
# Extract the creation timestamp from a status object.
extracttime <- function(x) {
  return(x$getCreated())
}
xccds.time <- lapply(xccds, extracttime)
library(lubridate)
# Convert timestamps to the local time zone. BUG FIX: Olson time-zone IDs are
# case-sensitive; 'asia/shanghai' is not a valid zone name and with_tz would
# fall back with a warning — the correct ID is 'Asia/Shanghai'.
timefunc <- function(x) {
  return(with_tz(x, tzone='Asia/Shanghai'))
}
xtime <- ldply(xccds.time, .fun=timefunc)
# Hour-of-day and day-of-week factors for tabulation.
xtime$hour <- factor(hour(xtime$V1))
xtime$week <- factor(wday(xtime$V1))
xtimedf <- as.data.frame(table(xtime$hour))
# Bar chart of tweets per hour (Chinese labels: x "hour", y "frequency").
p <- ggplot(xtimedf, aes(x=Var1, y=Freq))
p + geom_bar(stat='identity', aes(fill=Freq)) +
  theme(legend.position='none') +
  labs(x='时刻', y='频数')
ggsave('10.png')