-
Notifications
You must be signed in to change notification settings - Fork 5
/
scrape.coffee
executable file
·177 lines (145 loc) · 6.24 KB
/
scrape.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# fb-scraper
#
# IMPORTANT: run casperjs with --ignore-ssl-errors=yes --cookies-file=cookies.txt
# Modules used by the script. `system`, `fs` and `casper` are expected to be
# resolved by the CasperJS/PhantomJS runtime this script is launched with.
_env = require("system").env
fs = require("fs")
casper = require("casper").create({
  # verbose: true, # useful for debug
  logLevel: "debug",
  userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22",
  pageSettings: { # Save resources (set to true if you'd like to take screenshots)
    loadImages: false,
    loadPlugins: false
  }
})
cheerio = require("cheerio") # Library to parse HTML with a jQuery-like API

# Command-line options:
#   first positional argument -> profile to fetch (defaults to "zuck")
#   --parse-only              -> skip scraping and re-parse a previously saved dump
user = if casper.cli.has(0) then casper.cli.get(0) else "zuck"
filePath = "parsed/#{user}.json"
skipScraping = casper.cli.has("parse-only")

# Credentials are only needed when we actually log in and scrape; a
# --parse-only run never authenticates, so it must not be blocked by this check.
hasCredentials = _env.hasOwnProperty("fb_user") and _env.hasOwnProperty("fb_pass")
unless skipScraping or hasCredentials
  casper.echo "Missing environment variables. Do `source .env` first."
  # Non-zero status so that calling shells/scripts can detect the failure.
  casper.exit(1)
# @todo: --parse-only is implemented in a very hacky, last-minute way
# and should be just a big if-else with better separated functions,
# at the moment it runs Casper for nothing.
# Also the 'parser' of posts should be a class/single function.
###*
 * Transform the captured HTML of a single post into a plain object that can
 * be serialised to JSON.
 *
 * Posts without text in `.userContent`, and posts containing a `.mtm`
 * element (link/photo attachment, per the original author's note), are
 * skipped.
 *
 * @param {object} item The current status item; only its `html` property is read
 * @return {object|null} The parsed post, or null when the item has no HTML
 *                       or the post is skipped
###
parseFacebookPost = (item) ->
  ###*
   * Read the first run of decimal digits in a string as an integer.
   * Returns 0 when the text holds no digits, so that callers never
   * propagate NaN (which JSON.stringify would turn into null).
   * @param {string} text
   * @return {int}
  ###
  firstInt = (text) ->
    digits = String(text).match(/[0-9]+/)
    if digits then parseInt(digits[0], 10) else 0

  ###*
   * Count number of shares on a post
   * @param {Cheerio} el The full status element
   * @return {int} Number of shares (0 when there is no share link or no number in it)
  ###
  countShares = (el) ->
    shareLink = el('.UFIShareLink')
    return 0 if shareLink.length is 0
    firstInt(shareLink.first().text())

  ###*
   * Count number of likes on a post
   * @param {Cheerio} el The full status element
   * @return {int} Number of likes (0 when `._1g5v` is absent or holds no number)
  ###
  countLikes = (el) ->
    likeSummary = el('._1g5v')
    return 0 if likeSummary.length is 0
    firstInt(likeSummary.text())

  ###*
   * Count number of comments on a post
   * NOTE: at the moment this doesn't count comment replies on purpose
   *
   * @param {Cheerio} el The full status element
   * @return {int} Rendered comment bodies plus the number announced by the
   *               comment pager ("show 12 more comments"), if any
  ###
  countComments = (el) ->
    rendered = el('span.UFICommentBody').length # comment blocks
    pager = el('.UFIPagerRow') # comment pager ("show 12 more comments")
    collapsed = if pager.length is 0 then 0 else firstInt(pager.first().text())
    rendered + collapsed

  # Nothing to parse without captured markup (also guards a null/undefined item)
  return null unless item?.html

  $ = cheerio.load(item.html)
  content = $('.userContent').first().text()

  # Keep text-only statuses: skip the post when it has no textual content or
  # when it contains a `.mtm` element (link/photo). Relax this condition if
  # you want to keep those posts as well.
  return null if content is "" or $('.mtm').length > 0

  # The first <abbr> carries the post date; its enclosing <a> is the permalink
  stamp = $('abbr').first()

  return {
    content: content
    permalink: stamp.parents('a').attr('href')
    time: stamp.text()
    timestamp: stamp.data('utime')
    likes: countLikes($)
    shares: countShares($)
    comments: countComments($)
    isFriendPost: $('.mhs').length > 0
  }
# First step: open Facebook and make sure we have an authenticated session.
# The state is inferred from the page title, so the literals below may need
# changing if your locale isn't English.
casper.start "https://www.facebook.com", ->
  # --parse-only works from a saved dump; no login needed
  return if skipScraping

  pageTitle = @getTitle()
  switch pageTitle
    when "Facebook - Log In or Sign Up"
      casper.echo "Attempting to log in..."
      credentials =
        email: _env.fb_user
        pass: _env.fb_pass
      # third argument `true` submits the form once it is filled
      @fill "#login_form", credentials, true
    when "Facebook"
      # Because we keep cookies, you might remain logged in from PhantomJS
      casper.echo "Already logged in"
    else
      # (A "Redirecting..." title was once considered as a logged-in signal
      # by the original author, but that branch was left disabled.)
      casper.echo "Oops, something unexpected happened. Page title: #{pageTitle}"
      # Non-zero status: this is a failure, let the calling shell see it
      casper.exit(1)
# With the session established, navigate to the target profile
casper.thenOpen "https://www.facebook.com/#{ user }"

# Scroll-loop state, shared with the callbacks below
currentPage = 1
hasClickedAllStories = false

casper.then ->
  return if skipScraping

  casper.echo "Now on https://www.facebook.com/#{ user }"
  casper.echo @getTitle()

  # Element clicked once to switch from "Visible Highlights" to all stories
  highlightsToggle = '#u_jsonp_6_4'
  # The "Born" link marks the very bottom of the timeline
  bornMarker =
    type: "xpath"
    path: "//a[@class and starts-with(.,'Born')]"

  # waitFor test function: scroll, then report success straight away
  scrollOnce = ->
    casper.scrollToBottom()
    true

  # Runs after every scroll: expand all stories (once), then decide whether
  # to schedule another scroll
  afterScroll = ->
    if not hasClickedAllStories and casper.visible(highlightsToggle)
      casper.click highlightsToggle
      casper.echo '[clicked Visible Highlights]'
      hasClickedAllStories = true

    # Stop when we see the "Born" block. Until then keep scrolling!
    # @todo: sometimes it never shows and the script would keep going on
    # inexistent pages; the current stop-gap is a hard limit of 150 pages,
    # it should rather check whether anything new was added.
    return if currentPage > 150 or casper.visible(bornMarker)
    casper.echo "Loaded page #{ currentPage++ }"
    keepScrolling()

  # Recursive: each round queues one scroll followed by afterScroll
  keepScrolling = ->
    casper.waitFor scrollOnce, afterScroll

  keepScrolling()
# Last step: the bottom of the profile has been reached (or --parse-only was
# given), so collect every post element, parse it and save the result.
casper.then ->
  casper.echo "Reached end of profile, parsing and saving to #{ filePath }"

  # Dump of the scraped elements, kept next to the parsed output so that
  # --parse-only can re-run the parser without scraping again
  rawPath = filePath + ".raw"

  if skipScraping
    # --parse-only: reload the dump written by an earlier scraping run
    unless fs.exists(rawPath)
      casper.echo "Cannot parse: #{ rawPath } not found. Run once without --parse-only first."
      casper.exit(1)
      return
    elements = JSON.parse(fs.read(rawPath))
  else
    # take all the <div> with class .userContentWrapper on our big page,
    # store them into elements, then save that dump
    elements = @getElementsInfo '.userContentWrapper'
    fs.write(rawPath, JSON.stringify(elements), "w")

  # Run parseFacebookPost() on every element, keeping only the posts it
  # did not reject (it returns null for skipped posts)
  parsedPosts = []
  for item in elements
    post = parseFacebookPost(item)
    parsedPosts.push post if post isnt null

  # And we write our array to a file.
  fs.write(filePath, JSON.stringify(parsedPosts), "w")
  casper.echo "Done!"

# Kick off the queued steps
casper.run()