Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

done #4

Closed
wants to merge 76 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
6e2eb01
add demo
kelvv Nov 1, 2018
c8f532f
fix
kelvv Nov 1, 2018
5dc3c30
fix date
kelvv Nov 1, 2018
0e9241a
fix
kelvv Nov 1, 2018
409feaf
fix
kelvv Nov 1, 2018
6423d81
add 错误自修复功能
kelvv Nov 1, 2018
afe61fd
fix
kelvv Nov 1, 2018
e739c43
fix
kelvv Nov 1, 2018
efde2fc
fix
kelvv Nov 1, 2018
bdde8a3
fix
kelvv Nov 1, 2018
f3b1c92
add timeout
kelvv Nov 1, 2018
6ae8c3d
fix
kelvv Nov 1, 2018
b6a8ef0
fix
kelvv Nov 1, 2018
96206db
fix
kelvv Nov 2, 2018
1611571
fix
kelvv Nov 2, 2018
a2c91fc
fix
kelvv Nov 2, 2018
bc57a8d
fix
kelvv Nov 2, 2018
7cfb190
fix
kelvv Nov 2, 2018
77a6f90
fix
kelvv Nov 2, 2018
0f761a6
fix
kelvv Nov 2, 2018
acbbe3a
fix
kelvv Nov 2, 2018
de05753
fix
kelvv Nov 2, 2018
90055cb
fix
kelvv Nov 2, 2018
db57721
fix
kelvv Nov 2, 2018
53b92c9
add cre version
kelvv Nov 2, 2018
b860ddd
fix
kelvv Nov 2, 2018
86ac787
fix
kelvv Nov 2, 2018
473dade
fix
kelvv Nov 2, 2018
c85bf33
fix
kelvv Nov 2, 2018
3368fef
fix
kelvv Nov 2, 2018
b2bbb6b
add mysql
kelvv Nov 4, 2018
56be5b9
fix
kelvv Nov 4, 2018
7591c61
fix
kelvv Nov 4, 2018
8ff0ea5
fix
kelvv Nov 4, 2018
59592ca
fix
kelvv Nov 4, 2018
30e3fdd
fix
kelvv Nov 4, 2018
e00fd26
fix
kelvv Nov 4, 2018
5499cf9
fix
kelvv Nov 4, 2018
de5b1c8
fix
kelvv Nov 4, 2018
fdb88fe
fix
kelvv Nov 4, 2018
eead307
fix
kelvv Nov 4, 2018
f2609c3
fix
kelvv Nov 4, 2018
566fefb
fix
kelvv Nov 4, 2018
bd50f2c
fix
kelvv Nov 4, 2018
acad102
fix
kelvv Nov 4, 2018
787b164
fix
kelvv Nov 4, 2018
73b8e1f
fix
kelvv Nov 4, 2018
d8a71f4
fix
kelvv Nov 4, 2018
03244cd
fix
kelvv Nov 4, 2018
c7f493a
fix
kelvv Nov 4, 2018
c8e9aec
fix
kelvv Nov 4, 2018
486b6b1
fix
kelvv Nov 4, 2018
a699bf4
fix
kelvv Nov 4, 2018
218bb9d
fix
kelvv Nov 4, 2018
76956cb
fix
kelvv Nov 4, 2018
9c98ba3
fix
kelvv Nov 4, 2018
54f20d4
fix
kelvv Nov 4, 2018
18f28c8
fix
kelvv Nov 4, 2018
5ba310c
fix
kelvv Nov 4, 2018
e25926e
fix
kelvv Nov 4, 2018
f19e968
fix
kelvv Nov 4, 2018
4f84a8a
fix
kelvv Nov 4, 2018
d0b2e34
fix
kelvv Nov 4, 2018
97a4b2a
fix
kelvv Nov 4, 2018
d7bae74
fix
kelvv Nov 4, 2018
9c320ce
fix
kelvv Nov 4, 2018
2b68627
fix
kelvv Nov 5, 2018
1a838c9
fix
kelvv Nov 5, 2018
2e31402
dic
kelvv Nov 5, 2018
ecfb4f9
fix else
kelvv Nov 5, 2018
2216824
fix
kelvv Nov 5, 2018
b707cf1
fix
kelvv Nov 5, 2018
da5952d
fix
kelvv Nov 5, 2018
bcbc19c
fix
kelvv Nov 5, 2018
ddb0a72
add timeout proxy
kelvv Nov 5, 2018
5af5566
fix
kelvv Nov 5, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,6 @@ typings/

# next.js build output
.next

excels/
db.json
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"type": "node",
"request": "launch",
"name": "Launch Program",
"program": "${workspaceFolder}/index.js"
"program": "${workspaceFolder}/cre.js"
}
]
}
11 changes: 10 additions & 1 deletion config/default.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,14 @@ module.exports = {
proxy:{
orderId: "GL20181101134635BxdyEdia",
secret: "d154fd7868cc58396f52a4d9b758bc51"
}
},
db: {
options: {
host: 'rm-wz92zws0f8x0q9loa0o.mysql.rds.aliyuncs.com',
port: 3306,
user: 'root',
pass: '123456isMs'
},
database: 'yelp'
},
}
123 changes: 123 additions & 0 deletions cre.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
var Crawler = require("crawler");
let xlsxHandler = require('./utils/xlsxHandler')
const Regex = require('regexper.js');
const db = require('./utils/db')
const mysql = require('./utils/mysql')
const moment = require('moment')
const _ = require('lodash')
let webHandler = require('./utils/webHandler')

// Module-level crawl state shared by preRequest(), the crawler callback and begin().

// Proxy URL captured by begin(); it is assigned there but not read anywhere in this file.
let proxy = '';

// Running counter bumped in preRequest() and in the crawler callback;
// preRequest() resets it to 0 whenever it refreshes the proxy.
let requestCount = 0;

// When true, the next preRequest() refreshes the proxy and clears the flag.
// Nothing in this file sets it to true.
let isRefresh = false;

// City names; not referenced by the rest of this file.
let citys = [
  "Chicago",
  "Las Vegas",
  "Los Angeles",
  "New York",
  "Orlando",
];

/**
 * Pre-request hook: attaches the current proxy to an outgoing crawler request,
 * refreshing the proxy first when it is due.
 *
 * A refresh is triggered when any of these holds:
 *   - `requestCount` has reached 100 since the last refresh,
 *   - `isRefresh` is set,
 *   - the `proxy.time` timestamp stored in `db` is at least 5 minutes old.
 *
 * The proxy URL is read from `db` AFTER the refresh so that the request which
 * triggered the refresh does not go out through the stale proxy.
 * NOTE(review): this assumes webHandler.RefreshProxy() stores the new proxy
 * under `proxy.url` / `proxy.time` in `db` (begin() also reads `proxy.url`
 * right after calling it) — confirm in utils/webHandler.
 *
 * Errors are logged, never thrown; `done` is always called exactly once.
 *
 * @param {object} options - Request options; `options.proxy` is set here.
 * @param {Function} done - Continuation invoked once the options are ready.
 * @returns {Promise<void>}
 */
let preRequest = async function (options, done) {
  try {
    requestCount++;
    console.log(requestCount);

    // Evaluated last in the condition below, so the stored timestamp is only
    // consulted when neither of the cheaper checks already demands a refresh.
    const isProxyExpired = () =>
      moment(db.get('proxy.time').value()).diff(moment(Date.now()), 'minute') <= -5;

    if (requestCount >= 100 || isRefresh || isProxyExpired()) {
      // Reset before awaiting so that requests arriving while the refresh is
      // in flight do not immediately trigger another one via the counter/flag.
      requestCount = 0;
      isRefresh = false;
      try {
        await webHandler.RefreshProxy();
      } catch (err) {
        // A failed refresh must not leave the request without any proxy;
        // fall through and use whatever is currently stored.
        console.log(err);
      }
    }

    options.proxy = db.get('proxy.url').value();
  } catch (err) {
    console.log(err);
  }

  done();
};

// Reviews dated on or before this day are skipped.
const REVIEW_CUTOFF_DATE = new Date('10/1/2017');
// The `start` query parameter advances in steps of this size from page to page.
const REVIEW_PAGE_STEP = 20;
// Pagination stops once the current page index reaches this value.
const MAX_REVIEW_PAGE = 130;

/**
 * Computes the URI of the page following `uri` in the review feed.
 *
 * @param {string} uri - Current request URI; must contain `start=<digits>`.
 * @returns {string|null} URI with `start` advanced by one page, or null when
 *   the current page index is not below MAX_REVIEW_PAGE.
 * @throws {TypeError} When `uri` contains no `start=` parameter.
 */
function nextReviewPageUri(uri) {
  const page = parseInt(/start=(\d*)/.exec(uri)[1], 10) / REVIEW_PAGE_STEP;
  if (page < MAX_REVIEW_PAGE) {
    return uri.replace(/start=\d*/, 'start=' + ((page + 1) * REVIEW_PAGE_STEP));
  }
  return null;
}

/**
 * Extracts the reviews newer than REVIEW_CUTOFF_DATE from a review-list HTML
 * fragment.
 *
 * Capture groups of the pattern: 1 = text after the `dropdown_user-name`
 * element, 2 = number preceding "star rating", 3 = digits/slashes following
 * `rating-qualifier`, 4 = paragraph content up to and including `</p>`.
 *
 * @param {string} reviewListHtml - Escaped HTML taken from the feed response.
 * @param {string} uri - Request URI; the part between `www.yelp.com` and
 *   `/review_feed` is stored with every row.
 * @returns {Array<Array<string>>} One
 *   [Cus_Name, Cus_Review_Rate, Cus_Review_Date, Review, url] row per review.
 */
function collectRecentReviews(reviewListHtml, uri) {
  // Built per call: the pattern is used with the global flag.
  const regex = new Regex(/dropdown_user-name[^>]+?>([^<]+)[\s\S]+?([\d\.]+)\s*star rating[\s\S]+?rating-qualifier\S+\s*([\d\/]+)[\s\S]+?<p[^>]+>([\s\S]+?<\/p>)/, 'ig');
  const rows = [];
  for (const match of regex.matches(unescape(reviewListHtml))) {
    const reviewDate = new Date(match.groups[3]);
    if (reviewDate > REVIEW_CUTOFF_DATE) {
      rows.push([
        match.groups[1],
        match.groups[2],
        moment(reviewDate).format('YYYY-MM-DD'),
        match.groups[4],
        /www.yelp.com([\s\S]+?)\/review_feed/.exec(uri)[1],
      ]);
    }
  }
  return rows;
}

/**
 * Inserts review rows into the `comment` table with one multi-row INSERT.
 *
 * Values are passed as positional replacements instead of being spliced into
 * the SQL text: review bodies are scraped free text and routinely contain
 * quotes, which would otherwise break (or inject into) the statement.
 *
 * @param {Array<Array<string>>} rows - Non-empty list of 5-element rows.
 * @returns {Promise} The promise returned by `global.sequelize.query`.
 */
function insertReviews(rows) {
  const placeholders = rows.map(() => '(?,?,?,?,?)').join(',');
  const values = [].concat(...rows);
  return global.sequelize.query(
    `INSERT INTO comment(Cus_Name,Cus_Review_Rate,Cus_Review_Date,Review,url) VALUES ${placeholders}`,
    { replacements: values }
  );
}

/**
 * Crawler for the paginated `review_feed` endpoint.
 *
 * For every response it queues the following page (while the page index is
 * below MAX_REVIEW_PAGE), parses the JSON body, extracts the reviews from its
 * `review_list` field and stores the recent ones in the `comment` table.
 * Every request first goes through the module-level preRequest() hook.
 */
const commentCraw = new Crawler({
  maxConnections: 100,
  preRequest: function (options, done) {
    preRequest(options, done);
  },
  callback: async function (error, res, done) {
    requestCount++;
    if (error) {
      console.log(error);
      done();
      return;
    }

    try {
      // Queue the next page before parsing, so pagination continues even if
      // this particular page turns out to be unparseable.
      const nextUri = nextReviewPageUri(res.options.uri);
      if (nextUri !== null) {
        console.log('comment自进入:' + nextUri);
        commentCraw.queue({
          uri: nextUri,
        });
      }

      const businessInfoResult = JSON.parse(res.body);
      console.log('得到商铺详情,开始匹配评论');

      const rows = collectRecentReviews(businessInfoResult.review_list, res.options.uri);
      if (rows.length > 0) {
        // Deliberately not awaited (as before) so done() is not held back by
        // the database; the catch keeps a failed insert from surfacing as an
        // unhandled promise rejection.
        insertReviews(rows).catch(function (err) {
          console.log('报错:' + res.options.uri);
          console.log(err);
        });
      }
    } catch (err) {
      if (err.toString().indexOf('TimeoutError') >= 0) {
        // Retry the same page on timeouts.
        commentCraw.queue({
          uri: res.options.uri,
        });
      } else {
        console.log('报错:' + res.options.uri);
        console.log(err);
      }
    }
    done();
  },
});

/**
 * Starts the crawl: refreshes the proxy, resets the request counter, then
 * queues the first review-feed page (`start=0`, sorted by date descending)
 * for every business URL returned by `mysql.Business.findAll`.
 *
 * @returns {Promise<void>} Resolves once every start URL has been queued;
 *   rejects if the proxy refresh or the database lookup fails.
 */
async function begin() {
  await webHandler.RefreshProxy();
  requestCount = 0;
  // Kept from the original; `proxy` is not read anywhere else in this file.
  proxy = db.get('proxy.url').value();

  const businesses = await mysql.Business.findAll({
    attributes: ['url'],
  });
  for (const business of businesses) {
    commentCraw.queue({
      uri: `https://www.yelp.com${business.url}/review_feed?start=0&sort_by=date_desc`,
    });
  }
}

// Original note here read "whether to use a proxy server"; no such switch
// exists in this file.
// Entry point. The rejection handler keeps a failed start-up (proxy refresh or
// database lookup) from becoming an unhandled promise rejection.
begin().catch(function (err) {
  console.log(err);
  process.exitCode = 1;
});


4 changes: 2 additions & 2 deletions db.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"proxy": {
"url": "http://111.176.31.204:37694",
"time": 1541057232731
"url": "http://58.50.2.149:27901",
"time": 1541411052813
}
}
Binary file removed excels/Chicago.xlsx
Binary file not shown.