-
Notifications
You must be signed in to change notification settings - Fork 37
/
skuSpider.js
188 lines (157 loc) · 4.41 KB
/
skuSpider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
'use strict';
const fs = require('fs');
const path = require('path');
const pg = require('promise-generator');
const json2csv = require('json2csv');
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
module.exports = skuSpider;
/**
 * Crawl every configured product URL with one shared headless browser and
 * write the collected SKU data to a CSV file.
 *
 * @param {Object} siteConfig Site configuration, see ./config/config.site.js
 * @param {Array<{name: string, key: string}>} siteConfig.fields CSV columns;
 *   `name` becomes the column label, `key` is looked up under `skuInfo`.
 * @param {Array<string>} siteConfig.urls Product page URLs to crawl.
 * @param {Array<Object>} siteConfig.site Per-site parser definitions, passed
 *   through to getSku unchanged.
 * @return {Promise<Array|undefined>} Resolves with whatever genCsvData
 *   returns, or undefined when crawling or CSV generation failed (the error
 *   is logged). Rejects only when the browser cannot be launched.
 */
function skuSpider(siteConfig) {
  const fields = siteConfig.fields.map((item) => ({
    label: item.name,
    value: `skuInfo.${item.key}`,
    default: 'NULL'
  }));

  return puppeteer.launch()
    .then((browser) => {
      // One deferred task per URL; pg decides when each one is started.
      const siteProcess = siteConfig.urls.map((url) => () => {
        console.log(`> 开始获取 ${url} 的页面数据...`);
        return getSku(browser, url, siteConfig.site);
      });

      // Closing is best-effort: a failure to shut down must not hide the
      // crawl result, but it should not go unreported either.
      const closeBrowser = () => Promise.resolve()
        .then(() => browser.close())
        .catch((err) => {
          console.error(err);
        });

      return pg(siteProcess)
        .then((data) => genCsvData(fields, data))
        .catch((err) => {
          console.error(err);
        })
        // Runs on both the success and the failure path, so the browser
        // process is always released before the returned promise settles.
        .then((result) => closeBrowser().then(() => result));
    });
}
/**
 * Load one product page and extract its SKU fields.
 *
 * The first entry of `sites` whose `is(url)` returns a truthy value is used
 * as the parser definition. Each function in its `skuContentArr` is called
 * with `(url, $)` and its result is stored under the function's own name.
 *
 * @param {Object} browser puppeteer browser instance
 * @param {String} url Product page URL
 * @param {Array<Object>} sites Per-site parser definitions
 * @return {Promise<{siteInfo: Object, skuLink: String, skuInfo: Object}>}
 *   Always resolves. `skuInfo` is empty when no site matches, when the page
 *   HTML could not be fetched, or when extraction throws.
 */
function getSku(browser, url, sites) {
  const siteInfo = sites.find((item) => item.is(url));

  if (!siteInfo) {
    return Promise.resolve({
      siteInfo: {},
      skuLink: url,
      skuInfo: {}
    });
  }

  return puppeteerHtml(browser, url, siteInfo)
    .then((html) => {
      const skuInfo = {};
      if (html) {
        const $ = cheerio.load(html);
        siteInfo.skuContentArr.forEach((extract) => {
          skuInfo[extract.name] = extract(url, $);
        });
        console.log(`>> 成功获取:${url}`);
      } else {
        console.error(`>> 获取 ${url} 商品数据失败!`);
      }
      return {
        siteInfo,
        skuLink: url,
        skuInfo
      };
    })
    .catch((err) => {
      // JSON.stringify(new Error('x')) is "{}", which hides the failure
      // reason; report the stack/message for real Error objects instead.
      const reason = err instanceof Error
        ? (err.stack || err.message)
        : JSON.stringify(err);
      console.error(`>> 解析 ${url} 页面中的商品数据失败: ${reason}`);
      return {
        siteInfo,
        skuLink: url,
        skuInfo: {}
      };
    });
}
/**
 * Open `url` in a new browser tab and return the page's serialized HTML.
 *
 * @param {Object} browser puppeteer browser instance
 * @param {String} url Page URL
 * @param {Object} siteInfo Site definition; if it has an
 *   `onDetailPageLoaded` function, that hook is called with `(url, page)`
 *   before navigation starts.
 * @return {Promise<String|undefined>} The page HTML, or undefined when any
 *   step failed (the error is logged and the tab, if one was opened, is
 *   closed).
 */
function puppeteerHtml(browser, url, siteInfo) {
  let page;

  // `page` is still undefined when browser.newPage() itself rejected, so the
  // failure handler must not call page.close() unconditionally.
  const closePage = () => (page ? page.close() : Promise.resolve());

  return browser.newPage()
    .then((data) => {
      page = data;
      // Optional per-site hook that can register its own page listeners.
      if (siteInfo.onDetailPageLoaded) siteInfo.onDetailPageLoaded(url, page);
      // Navigation timeout in milliseconds.
      page.setDefaultNavigationTimeout(10000);
      // Allow requests to be intercepted.
      return page.setRequestInterception(true);
    })
    .then(() => {
      page.on('request', (interceptedRequest) => {
        const requestUrl = interceptedRequest.url();
        // Skip image downloads; only the markup is needed.
        if (requestUrl.endsWith('.png') || requestUrl.endsWith('.jpg')) {
          interceptedRequest.abort();
        } else {
          interceptedRequest.continue();
        }
      });
      return page.goto(url);
    })
    // NOTE(review): this overwrites ./test.png for every page and looks like
    // a debugging leftover. It is kept because a full-page capture may also
    // trigger lazy-loaded content before the HTML is read - confirm before
    // removing.
    .then(() => page.screenshot({ path: './test.png', fullPage: true }))
    .then(() => page.content())
    .then((html) => page.close().then(() => html))
    .catch((err) => {
      console.log(err);
      return closePage();
    });
}
/**
 * Convert the structured SKU records to CSV and save them to a file in the
 * current working directory.
 *
 * The file is named `<YYYYMMDDHHmmss>-<record count>.csv`, using local time.
 *
 * @param {Array<Object>} fields json2csv field definitions
 * @param {Array<Object>} skuDataLists Structured SKU records
 * @return {Array<Object>|undefined} `skuDataLists` on success; undefined
 *   when conversion or saving throws (the error is logged).
 */
function genCsvData(fields, skuDataLists) {
  try {
    const content = json2csv({ data: skuDataLists, fields: fields });

    // Zero-pad each component so the timestamp is fixed-width, unambiguous
    // and sorts chronologically.
    const pad2 = (n) => `0${n}`.slice(-2);
    const now = new Date();
    const nowStr = [
      now.getFullYear(),
      pad2(now.getMonth() + 1), // getMonth() is zero-based
      pad2(now.getDate()),
      pad2(now.getHours()),
      pad2(now.getMinutes()),
      pad2(now.getSeconds())
    ].join('');

    const fileName = path.resolve(`./${nowStr}-${skuDataLists.length}.csv`);
    saveCsv(fileName, content);
    console.log(`> 成功生成 ${fileName} !`);
    return skuDataLists;
  } catch (err) {
    console.error(err);
    return;
  }
}
/**
 * Write CSV text to disk, prefixed with a UTF-8 byte order mark.
 *
 * The BOM keeps Excel from showing garbled characters when it opens the
 * file: https://github.com/f2e-journey/xueqianban/issues/34
 *
 * @param {String} fileName Destination file path
 * @param {String} content CSV text
 * @return {undefined}
 */
function saveCsv(fileName, content) {
  // Buffer.from replaces the deprecated `new Buffer(...)` constructor and
  // produces the same bytes.
  const utf8Bom = Buffer.from([0xEF, 0xBB, 0xBF]);
  const msExcelBuffer = Buffer.concat([utf8Bom, Buffer.from(content)]);
  fs.writeFileSync(fileName, msExcelBuffer);
}