feat: mixed_content_url

viasite · Mar 27, 2020 · e058452
1 parent 4ef6a8e
commit e058452
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@
 - Обходит весь сайт, собирает ссылки на страницы и документы
 - Выгружает поля: url и глубина, на которой этот url найден
 - Документы с расширениями `doc`, `docx`, `xls`, `xlsx`, `pdf`, `rar`, `zip` добавляются в список с глубиной 0
+- Ищет страницы с SSL mixed content
 - Каждый сайт сохраняется в файл с именем домена
 - Скрипт не ходит по ссылкам вне сканируемого домена
 - Глубина по умолчанию 10
@@ -16,6 +17,7 @@
 ## Установка:
 ```
 git clone https://github.com/viasite/sites-scraper
+
 cd sites-scraper
 npm install
 npm start

diff --git a/src/scrap-site.js b/src/scrap-site.js
@@ -22,6 +22,7 @@ const fields_presets = {
     'result.canonical',
     'result.is_canonical',
     'previousUrl',
+    'result.mixed_content_url',
     'depth',
     'response.status',
     'result.request_time',
@@ -52,6 +53,7 @@ const fields_presets = {
 
 module.exports = async (baseUrl, options = {}) => {
   const domain = url.parse(baseUrl).hostname;
+  const protocol = url.parse(baseUrl).protocol;
   const FILE = `./data/${domain}.csv`; // файл вывода
   let currentUrl = ''; // для хака с документами
 
@@ -140,11 +142,21 @@ module.exports = async (baseUrl, options = {}) => {
     customCrawl: async (page, crawl) => {
       // You can access the page object before requests
       await page.setRequestInterception(true);
+      await page.setBypassCSP(true);
+
+      let mixedContentUrl = '';
+
       // это событие срабатывает, когда chrome подгружает статику на странице (и саму страницу)
       page.on('request', request => {
         //console.log('request.url(): ', request.url());
 
-        // don't request static
+        // check for mixed content, thanks to https://github.com/busterc/mixed-content-crawler/
+        if (protocol == 'https:' && ['image', 'stylesheet', 'script'].includes(request.resourceType()) && request.url().match(/^http:/)) {
+          request.notHTTPS = true;
+          mixedContentUrl = request.url();
+          return request.abort();
+        }
+
         if (SKIP_IMAGES && request.resourceType() == 'image') {
           request.abort();
         } else if (SKIP_CSS && request.resourceType() == 'stylesheet') {
@@ -156,6 +168,12 @@ module.exports = async (baseUrl, options = {}) => {
         }
       });
 
+      page.on('requestfailed',  request => {
+        if (request.notHTTPS) {
+          console.error('mixed content: ', request.url());
+        }
+      });
+
       // костыль, который возвращает фейково обойдённый документ, если он признан документом
       // нужно, чтобы доки не сканировались (выдают ошибку), но при этом добавлялись в csv
       // т.к. в этом контексте нет текущего урла, он задаётся в глобал через событие requeststarted
@@ -177,13 +195,15 @@ module.exports = async (baseUrl, options = {}) => {
       }
       // The result contains options, links, cookies and etc.
       const result = await crawl();
+
+      result.result.mixed_content_url = mixedContentUrl;
       // You can access the page object after requests
       result.content = await page.content();
       // You need to extend and return the crawled result
       return result;
     }
   };
-  crawlerOptions = { ...defaultOptions, ...options };
+  const crawlerOptions = { ...defaultOptions, ...options };
 
   const start = Date.now();