Skip to content

Commit

Permalink
Add CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
vifreefly committed Aug 22, 2018
1 parent de934ec commit 57ef608
Show file tree
Hide file tree
Showing 30 changed files with 917 additions and 21 deletions.
5 changes: 4 additions & 1 deletion exe/kimurai
@@ -1,3 +1,6 @@
#!/usr/bin/env ruby

require "kimurai"
require 'kimurai'
require 'kimurai/cli'

Kimurai::CLI.start(ARGV)
16 changes: 14 additions & 2 deletions lib/kimurai.rb
@@ -1,6 +1,20 @@
require 'ostruct'
require 'logger'
require 'active_support'
require 'active_support/core_ext'
require 'rbcat'

require_relative 'kimurai/version'

require_relative 'kimurai/core_ext/numeric'
require_relative 'kimurai/core_ext/string'
require_relative 'kimurai/core_ext/array'

require_relative 'kimurai/browser_builder'
require_relative 'kimurai/base_helper'
require_relative 'kimurai/pipeline'
require_relative 'kimurai/base'

module Kimurai
class << self
def configuration
Expand Down Expand Up @@ -36,5 +50,3 @@ def find_by_name(name)
end
end
end

# require_relative 'kimurai/default_configuration'
14 changes: 0 additions & 14 deletions lib/kimurai/all.rb

This file was deleted.

54 changes: 54 additions & 0 deletions lib/kimurai/automation/deploy.yml
@@ -0,0 +1,54 @@
---
# Deploy play: fetches the project repository into the remote user's home
# directory, installs its gems with Bundler and updates the crontab with
# whenever.
- hosts: all
  vars:
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    # These default to null; the tasks below branch on whether
    # repo_key_path is none. Presumably the caller supplies real values
    # (e.g. via extra vars) -- confirm against the invoking code.
    repo_url: null
    repo_name: null
    repo_key_path: null

  tasks:
    - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
      when: repo_key_path is none
      git:
        repo: "{{ repo_url }}"
        dest: "~/{{ repo_name }}"
        force: true
        accept_hostkey: true

    # Custom-key variant. The key copy lives in /tmp only for the duration
    # of the clone: the `always` section runs whether or not the clone
    # succeeded, so a failed clone no longer leaves the private key behind.
    - when: repo_key_path is not none
      block:
        - name: Copy custom git ssh key to /tmp/private_key (if provided)
          copy:
            src: "{{ repo_key_path }}"
            dest: /tmp/private_key
            # Quoted so the mode is never subject to YAML number parsing.
            mode: "0600"

        - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
          git:
            repo: "{{ repo_url }}"
            dest: "~/{{ repo_name }}"
            force: true
            accept_hostkey: true
            key_file: /tmp/private_key
      always:
        - name: Delete custom git ssh key from /tmp/private_key (if provided)
          file:
            state: absent
            path: /tmp/private_key

    # Both commands run inside the checked-out project with the rbenv bin
    # and shims directories prepended to PATH.
    - name: Run bundle install
      command: bundle install
      args:
        chdir: "~/{{ repo_name }}"
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

    - name: Run whenever to update crontab
      command: whenever --update-crontab
      args:
        chdir: "~/{{ repo_name }}"
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

44 changes: 44 additions & 0 deletions lib/kimurai/automation/setup.yml
@@ -0,0 +1,44 @@
---
# Setup play: installs base apt packages, then imports the task files that
# set up the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver.
- hosts: all
  vars:
    # Version pins are quoted so YAML always reads them as strings.
    # An unquoted two-part version is parsed as a float (2.40 would be
    # rendered as 2.4 in download URLs).
    ruby: "2.5.1"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.21.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.39"

  tasks:
    - name: Update apt cache
      become: true
      apt:
        update_cache: true
        # Seconds; the cache is refreshed only when older than one day.
        cache_valid_time: 86400

    # The package list is passed straight to the module instead of looping
    # over it one item at a time.
    - name: Install base packages
      become: true
      apt:
        pkg:
          - xvfb
          - libsqlite3-dev
          - sqlite3
          - mongodb-clients
          - mysql-client
          - libmysqlclient-dev
          - postgresql-client
          - libpq-dev
        state: present

    # Imported without `become` at this level; that file sets `become`
    # itself on the task that needs it.
    - import_tasks: setup/ruby_environment.yml

    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
26 changes: 26 additions & 0 deletions lib/kimurai/automation/setup/chromium_chromedriver.yml
@@ -0,0 +1,26 @@
---
# Installs the Chromium browser and the chromedriver release named by the
# `chromedriver` variable.
- name: Install chromium browser
  apt:
    pkg: chromium-browser
    state: present

# Probe only: never reported as a change, and a failing command (for
# example when chromedriver is not installed yet) does not stop the play.
- name: Get current chromedriver version
  shell: chromedriver --version
  args:
    executable: /bin/bash
  register: current_chromedriver_version
  changed_when: false
  ignore_errors: true

- name: Install unzip tool to unarchive chromedriver archive
  apt:
    pkg: unzip
    state: present

# Skip the download when the wanted version already appears in the probe
# output. The comparison is a substring test against `stdout`: testing
# membership in the `stdout_lines` list only matches a line that equals the
# version exactly, so the archive was fetched on every run. `| string`
# keeps the test valid when the version variable is defined as an unquoted
# number.
- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: "https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip"
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  when: (chromedriver | string) not in current_chromedriver_version.stdout
20 changes: 20 additions & 0 deletions lib/kimurai/automation/setup/firefox_geckodriver.yml
@@ -0,0 +1,20 @@
---
# Installs Firefox from apt and unpacks the geckodriver release named by
# the `geckodriver` variable into /usr/local/bin.
- name: Install firefox
  apt:
    state: present
    pkg: firefox

# Probe only: the result is registered for the download condition below,
# is never reported as a change, and a failure does not stop the play.
- name: Get current geckodriver version
  register: current_geckodriver_version
  shell: geckodriver --version
  args:
    executable: /bin/bash
  ignore_errors: true
  changed_when: false

# Runs only when the wanted version string is absent from the probe output.
- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
  when: geckodriver not in current_geckodriver_version.stdout
  unarchive:
    remote_src: true
    dest: /usr/local/bin
    src: "https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz"
33 changes: 33 additions & 0 deletions lib/kimurai/automation/setup/phantomjs.yml
@@ -0,0 +1,33 @@
---
# Installs PhantomJS: apt dependencies, the release archive unpacked under
# /usr/local/lib, and a symlink to its binary in /usr/local/bin.

# The package list is passed straight to the module instead of looping
# over it one item at a time.
- name: Install dependencies for PhantomJS
  apt:
    pkg:
      - chrpath
      - libxft-dev
      - libfreetype6
      - libfreetype6-dev
      - libfontconfig1
      - libfontconfig1-dev
    state: present

# Probe only: never reported as a change, and a failing command does not
# stop the play.
- name: Get current phantomjs version
  shell: phantomjs -v
  args:
    executable: /bin/bash
  register: current_phantomjs_version
  changed_when: false
  ignore_errors: true

# Runs only when the wanted version string is absent from the probe output.
- name: Download phantomJS archive and unarchive it to /usr/local/lib
  unarchive:
    src: "https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2"
    dest: /usr/local/lib
    remote_src: true
  when: phantomjs not in current_phantomjs_version.stdout

- name: Link PhantomJS binary to /usr/local/bin/phantomjs
  file:
    src: "/usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs"
    dest: /usr/local/bin/phantomjs
    state: link
124 changes: 124 additions & 0 deletions lib/kimurai/automation/setup/ruby_environment.yml
@@ -0,0 +1,124 @@
---
# Installs the apt packages needed before ruby-build compiles Ruby.
# The package list is passed straight to the module instead of looping
# over it one item at a time.
- name: Install dependencies for ruby-build
  become: true
  apt:
    pkg:
      - zlib1g-dev
      - build-essential
      - libssl-dev
      - libreadline-dev
      - libreadline6-dev
      - libyaml-dev
      - libxml2-dev
      - libxslt1-dev
      - libcurl4-openssl-dev
      - libffi-dev
    state: present

# rbenv is cloned into rbenv_root_path and ruby-build into its plugins
# directory.
- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
  git:
    dest: "{{ rbenv_root_path }}"
    repo: 'https://github.com/sstephenson/rbenv.git'

- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
  git:
    dest: "{{ rbenv_root_path }}/plugins/ruby-build"
    repo: 'https://github.com/sstephenson/ruby-build.git'

# Each lineinfile task keeps exactly one matching line in ~/.bashrc: a line
# matched by `regexp` is replaced by `line`, otherwise `line` is added.
- name: Add Rbenv path to the .bashrc
  lineinfile:
    state: present
    dest: ~/.bashrc
    line: 'export PATH="$HOME/.rbenv/bin:$PATH"'
    regexp: '^export PATH="\$HOME\/\.rbenv'

- name: Add Rbenv init to the .bashrc
  lineinfile:
    state: present
    dest: ~/.bashrc
    line: 'eval "$(rbenv init -)"'
    regexp: '^eval "\$\(rbenv'

# Ruby is built only when its version directory does not exist yet.
- name: Check if desired Ruby version already installed
  register: ruby_present
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}"

- name: Install desired Ruby version using ruby-build (this can take a while)
  when: not ruby_present.stat.exists
  command: "rbenv install {{ ruby }}"
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
    CONFIGURE_OPTS: '--disable-install-doc'

# Probe only: never reported as a change, and a failing command does not
# stop the play.
- name: Get current Ruby version
  register: current_ruby_version
  command: ruby -v
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
  ignore_errors: true
  changed_when: false

# Runs only when the wanted version string is absent from `ruby -v` output.
- name: Set desired Ruby version as a global version
  register: set_ruby
  when: ruby not in current_ruby_version.stdout
  command: rbenv global {{ ruby }}
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# Rehash only after the global version task above reported a change.
- name: Execute `rbenv rehash` command
  when: set_ruby.changed
  command: rbenv rehash
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# Writes the per-user RubyGems and Bundler configuration files.

# `--no-document` replaces the `--no-ri --no-rdoc` pair, which RubyGems 3
# no longer accepts; `--no-document` is understood by RubyGems 2.x and 3.x.
- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    content: "gem: --no-document"

- name: Create ~/.bundle directory
  file:
    dest: ~/.bundle
    state: directory

# The literal block below is written to the file verbatim, one setting
# per line.
- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
  copy:
    dest: ~/.bundle/config
    content: |
      BUNDLE_GIT__ALLOW_INSECURE: "true"
      BUNDLE_JOBS: "4"
# For each gem: stat its executable under the selected Ruby version's bin
# directory and run `gem install` only when that file is missing.
- name: Check if Bundler gem installed
  register: bundler_gem_present
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"

- name: Install Bundler gem
  when: not bundler_gem_present.stat.exists
  command: gem install bundler
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Whenever gem installed
  register: whenever_gem_present
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"

- name: Install Whenever gem
  when: not whenever_gem_present.stat.exists
  command: gem install whenever
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Kimurai gem installed
  register: kimurai_gem_present
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"

- name: Install Kimurai gem
  when: not kimurai_gem_present.stat.exists
  command: gem install kimurai
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
7 changes: 3 additions & 4 deletions lib/kimurai/base.rb
@@ -1,4 +1,3 @@
require 'rbcat'
require_relative 'base/simple_saver'
require_relative 'base/uniq_checker'

Expand Down Expand Up @@ -151,9 +150,9 @@ def self.crawl!
@run_info, @checker, @saver = nil
end

def self.parse!(handler, url: nil, data: {})
spider = self.new
url ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
def self.parse!(handler, engine = nil, url: nil, data: {})
spider = engine ? self.new(engine) : self.new
url.present? ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
ensure
spider.browser.destroy_driver!
end
Expand Down

0 comments on commit 57ef608

Please sign in to comment.