forked from ashwanthkumar/scraphp
/
configuration.php
136 lines (134 loc) · 4.77 KB
/
configuration.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
<?php
if (!defined('SCRAPPER_INCLUDE')) {
    // Direct access guard: stop immediately unless the guard constant has been defined beforehand.
    die("I'm sorry but current you can't use this file outside the Scraph application");
}

/**
 * Configuration for the price scrapper application.
 *
 * The whole file evaluates to one array that is returned to the including script.
 *
 * @author Ashwanth Kumar <ashwanth@ashwanthkumar.in>
 */
return array(
    /**
     * Datastore configuration for the Price Scrapper. PDO is the abstraction
     * layer used to access the datastore.
     *
     * @optional Leaving this blank will use the sqlite db named "scrapper.db"
     *           in the data folder at the $SCRAPPER_ROOT path.
     */
    '_datastore' => array(
        'dsn'  => 'mysql:host=localhost;dbname=price',
        'user' => 'root',
        'pass' => '',
    ),

    // NOTE(review): presumably the fallback applied when '_datastore' is left
    // blank (it points at the sqlite file mentioned above) -- confirm against the loader.
    '_default_datastore' => array(
        'dsn'  => 'sqlite:./data/scrapper.db',
        'user' => 'ashwanth',
        'pass' => 'password',
    ),

    /**
     * Turns on verbose mode when displaying data.
     * WARNING: verbose mode prints a lot of noise.
     *
     * @optional Enabled by default
     */
    '_verbose' => true,

    /**
     * User agent the spider presents while crawling.
     *
     * @optional Default value is "Price Scrapper Spider"
     */
    'user_agent' => 'Ashwanth Kumar <ashwanth@ashwanthkumar.in> Hacking Challenge',

    // NOTE(review): unit is not stated in this file; presumably seconds -- confirm.
    'socket_timeout' => 10,

    /**
     * Map of site keys; each key defines one site the spider fetches data from.
     *
     * @required
     */
    'sites' => array(
        /**
         * A custom site key is defined here.
         *
         * @required
         */
        'agmarket' => array(
            'base_url' => 'http://agmarknet.nic.in/rep1Newx1_today.asp',

            /**
             * Adds a Perl-compatible regular expression (PCRE) to the list of rules
             * that decide which URLs found on a page should be followed explicitly.
             *
             * @optional
             */
            // 'validMatch' => '', // Uncomment for usage

            /**
             * Adds a Perl-compatible regular expression (PCRE) to the list of rules
             * that decide which URLs found on a page should be ignored by the crawler.
             *
             * @optional Follows 'validMatch' if present or crawls all of them
             */
            // 'nonValidMatch' => '', // Uncomment for usage

            /**
             * Limit on the number of pages/files the crawler should crawl. Once the
             * limit is reached, the crawler stops the crawling process.
             *
             * @optional The default value is 0 (no limit).
             */
            'pageLimit' => 1,

            /**
             * Items to extract from the site.
             *
             * @TODO Currently only one set of items (maybe "object" is the better
             *       word?) is extracted from the page.
             *
             * @optional Leaving this blank does not extract any content. Instead
             *           the visited URLs are logged into the database.
             */
            'items' => array(
                /**
                 * Bean (class) responsible for persisting the items from the site. It has
                 * to implement the Scrapable interface, which takes care of any kind of
                 * CRUD operations on the items obtained.
                 *
                 * A default implementation called "Scrapper" is also provided; it writes
                 * the items to a sqlite db called "scrapper.db", found in the data folder
                 * at the $SCRAPPER_ROOT path.
                 *
                 * @optional If this attribute is absent, Scrapper is implicitly used.
                 */
                '_bean' => 'AGMarket',

                /**
                 * XPath that locates the base content node. Every item uses it as the
                 * base (contextual) node when extracting its own content.
                 *
                 * @TODO When this XPath yields no items, log the result and alert the
                 *       admin, since the site structure may have been modified.
                 *
                 * @TODO The libxml implementation of XPath could not be made to work in
                 *       PHP, so page URLs are queued in the datastore and another process
                 *       is spawned to extract the content through YQL. It may sound
                 *       creepy, but it is the best workaround available until the libxml
                 *       implementation in PHP 5.3.2(-ubuntu) has been investigated.
                 *
                 * @optional NOTE(review): the original note on what happens when this value
                 *           is absent was left unfinished ("the content used") -- confirm
                 *           the fallback behaviour.
                 */
                '_base_xpath' => '//table/tr[position() > 7]',

                /**
                 * List of properties that need to be extracted from the URL.
                 *
                 * Every property to be extracted must carry this attribute:
                 *   1. xpath => XPath query used to identify the object in the page.
                 *      @required
                 */
                'props' => array(
                    'name' => array(
                        'xpath' => 'td[1]',
                    ),
                    // Max price is stored with the tag value in price as 1
                    'max_price' => array(
                        'xpath' => 'td[2]',
                    ),
                    // Maps location containing max_price
                    'location1' => array(
                        'xpath' => 'td[2]/a/@title',
                    ),
                    // Min price is stored with the tag value in price as 2
                    'min_price' => array(
                        'xpath' => 'td[3]',
                    ),
                    // Maps location containing min_price
                    'location2' => array(
                        'xpath' => 'td[3]/a/@title',
                    ),
                ),
            ),
        ),
    ),
);