Update scraper.php

vikash12345 · Feb 16, 2018 · c14364e · c14364e
1 parent a30d5fd
commit c14364e
Showing 1 changed file with 89 additions and 24 deletions.
diff --git a/scraper.php b/scraper.php
@@ -1,27 +1,92 @@
-<?
+<?php
 // This is a template for a PHP scraper on morph.io (https://morph.io)
 // including some code snippets below that you should find helpful
-
-// require 'scraperwiki.php';
-// require 'scraperwiki/simple_html_dom.php';
-//
-// // Read in a page
-// $html = scraperwiki::scrape("http://foo.com");
-//
-// // Find something on the page using css selectors
-// $dom = new simple_html_dom();
-// $dom->load($html);
-// print_r($dom->find("table.list"));
-//
-// // Write out to the sqlite database using scraperwiki library
-// scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
-//
-// // An arbitrary query against the database
-// scraperwiki::select("* from data where 'name'='peter'")
-
-// You don't have to do things with the ScraperWiki library.
-// You can use whatever libraries you want: https://morph.io/documentation/php
-// All that matters is that your final data is written to an SQLite database
-// called "data.sqlite" in the current working directory which has at least a table
-// called "data".
+require 'scraperwiki.php';
+require 'scraperwiki/simple_html_dom.php';
+
+// Scrapes Allahabad High Court search results from indiankanoon.org.
+// Walk: browse page -> court page -> year pages -> month pages -> numbered
+// result pages; every result becomes one row saved via scraperwiki::save().
+//
+// file_get_html() returns false when a page cannot be loaded and find(..., 0)
+// returns null when nothing matches, so both are checked before use instead
+// of dereferencing them blindly.
+$browser = file_get_html('https://indiankanoon.org/browse');
+if (!$browser) {
+	echo "Could not load https://indiankanoon.org/browse\n";
+	exit(1);
+}
+
+foreach ($browser->find("//td/div[@class='browselist']/") as $court) {
+	// Only the browse entry whose link text starts with the court name is followed.
+	$courtanchor = $court->find("a[plaintext^=Allahabad High Court]", 0);
+	if (!$courtanchor || !$courtanchor->href) {
+		continue;
+	}
+	$pagetext = $courtanchor->plaintext;
+
+	sleep(5);
+	$link        = 'https://indiankanoon.org/' . $courtanchor->href;
+	$pageofyears = file_get_html($link);
+	if (!$pageofyears) {
+		continue;
+	}
+
+	foreach ($pageofyears->find("/html/body/div[2]/table/tbody/tr/td/div[@class='browselist']") as $year) {
+		$yearanchor = $year->find("a", 0);
+		if (!$yearanchor || !$yearanchor->href) {
+			continue;
+		}
+		$pagelink     = 'https://indiankanoon.org' . $yearanchor->href;
+		$openyearpage = file_get_html($pagelink);
+		if (!$openyearpage) {
+			continue;
+		}
+
+		foreach ($openyearpage->find("//td/div[@class='browselist']") as $month) {
+			$monthanchor = $month->find("a", 0);
+			if (!$monthanchor || !$monthanchor->href) {
+				continue;
+			}
+			// The month link is used as a URL directly, so spaces are percent-encoded.
+			$urlofpage = str_replace(" ", "%20", 'https://indiankanoon.org' . $monthanchor->href);
+
+			// Request result pages 0, 1, 2, ... until one fails to load or lacks
+			// the form input the scraper uses as its "more results" marker.
+			for ($pagenum = 0; ; $pagenum++) {
+				$paginationlink     = $urlofpage . '&pagenum=' . $pagenum;
+				$mainpageofprofiles = file_get_html($paginationlink);
+				sleep(5);
+				$lastpage = !$mainpageofprofiles || !$mainpageofprofiles->find("/html/body/div/div[3]/form/input[3]", 0);
+				$results  = $lastpage ? array() : $mainpageofprofiles->find("//div/div/div[@class='result']");
+				foreach ($results as $result) {
+					$title   = $result->find("//a[@class='result_url']", 0); // case name + link
+					$source  = $result->find("div[@class='docsource']", 0);  // court name
+					$citetag = $result->find("a[@class='cite_tag']", 0);     // citation text + link
+
+					// A missing element is stored as null rather than raising a warning.
+					$record = array(
+						'vsname'         => $title ? $title->plaintext : null,
+						'link'           => $link,
+						'pagelink'       => $pagelink,
+						'urlofpage'      => $urlofpage,
+						'lvsname'        => $title ? $title->href : null,
+						'courtname'      => $source ? $source->plaintext : null,
+						'cite'           => $citetag ? $citetag->plaintext : null,
+						'lcite'          => $citetag ? $citetag->href : null,
+						'paginationlink' => $paginationlink,
+					);
+					scraperwiki::save(array_keys($record), $record);
+				}
+				if ($mainpageofprofiles) {
+					$mainpageofprofiles->clear(); // release the parsed DOM before the next page
+				}
+				if ($lastpage) {
+					echo "Scraper Inprogress don't stop -> $pagetext\n";
+					break;
+				}
+			}
+		}
+	}
+}
 ?>