# Processing Constitution

Process the Constitution of India and extract all of its Parts, Chapters and Sections.

`constitutionofindia.net` contains the full text of the Constitution in a well-structured format.

In [1]:
import os
import json
import os
import re
from bs4 import BeautifulSoup as bs
import requests
from collections import defaultdict

In [2]:
# Root page of the Constitution on the site; every other URL in this notebook is built from it.
base_url = "https://www.constitutionofindia.net/constitution_of_india"

The base URL provides links to all the Parts of the Constitution. The website structure can be used to get the overall document structure.

The base URL also provides a URL for each Part; the page behind that URL lists the Articles of that Part.

In [3]:
# Fetch the landing page that links to the Preamble and every Part / Schedule.
# requests waits indefinitely by default, so an explicit timeout keeps a
# stalled connection from hanging the notebook.
response = requests.get(base_url, timeout=30)

In [4]:
# Display the HTTP status of the landing-page request.
response.status_code

200

In [5]:
# Parse the landing page. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and warns),
# so the resulting tree could differ from one environment to another.
soup = bs(response.content, "html.parser")

In [6]:
# Collect every <div class="card"> on the landing page; the next cell filters
# them down to the Preamble, Part and Schedule cards.
parts = soup.find_all("div", class_="card")

In [7]:
# Map each Part / Schedule label (and the Preamble) to the path of its page,
# relative to base_url.
part_info = {}
for part in parts:
    card_text = part.text
    # Only cards for the Preamble, a Part or a Schedule are of interest.
    if not any(word in card_text for word in ("Part", "Preamble", "Schedule")):
        continue

    label = part.find("div", class_="card-label")
    link = part.find("a", href=True)
    if label is None or link is None:
        # A card without a label or a link cannot be indexed; skip it rather
        # than fail with an AttributeError / TypeError on None.
        continue

    # "Part 4A" -> "4A"; "Preamble" is kept as is. The word "Part" is removed
    # with replace() for every card: str.strip("Part") would instead strip
    # any of the characters P/a/r/t from both ends of the label.
    key = label.text.replace("Part", "").replace(" ", "")

    # An href such as "/constitution_of_india/preamble" splits into
    # ["", "constitution_of_india", "preamble"]; keep what follows the
    # site section so it can be appended to base_url.
    part_info[key] = "/".join(link["href"].split("/")[2:])

In [8]:
# Display the label -> relative URL path mapping built above.
part_info

{'Preamble': 'preamble',
 '1': 'the_union_and_its_territory/articles',
 '2': 'citizenship/articles',
 '3': 'fundamental_rights/articles',
 '4': 'directive_principles_of_state_policy/articles',
 '4A': 'fundamental_duties/articles',
 '5': 'the_union/articles',
 '6': 'the_states/articles',
 '7': 'the_states_in_part_b_of_the_first_schedule_/articles',
 '8': 'the_union_territories/articles',
 '9': 'the_panchayats/articles',
 '9A': 'the_municipalities/articles',
 '9B': 'the_cooperative_societies/articles',
 '10': 'the_scheduled_and_tribal_areas/articles',
 '11': 'relations_between_the_union_and_the_states/articles',
 '12': 'finance__property__contracts_and_suits/articles',
 '13': 'trade___commerce_and_intercourse_within_the_territory_of_india/articles',
 '14': 'services_under_the_union_and_the_states/articles',
 '14A': 'tribunals/articles',
 '15': 'elections/articles',
 '16': 'special_provisions_relating_to_certain_classes/articles',
 '17': 'official_language/articles',
 '18': 'emergency_pro

Getting the page information for each Part and Schedule

In [9]:
# Will hold the raw HTML of each Part / Schedule page, keyed by the same labels as part_info.
parts_and_schedules = {}

In [16]:
# Download the page of every Part / Schedule and keep its raw HTML bytes.
for ele, link in part_info.items():
    url = base_url + "/" + link
    # Explicit timeout: requests would otherwise wait forever on a stalled
    # connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of silently storing an error page
    # that the parsing cells would later choke on.
    response.raise_for_status()
    parts_and_schedules[ele] = response.content

In [14]:
# NOTE(review): this prints the complete raw HTML of every downloaded page;
# consider showing only the keys or a short slice to keep the output readable.
print(parts_and_schedules)

{'Preamble': b'<!DOCTYPE html><html><head>\n<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"2230d2f544","applicationID":"138780646","transactionName":"d1xcQRBZCl1cFhxTC1pARlwWQxJYVgpsXwJrWlxRC1c5RUAUVkNLREFXVA9UClQ=","queueTime":14,"applicationTime":24,"agent":""}</script>\n<script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"2230d2f544",applicationID:"138780646"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(){}function i(t,e,n,r){return function(){return s.recordSupportability("API/"+e+"/called"),o(t+e,[u.now

In [31]:
# NOTE(review): all_data is not referenced again in the visible cells — confirm it is still needed.
all_data = defaultdict(dict)
# Maps an article label to a dict of details about that article; filled in by later cells.
article_info = {}

In [32]:
# Build the article index: for every Part page, record each article's label,
# the title of the Part it belongs to and the absolute URL of the article page.
for ele, html in parts_and_schedules.items():
    # Name the parser so the tree does not depend on which parsers are installed.
    s = bs(html, "html.parser")

    article_list = s.find("div", class_="row article-box")
    if article_list is None:
        # Pages without an article grid contribute no articles. This check
        # now runs before the banner is parsed, so such pages are skipped
        # without touching their banner at all.
        continue

    # The Part title is taken as the text after the last "-" in the banner.
    title = s.find("div", class_="banner-text-wrapper")
    title = title.text.split("-")[-1].strip()

    for a in article_list.find_all("div", class_="card"):
        # Strip the label: the site emits stray whitespace on some cards
        # (e.g. "Article 62 "), which would otherwise end up in the dict key.
        label = a.find("div", class_="card-label").text.strip()
        print(label)
        href = a.find("a", href=True)["href"]
        # Drop the leading "" and site-section components of the href and
        # re-anchor the remainder on base_url.
        article_info[label] = {"part": title,
                               "href": base_url + "/" + "/".join(href.split("/")[2:])}

Article 1
Article 2
Article 3
Article 4
Article 5
Article 6
Article 7
Article 8
Article 9
Article 10
Article 11
Article 12
Article 13
Article 14
Article 15
Article 16
Article 17
Article 18
Article 19
Article 20
Article 21
Article 21A
Article 22
Article 23
Article 24
Article 25
Article 26
Article 27
Article 28
Article 29
Article 30
Article 31
Article 31A
Article 31B
Article 31C
Article 31D
Article 32
Article 32A
Article 33
Article 34
Article 35
Article 36
Article 37
Article 38
Article 39
Article 39A
Article 40
Article 41
Article 42
Article 43
Article 43A
Article 44
Article 45
Article 46
Article 47
Article 48
Article 48A
Article 49
Article 50
Article 51
Article 51A
Article 52
Article 53
Article 54
Article 55
Article 56
Article 57
Article 58
Article 59
Article 60
Article 61
Article 62 
Article 63
Article 64
Article 65
Article 66
Article 67
Article 68
Article 69
Article 70
Article 71
Article 72
Article 73
Article 74
Article 75
Article 76
Article 77
Article 78
Article 79
Article 80
Article 

In [33]:
# Display the article index (label -> Part title and article URL).
article_info

{'Article 1': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%201'},
 'Article 2': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%202'},
 'Article 3': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%203'},
 'Article 4': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%204'},
 'Article 5': {'part': 'Citizenship',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/citizenship/articles/Article%205'},
 'Article 6': {'part': 'Citizenship',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/citizenship/articles/Article%206'},
 'Article 7': 

In [115]:
# Exploratory lookup of the card grid on the landing page.
# NOTE(review): x is not used by any other visible cell — looks like leftover debugging; confirm before keeping.
x = soup.find("div", class_="row article-box")

In [116]:
# Display the element found above.
x

<div class="row article-box"><div class="col-md-2 col-xs-6 col-md-offset-1"><div class="card"><div class="hero-wrapper"><div class="image constitution-type-image" style="background-image: url(https://cadindia-production.s3.amazonaws.com/uploads/constitution_type/constitution_image/1/Part-_box.png);"></div><div class="card-label">Preamble</div></div><div class="content"><h3>Preamble</h3><div class="read-more"><a href="/constitution_of_india/preamble"><div class="pull-left">Explore</div><div class="fa fa-arrow-right pull-left element-spacer-left-10"></div></a></div></div></div></div><div class="col-md-2 col-xs-6"><div class="card"><div class="hero-wrapper"><div class="image constitution-type-image" style="background-image: url(https://cadindia-production.s3.amazonaws.com/uploads/constitution_type/constitution_image/2/Part-_box.png);"></div><div class="card-label">Part 1</div></div><div class="content"><h3>The Union and its Terri...</h3><div class="read-more"><a href="/constitution_of_ind

In [49]:
# Download every article page and add the article's title and text to its
# entry in article_info.
for article, content in article_info.items():
    # Explicit timeout and status check: do not hang on a stalled connection
    # and do not parse an HTTP error page as if it were an article.
    response = requests.get(content["href"], timeout=30)
    response.raise_for_status()
    s = bs(response.content, "html.parser")

    # The article title is taken as the text after the last "-" in the banner.
    title = s.find("div", class_="banner-text-wrapper")
    title = title.text.split("-")[-1].strip()

    description = s.find("div", class_="description-wrapper")
    text = ""
    for line in description.find_all("p"):
        # Stop at the first paragraph that mentions "Debate". The check must
        # look at the paragraph's text: `"Debate" in line` on a Tag tests
        # membership in its list of children, not substring containment.
        if "Debate" in line.text:
            break
        # Each paragraph is prefixed with a space (so the text starts with one).
        text += " " + line.text

    # Only values of existing keys change here, so updating while iterating
    # over items() is safe.
    article_info[article].update({
        "title": title,
        "text": text})

In [52]:
# Normalise the whitespace of every article's text in place.
for article in article_info:
    text = article_info[article]["text"]
    # Remove non-breaking spaces outright (a literal removal needs no regex).
    # The former encode/decode round trip through UTF-8 left the string
    # unchanged and has been dropped.
    text = text.replace("\xa0", "")
    # Collapse every run of whitespace into a single space. The pattern is a
    # raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    article_info[article]["text"] = text

In [51]:
# Keep an independent deep copy of article_info under the name art_info.
# NOTE(review): this cell's execution count (51) is lower than the clean-up
# cell's (52), so it was presumably run as a pre-clean-up backup; in file
# order it runs after the clean-up — confirm the intended position.
# NOTE(review): the import would normally live in the import cell at the top.
from copy import deepcopy
art_info = deepcopy(article_info)

In [53]:
# Display the article entries, now including each article's title and text.
article_info

{'Article 1': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%201',
  'title': 'Name and territory of the Union',
  'text': ' (1) India, that is Bharat, shall be a Union of States. (2) The States and the territories thereof shall be as specified in the First Schedule. (3) The territory of India shall comprise — (a) the territories of the States; (b) the Union territories specified in the First Schedule; and (c) such other territories as may be acquired.'},
 'Article 2': {'part': 'The Union and its Territory',
  'href': 'https://www.constitutionofindia.net/constitution_of_india/the_union_and_its_territory/articles/Article%202',
  'title': 'Admission or establishment of new States',
  'text': ' Parliament may by law admit into the Union, or establish, new States on such terms and conditions as it thinks fit.'},
 'Article 3': {'part': 'The Union and its Territory',
  'href': 'https://

In [55]:
# Write the scraped articles to disk as JSON.
# The destination defaults to the original location but can be overridden
# through the CONSTITUTION_JSON_PATH environment variable, so the notebook is
# not tied to one machine's directory layout.
output_path = os.environ.get(
    "CONSTITUTION_JSON_PATH",
    "/home/workboots/Datasets/IndiaCode/new/CentralActs/act_chapter_section_info/constitution.json",
)
# Explicit encoding so the file does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(article_info, f, indent=4)