<a href="https://colab.research.google.com/github/vishnupancharatnala/Rebit/blob/main/NLP_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup

class Website:
    """
    A utility class to represent a website that we have scraped
    """

    url:str # stores website url
    title:str # stores website title
    text:str # store webpage's main text content

    def __init__(self,url): # constructor takes url parameter

        '''
        Create this website object from the given url using Beautiful Soup
        '''
        self.url = url
        response = requests.get(url) # make HTTP GET request to fetch webpage content of the url
        soup = BeautifulSoup(response.content, 'html.parser')
        # creates beautiful soup object to parse HTML content, easy to navigate the HTML structure
        self.title = soup.title.string if soup.title else "No title"
        # soup.title finds the <title> tag
        # .string gets its text content
        # If there's no title, defaults to "No title"

        for irrelevant in soup.body(["script","img","input"]):
            irrelevant.decompose()
        self.text = soup.body.get_text(strip=True)


In [2]:
xy = Website("https://www.rbi.org.in/")
print(xy.title)

# extract text

print(xy.text)


	Reserve Bank of India

Skip to main content|LanguageहिंदीHomeAbout UsNotificationsPress ReleasesSpeeches & Media InteractionsSpeechesMedia InteractionsMemorial LecturesPublicationsAnnualHalf-YearlyQuarterlyBi-monthlyMonthlyWeeklyOccasionalReportsWorking PapersLegal FrameworkActRulesRegulationsSchemesResearchExternal Research SchemesRBI Occasional PapersWorking PapersRBI BulletinHistoryDRG StudiesKLEMSState Statistics and FinancesStatisticsData ReleasesDatabase on Indian EconomyPublic Debt StatisticsRegulatory ReportingList of ReturnsData DefinitionValidation rules/ TaxonomyList of RBI Reporting PortalsFAQs of RBI Reporting PortalsPREAMBLE“to regulate the issue of Bank notes and keeping of reserves with a view to securing
                    monetary stability in India and generally to operate the currency and credit system
                    of the country to its advantage; to have a modern monetary policy framework to meet
                    the challenge of an increasingly c

In [3]:
import spacy
import pandas as pd
import numpy as np

In [4]:
nlp = spacy.blank('en')

doc = nlp(xy.text)

In [5]:
for token in doc[:50]:
  print(token)

Skip
to
main
content|LanguageहिंदीHomeAbout
UsNotificationsPress
ReleasesSpeeches
&
Media
InteractionsSpeechesMedia
InteractionsMemorial
LecturesPublicationsAnnualHalf
-
YearlyQuarterlyBi
-
monthlyMonthlyWeeklyOccasionalReportsWorking
PapersLegal
FrameworkActRulesRegulationsSchemesResearchExternal
Research
SchemesRBI
Occasional
PapersWorking
PapersRBI
BulletinHistoryDRG
StudiesKLEMSState
Statistics
and
FinancesStatisticsData
ReleasesDatabase
on
Indian
EconomyPublic
Debt
StatisticsRegulatory
ReportingList
of
ReturnsData
DefinitionValidation
rules/
TaxonomyList
of
RBI
Reporting
PortalsFAQs
of
RBI
Reporting
PortalsPREAMBLE“to
regulate
the
issue


In [6]:
# creating the tokens data frame
li = ['alphabet','number','currency','punctuation']

def function(token):
  res = [0,0,0,0]
  if token.is_alpha:
    res[0] = 1
  if token.like_num:
    res[1] = 1
  if token.is_currency:
    res[2] = 1
  if token.is_punct:
    res[3] = 1
  return res

df = list(map(function,doc))

df = pd.DataFrame(df,columns=li)


In [7]:
df

Unnamed: 0,alphabet,number,currency,punctuation
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,0,0,0,0
4,1,0,0,0
...,...,...,...,...
2124,1,0,0,0
2125,1,0,0,0
2126,1,0,0,0
2127,1,0,0,0


In [8]:
token = [token for token in doc]
token = np.array(token)
df.index = token

In [9]:
df

Unnamed: 0,alphabet,number,currency,punctuation
Skip,1,0,0,0
to,1,0,0,0
main,1,0,0,0
content|LanguageहिंदीHomeAbout,0,0,0,0
UsNotificationsPress,1,0,0,0
...,...,...,...,...
of,1,0,0,0
CommunicationReserve,1,0,0,0
Bank,1,0,0,0
of,1,0,0,0


In [10]:
df.shape

(2129, 4)

In [11]:
# creating vocubulary
dic = set()
for token in doc:
  if token.text not in dic:
    dic.add(token)



# dic = np.array(dic)
dic

{
                     ,
 under,
 well,
 
                     ,
 -,
 Banks,
 of,
 ReleasesSpeeches,
 ),
 and,
 ’s,
 2020,
 #,
 of,
 re,
 MemosHistory,
 Framework,
 -,
 UTsConversion,
 Rate,
 the,
 -,
 Devices,
 Act-,
 Agriculture,
 Monetary,
 Standards,
 With,
 media,
 amount,
 %,
 SchemesRBI,
 2020,
 Package,
 RecordsCo,
 at,
 Targeted,
 ),
 maintain,
 Reserve,
 31,
 Resolution,
 the,
 “,
 6.75%Fixed,
 Classification,
 bifurcated,
 Hindi,
 on,
 monthly,
 Regulatory,
 Fisheries,
 Disclosure,
 advances,
 India,
 %,
 7,
 will,
 -,
 (,
 Inclusive,
 Way,
 ”1.CurrentRatesPolicy,
 Crop,
 features,
 and,
 Lecture,
 FinalRBI,
 of,
 least,
 Subvention,
 public,
 Committee,
 %,
 Regulatory,
 shipment,
 browsers,
 BankingOverviewSustainable,
 by,
 -,
 
                     ,
 yourselves,
 of,
 -,
 Term,
 @,
 Resolution,
 Extended,
 site,
 Century,
 Operations,
 Meeting,
 -,
 Term,
 /,
 M.,
 (,
 Resolution,
 operate,
 url,
 III,
 of,
 Term,
 
                     ,
 while,
 and,
 the,
 Minister,


In [12]:
df2 = list(map(function,dic))

df2 = pd.DataFrame(df2,columns=li)

ind = np.array(list(dic))
df2.index = ind

In [13]:
df2

Unnamed: 0,alphabet,number,currency,punctuation
\r\n,0,0,0,0
under,1,0,0,0
well,1,0,0,0
\r\n,0,0,0,0
-,0,0,0,1
...,...,...,...,...
feedback,1,0,0,0
),0,0,0,1
Managers,1,0,0,0
ReleaseSpeechesBanker,1,0,0,0
