# Imports

In [1]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from bs4 import BeautifulSoup
import requests
import pandas as pd
import validators
from urllib.parse import urlparse
import json
import string
from string import digits
from collections import Counter
import re
import os

In [2]:
%load_ext dotenv
%dotenv

# Initializing the Google Sheets API

In [3]:
# The ID and range of a sample spreadsheet.
SPREADSHEET_ID = os.getenv("SPREADSHEET_ID")
GENERAL_RANGE = "GENERAL!C:S"
AGRI_RANGE = "AGRICULTURE!C:S"
FIN_RANGE = "FINANCE!C:S"
DUP_RANGE = "DUPLICATES!A1:A1005"
UN_RANGE = "UNADDED!A:B"
EX_RANGE = "EXTRA!A1:A10001"
RANGES = [AGRI_RANGE, FIN_RANGE, GENERAL_RANGE]

# Accessing Sheets, finding duplicates and unadded links

## Generating token to access the API

In [4]:
# If modifying these scopes, delete the file write-token.json and read-token.json

# SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"] # For read-only scope
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]  # For writing to sheet

In [5]:
creds = None

if os.path.exists("write-token.json"):
    creds = Credentials.from_authorized_user_file("write-token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open("write-token.json", "w") as token:
        token.write(creds.to_json())

service = build("sheets", "v4", credentials=creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=866438115849-109dmktdu3k3c8su4ict16qf3m8ehqal.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A60549%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=FkuxdTFIkt6kJmqjoDx6lGYvrNwU76&access_type=offline


## Getting Sitemaps from Gurturgoth and Anjor

In [6]:
# Gurturgoth Sitemaps
gg_sitemaps = [
    f"https://hanka.gurturgoth.com/post-sitemap{i}.xml" for i in range(1, 6 + 1)
]
gg_sitemaps

['https://hanka.gurturgoth.com/post-sitemap1.xml',
 'https://hanka.gurturgoth.com/post-sitemap2.xml',
 'https://hanka.gurturgoth.com/post-sitemap3.xml',
 'https://hanka.gurturgoth.com/post-sitemap4.xml',
 'https://hanka.gurturgoth.com/post-sitemap5.xml',
 'https://hanka.gurturgoth.com/post-sitemap6.xml']

In [7]:
# Anjor Sitemaps
an_sitemaps = [
    f"https://www.anjor.online/sitemap.xml?page={i}" for i in range(1, 13 + 1)
]
an_sitemaps

['https://www.anjor.online/sitemap.xml?page=1',
 'https://www.anjor.online/sitemap.xml?page=2',
 'https://www.anjor.online/sitemap.xml?page=3',
 'https://www.anjor.online/sitemap.xml?page=4',
 'https://www.anjor.online/sitemap.xml?page=5',
 'https://www.anjor.online/sitemap.xml?page=6',
 'https://www.anjor.online/sitemap.xml?page=7',
 'https://www.anjor.online/sitemap.xml?page=8',
 'https://www.anjor.online/sitemap.xml?page=9',
 'https://www.anjor.online/sitemap.xml?page=10',
 'https://www.anjor.online/sitemap.xml?page=11',
 'https://www.anjor.online/sitemap.xml?page=12',
 'https://www.anjor.online/sitemap.xml?page=13']

In [8]:
def get_sitemap_links(sitemaps):
    """
    Returns all the URLs found in the sitemaps
    """

    all_urls = set()
    header = {
        "Accept": "*/*",
        "Accept-Language": "*",
        "Accept-Encoding": "identity, gzip, deflate, compress, br",
        "User-Agent": "XY",
    }

    for sitemap in sitemaps:
        r = requests.get(sitemap, headers=header)
        xml = r.text

        soup = BeautifulSoup(xml)
        URLTags = soup.find_all("url")

        print(f"{sitemap}: {len(URLTags)} urls found")

        for URL in URLTags:
            all_urls.add(URL.findNext("loc").text)

    print(f"Total: {len(all_urls)} links found")

    return list(all_urls)

In [9]:
gg_links = get_sitemap_links(gg_sitemaps)

https://hanka.gurturgoth.com/post-sitemap1.xml: 1000 urls found
https://hanka.gurturgoth.com/post-sitemap2.xml: 1000 urls found
https://hanka.gurturgoth.com/post-sitemap3.xml: 1000 urls found
https://hanka.gurturgoth.com/post-sitemap4.xml: 1000 urls found
https://hanka.gurturgoth.com/post-sitemap5.xml: 1000 urls found
https://hanka.gurturgoth.com/post-sitemap6.xml: 57 urls found
Total: 5057 links found


In [10]:
an_links = get_sitemap_links(an_sitemaps)

https://www.anjor.online/sitemap.xml?page=1: 150 urls found
https://www.anjor.online/sitemap.xml?page=2: 150 urls found
https://www.anjor.online/sitemap.xml?page=3: 150 urls found
https://www.anjor.online/sitemap.xml?page=4: 150 urls found
https://www.anjor.online/sitemap.xml?page=5: 150 urls found
https://www.anjor.online/sitemap.xml?page=6: 150 urls found
https://www.anjor.online/sitemap.xml?page=7: 150 urls found
https://www.anjor.online/sitemap.xml?page=8: 150 urls found
https://www.anjor.online/sitemap.xml?page=9: 150 urls found
https://www.anjor.online/sitemap.xml?page=10: 150 urls found
https://www.anjor.online/sitemap.xml?page=11: 150 urls found
https://www.anjor.online/sitemap.xml?page=12: 150 urls found
https://www.anjor.online/sitemap.xml?page=13: 71 urls found
Total: 1871 links found


## Save the sitemaps as CSVs

In [11]:
def save_as_csv(filename, links, foldername=""):
    """
    Saves the sitemaps as CSV files
    filename: Name of the file to save the csv as
    links: a collection of links to save in the csv
    foldername: Name of the folder to save the file
    """
    df = pd.DataFrame(data={"links": list(links)})
    if foldername:
        if not os.path.exists(foldername):
            os.mkdir(foldername)
        df.to_csv(os.path.join(foldername, filename), sep=",", index=False)
    else:
        df.to_csv(filename, sep=",", index=False)

In [12]:
save_as_csv("anjor.csv", an_links, "sitemaps")
save_as_csv("gurtur.csv", gg_links, "sitemaps")

## Get the data summary from Google Sheets

In [13]:
# Call the Sheets API
sheet = service.spreadsheets()
result = sheet.values().batchGet(spreadsheetId=SPREADSHEET_ID, ranges=RANGES).execute()
ranges = result.get("valueRanges", [])

sheet_links = set()
duplicate_links = set()
link_count = 0

if not ranges:
    print("No data found.")
else:
    print(f"{len(ranges)} ranges retrieved.")
    for single_range in ranges:
        range_count = 0
        for row in single_range["values"]:
            if len(row) != 0:
                for item in row:
                    if validators.url(item):
                        range_count += 1
                        if item in sheet_links:
                            duplicate_links.add(item)
                        else:
                            sheet_links.add(item)
        print(f"Total links in {single_range['range']}:\t{range_count}")
        link_count += range_count
    print()
    print(f"Total links in sheet:\t{link_count}")
    print(f"Unique links in sheet:\t{len(sheet_links)}")
    print(f"Duplicate links:\t{len(duplicate_links)}")

3 ranges retrieved.
Total links in AGRICULTURE!C1:S1236:	358
Total links in FINANCE!C1:S1234:	234
Total links in GENERAL!C1:S1943:	820

Total links in sheet:	1412
Unique links in sheet:	1412
Duplicate links:	0


In [14]:
# Verify duplicates
count = 0
for link in list(duplicate_links):
    if link in sheet_links:
        count += 1
print(count)

0


In [15]:
# Saving the duplicates
save_as_csv("duplicates.csv", duplicate_links, "duplicates")

## Writing duplicates to Google Sheets [Caution: Can overwrite to sheet]

In [16]:
# Writing to Google Sheets
values = [["DUPLICATES"]]

for value in list(duplicate_links):
    values.append([value])

len_values = len(values)

for _ in range(len_values, 1000 + 1):
    values.append([""])

body = {"values": values}

value_input_option = "USER_ENTERED"

In [17]:
result = (
    service.spreadsheets()
    .values()
    .update(
        spreadsheetId=SPREADSHEET_ID,
        range=DUP_RANGE,
        valueInputOption=value_input_option,
        body=body,
    )
    .execute()
)
print(f"{result.get('updatedCells')} cells updated.")

1001 cells updated.


## Finding links Gurturgoth and Anjor that are not in the sheet

In [18]:
# Getting the un-needed extra links
result = (
    service.spreadsheets()
    .values()
    .get(spreadsheetId=SPREADSHEET_ID, range=EX_RANGE)
    .execute()
)
extra = result.get("values", [])
print(f"{len(extra)} rows retrieved.")

47 rows retrieved.


In [19]:
# Gurturgoth links not in sheet
unadded_gg_links = []
for link in gg_links:
    if link not in sheet_links and link not in extra:
        unadded_gg_links.append(link)

print(f"Number of Gurturgoth links not in sheet: {len(unadded_gg_links)}")

Number of Gurturgoth links not in sheet: 3850


In [20]:
# Anjor links not in sheet
unadded_an_links = []
for link in an_links:
    if link not in sheet_links and link not in extra:
        unadded_an_links.append(link)

print(f"Number of Anjor links not in sheet: {len(unadded_an_links)}")

Number of Anjor links not in sheet: 1712


In [21]:
save_as_csv("unadded_gg_links.csv", unadded_gg_links, "unadded_links")

In [22]:
save_as_csv("unadded_an_links.csv", unadded_an_links, "unadded_links")

## Writing unadded links to Sheet [Caution: Can overwrite to sheet]

In [23]:
# Writing to Google Sheets
values = [["GURTUR", "ANJOR"]]

for i, gg_value in enumerate(list(unadded_gg_links)):
    an_value = ""
    if i < len(unadded_an_links):
        an_value = unadded_an_links[i]
    values.append([gg_value, an_value])

len_values = len(values)

for _ in range(len_values, 5000 + 1):
    values.append(["", ""])

body = {"values": values}

value_input_option = "USER_ENTERED"

In [24]:
result = (
    service.spreadsheets()
    .values()
    .update(
        spreadsheetId=SPREADSHEET_ID,
        range=UN_RANGE,
        valueInputOption=value_input_option,
        body=body,
    )
    .execute()
)
print(f"{result.get('updatedCells')} cells updated.")

10002 cells updated.


# Extracting data from the collected links and cleaning them

## Generating token to access the API

In [25]:
# If modifying these scopes, delete the file write-token.json and read-token.json

SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets.readonly"
]  # For read-only scope
# SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]  # For writing to sheet

In [26]:
creds = None

if os.path.exists("read-token.json"):
    creds = Credentials.from_authorized_user_file("read-token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open("read-token.json", "w") as token:
        token.write(creds.to_json())

service = build("sheets", "v4", credentials=creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=866438115849-109dmktdu3k3c8su4ict16qf3m8ehqal.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A61210%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets.readonly&state=JAVOQTGIidz7BdlB79JKCM72xtwLNp&access_type=offline


## Helper Functions

In [27]:
def is_gurtur(link):
    parsed_uri = urlparse(link)
    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
    return result == "https://hanka.gurturgoth.com/"

In [28]:
def is_anjor(link):
    parsed_uri = urlparse(link)
    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
    return result == "https://www.anjor.online/"

In [29]:
def is_patrika(link):
    parsed_uri = urlparse(link)
    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
    return result == "https://www.patrika.com/"

In [30]:
# Tests
assert is_gurtur("https://hanka.gurturgoth.com/mrs-devendra-kumari-singhdev/") == True
assert is_gurtur("https://www.anjor.online/2021/05/baldi-bai.jpg.html") == False
assert is_anjor("https://www.anjor.online/2021/05/baldi-bai.jpg.html") == True
assert is_anjor("https://hanka.gurturgoth.com/mrs-devendra-kumari-singhdev/") == False
assert is_patrika("https://hanka.gurturgoth.com/mrs-devendra-kumari-singhdev/") == False
assert is_patrika("https://www.anjor.online/2021/05/baldi-bai.jpg.html") == False
assert (
    is_patrika("https://www.patrika.com/raipur-news/chattisgarhi-sahitya-6522552/")
    == True
)

In [31]:
num_chars = 128
devnagri_chars = []
for i in range(num_chars):
    devnagri_chars.append(chr(ord("ऀ") + i))

In [32]:
def has_devnagri(text):
    return any(dev_char in text for dev_char in devnagri_chars)

## Get data from Sheets API

In [33]:
def read_from_link(link):
    """
    Gets the data from page source and extracts the contents if the
    page belongs to one of the three sites (Gurtur,Anjor, Patrika)
    """
    header = {
        "Accept": "*/*",
        "Accept-Language": "*",
        "Accept-Encoding": "identity, gzip, deflate, compress, br",
        "User-Agent": "XY",
    }
    content = ""

    try:
        page = requests.get(link, headers=header)
        soup = BeautifulSoup(page.content, "html.parser")
    except:
        return content

    if is_gurtur(link):
        if content_soup := soup.find("div", class_="entry-content"):
            contents = []
            if tags := content_soup.find_all(["p", "span", "div"]):
                for tag in tags:
                    useful_text = ""
                    for text in tag.find_all(text=True):
                        if has_devnagri(text):
                            useful_text = " ".join([useful_text, text.strip()])
                    contents.append(useful_text)
            content = (" ").join(contents).strip()

    elif is_anjor(link):
        if content_soup := soup.find("div", class_="entry-content"):
            contents = []
            if tags := content_soup.find_all(["p", "span", "div"]):
                for tag in tags:
                    useful_text = ""
                    for text in tag.find_all(text=True):
                        if has_devnagri(text):
                            useful_text = " ".join([useful_text, text.strip()])
                    contents.append(useful_text)
            content = (" ").join(contents).strip()

    elif is_patrika(link):
        if content_soup := soup.find("div", class_="complete-story"):
            contents = []
            if tags := content_soup.find_all(["p", "span", "div"]):
                for tag in tags:
                    useful_text = ""
                    for text in tag.find_all(text=True):
                        if has_devnagri(text):
                            useful_text = " ".join([useful_text, text.strip()])
                    contents.append(useful_text)
            content = (" ").join(contents).strip()

    return content

In [36]:
def extract(ranges_list=["AGRI", "FIN", "GEN"], verbose=False):
    """
    Extracts the links from the given domain and stores them in json files
    """

    ranges_to_extract = []
    if "AGRI" in ranges_list:
        ranges_to_extract.append(AGRI_RANGE)
    if "FIN" in ranges_list:
        ranges_to_extract.append(FIN_RANGE)
    if "GEN" in ranges_list:
        ranges_to_extract.append(GENERAL_RANGE)

    # Call the Sheets API
    sheet = service.spreadsheets()
    result = (
        sheet.values()
        .batchGet(spreadsheetId=SPREADSHEET_ID, ranges=ranges_to_extract)
        .execute()
    )
    ranges = result.get("valueRanges", [])

    unscraped = []

    if not os.path.exists("check"):
        os.mkdir("check")
    check_f = open(os.path.join("check", "check_content.txt"), "w", encoding="utf-8")

    sheet_links = set()
    link_count = 0
    last_topic = ""

    if not ranges:
        print("No data found.")
    else:
        print(f"{len(ranges)} ranges retrieved\n")

        # Iterate over all domains in ranges
        for range_index, single_range in enumerate(ranges):
            print(f"In range {ranges_list[range_index]}\n")
            range_count = 0

            # Create folder if not already exists
            folder_name = single_range["range"].split("!")[0]
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

            for i, row in enumerate(single_range["values"]):
                index = 0
                if len(row) != 0 and i != 0:

                    # If topic exists in row
                    if row[0]:
                        last_topic = row[0]
                        subtopic = ""
                        if verbose:
                            print(f"Inside topic: {last_topic}")

                    # If subtopic exists in row
                    if len(row) > 1 and row[1] != "":
                        subtopic = row[1]
                        if verbose:
                            print(f"\tInside subtopic: {subtopic}")

                    for item in row:
                        if validators.url(item):
                            index += 1
                            range_count += 1

                            if (content := read_from_link(item)) == "":
                                unscraped.append(item)
                                continue
                            else:
                                check_f.write(item)
                                check_f.write("\n")
                                check_f.write(content)
                                check_f.write("\n\n")

                            data = {
                                "topic": last_topic,
                                "subtopic": subtopic,
                                "url": item,
                                "data": content,
                            }

                            if subtopic:
                                if not os.path.exists(
                                    os.path.join(folder_name, last_topic, subtopic)
                                ):
                                    os.makedirs(
                                        os.path.join(folder_name, last_topic, subtopic)
                                    )
                                with open(
                                    os.path.join(
                                        folder_name,
                                        last_topic,
                                        subtopic,
                                        f"{index}.json",
                                    ),
                                    "w",
                                    encoding="utf-8",
                                ) as f:
                                    json.dump(data, f)
                            else:
                                if not os.path.exists(
                                    os.path.join(folder_name, last_topic)
                                ):
                                    os.makedirs(os.path.join(folder_name, last_topic))
                                with open(
                                    os.path.join(
                                        folder_name, last_topic, f"{index}.json"
                                    ),
                                    "w",
                                    encoding="utf-8",
                                ) as f:
                                    json.dump(data, f)

            print(f"\nTotal links in {single_range['range']}:\t{range_count}\n")
            link_count += range_count

        print(f"\nTotal links in sheet:\t{link_count}")

    check_f.close()

    with open("unscraped.txt", "w", encoding="utf-8") as f:
        for link in unscraped:
            f.write(link)
            f.write("\n")

In [37]:
extract(["AGRI", "FIN", "GEN"], verbose=False)

3 ranges retrieved

In range AGRI


Total links in AGRICULTURE!C1:S1236:	358

In range FIN


Total links in FINANCE!C1:S1234:	234

In range GEN


Total links in GENERAL!C1:S1943:	820


Total links in sheet:	1412


## Verifying JSON data

In [38]:
f = open(os.path.join("AGRICULTURE", "Agricultural education", "1.json"))
sample_data = json.load(f)
sample_data

{'topic': 'Agricultural education',
 'subtopic': '',
 'url': 'https://hanka.gurturgoth.com/krishi-vigyan-kendra/',
 'data': 'बेमेतरा 19 मार्च 2021। कृषि विज्ञान केन्द्र, कृषि महाविद्यालय अउ अनुसंधान केन्द्र ढोलिया बेमेतरा के सापर तत्वधान म बिरस्पत 18 मार्च के जिला स्तरीय किसान मेला सह संगोष्ठी के आयोजन करे गइस। मेला के मुख्य उद्देश्य अलसी व दलहनी फसल अउ खरीफ/रबी फसल मन के बीज उत्पादन ल प्रोत्साहित करना रिहिन। (मेला अखिल भारतीय समन्वित अलसी अनुसंधान परियोजना, अखिल भारतीय समन्वित मुलार्प अनुसंधान परियोजना अउ राष्ट्रीय बीज परियोजना- मेगा सीड परियोजना डाहर ले प्रायोजित रिहिन।) कार्यक्रम म मुख्य अतिथि कृषि मंत्री श्री रविन्द्र चौबे  विशिष्ट अतिथि विधायक बेमेतरा श्री आशीष छाबड़ा, डा. एस. के. पाटील (कुलपति इंदिरागांधी कृषि विश्वविद्यालय रायपुर) डा. एस. सी. मुखर्जी निदेशक विस्तार सेवायें, इंदिरागांधी कृषि विश्वविद्यालय रायपुर, डा. आर. के. द्विवेदी अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र, कवर्धा, डाॅ. डी. एस. ठाकुर अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र, साजा, बंशी पटेल, श्रीमति प

## Search links from unadded, containing a particular substring

In [39]:
# Getting un-added links
result = (
    service.spreadsheets()
    .values()
    .get(spreadsheetId=SPREADSHEET_ID, range=UN_RANGE)
    .execute()
)
unadded = result.get("values", [])
print(f"{len(unadded)} rows retrieved.")

3851 rows retrieved.


In [40]:
gg_links = pd.read_csv(os.path.join("unadded_links", "unadded_an_links.csv"))
an_links = pd.read_csv(os.path.join("unadded_links", "unadded_gg_links.csv"))
combined_links = pd.concat([gg_links, an_links])

In [41]:
def find_links(combined_links, substring_list):
    """
    Gets all links from unadded that have a particular substring in them
    """
    matching_links = []
    for link in combined_links.iloc[:, 0].tolist():
        for substring in substring_list:
            if substring.lower() in link.lower():
                matching_links.append(link)
    return matching_links

In [42]:
# Testing
find_links(combined_links, ["kisan", "krishi"])

['https://www.anjor.online/2021/05/bhupesh-baghel-cm-cg-kisan-naya-yoajan.html',
 'https://www.anjor.online/2020/05/kheti-kisani.html',
 'https://www.anjor.online/2020/05/rajiv-gandhi-kisan-nyay-yojana.html',
 'https://www.anjor.online/2020/05/Rajiv-Gandhi-Kisan-Nyay-Yojana.html',
 'https://www.anjor.online/2020/04/kisan.html',
 'https://www.anjor.online/2021/03/rajiv-gandhi-kisan-nayay-yojana.html',
 'https://www.anjor.online/2020/08/bhupesh-baghel-kisan-yojana.html',
 'https://hanka.gurturgoth.com/rajiv-gandhi-kisan-nyay-yojana-to-be-launched-on-21st-may-in-chhattisgarh/',
 'https://hanka.gurturgoth.com/kondagaon-raghuram-kisan/',
 'https://hanka.gurturgoth.com/kisani-ke-goth/',
 'https://hanka.gurturgoth.com/sabal-kisan/',
 'https://hanka.gurturgoth.com/during-the-lockdown-the-bhoomgadi-mahila-kisan-group-served-food-access-to-the-house/',
 'https://hanka.gurturgoth.com/rajiv-gandhi-kisan-nyay-yojana/',
 'https://hanka.gurturgoth.com/kisan-nalkup-connection/',
 'https://hanka.gurtur

## Get links that have all topics, subtopics in them

In [43]:
# Call the Sheets API
sheet = service.spreadsheets()
result = sheet.values().batchGet(spreadsheetId=SPREADSHEET_ID, ranges=RANGES).execute()
ranges = result.get("valueRanges", [])

sheet_links = set()
link_count = 0
last_topic = ""

if not ranges:
    print("No data found.")
else:
    print(f"{len(ranges)} ranges retrieved.\n")

    # Iterate over all domains i.e Agri, Finance, General
    for single_range in ranges:
        range_list = set()

        for i, row in enumerate(single_range["values"]):
            index = 0
            if len(row) != 0 and i != 0:

                # If topic exists in row
                if row[0]:
                    range_list.add("-".join(row[0].strip(digits).lower().split()))
                    # print(f"Inside topic: {row[0]}")

                # If subtopic exists in row
                if len(row) > 1 and row[1] != "":
                    range_list.add("-".join(row[1].strip(digits).lower().split()))
                    # print(f"\tInside subtopic: {row[1]}")

        # Create file to store the lists
        file_name = os.path.join(
            "unadded_links",
            "unadded_" + single_range["range"].split("!")[0].lower() + ".txt",
        )
        with open(file_name, "w", encoding="utf-8") as f:
            for link in find_links(combined_links, list(range_list)):
                f.write(link)
                f.write("\n")
print("Done!")

3 ranges retrieved.

Done!


## Removing punctuations

In [44]:
check_content = open(
    os.path.join("check", "check_content.txt"), "r", encoding="utf-8"
).read()
len(check_content)

1815620

In [45]:
def remove_punct(text):
    text = re.sub(r"\s*[a-zA-Z]\s*", " ", text)  # Removing alphabets
    # text = re.sub('[0-9]', ' ' ,text) # Removing numbers
    # text = re.sub('[।]','. ',text) # Replacing पूर्ण विराम with full stop

    # text = re.sub("\.\s", "। ", text)

    text = re.sub(r"\s*’\s*", "'", text)
    text = re.sub(r"\s*‘\s*", "'", text)
    text = re.sub(r"\s*'\s*", "'", text)
    text = re.sub(r"\s*“\s*", '"', text)
    text = re.sub(r"\s*”\s*", '"', text)
    text = re.sub(r'\s*"\s*', '"', text)

    text = re.sub("\s*\xa0\s*", " ", text)

    text = re.sub(r"\s*[+*!?&^$|\\]+\s*", " ", text)
    text = re.sub(r"\s*[\([{})\]]\s*", " ", text)
    text = re.sub(r"\s*,\s*", " ", text)
    text = re.sub(r"\s*;\s*", " ", text)
    text = re.sub(r"\s*:\s*", " ", text)
    text = re.sub(r"\s*—\s*", " ", text)
    text = re.sub(r"\s*-\s*", " ", text)
    text = re.sub(r"\s*_\s*", " ", text)
    text = re.sub(r"\s*@\s*", " ", text)
    text = re.sub(r"\s*#\s*", " ", text)
    text = re.sub(r"\s*%\s*", " ", text)
    text = re.sub(r"\s*=\s*", " ", text)
    text = re.sub(r"\s*/\s*", " ", text)
    text = re.sub(r"\s*<\s*", " ", text)
    text = re.sub(r"\s*>\s*", " ", text)

    return text

In [46]:
# Testing
clean_content = remove_punct(check_content)
words = clean_content.split()
print(len(words))
print(len(set(words)))
words[0:15]

341951
21943


['.',
 '.',
 '2',
 '19',
 'मार्च',
 '2021।',
 'हमर',
 'प्रदेश',
 'के',
 'मुखिया',
 'दाऊ',
 'भूपेश',
 'बघेल',
 'ह',
 '21']

## Creating word counts

In [47]:
# Word count for entire vocabulary

dct = Counter()

word_count = 0
for word in words:
    word = word.strip()
    bad_words = "।‘’“”'\""
    word = word.translate(str.maketrans("", "", bad_words))
    word = word.strip(digits).strip(".")
    if has_devnagri(word):
        dct[word] = dct.get(word, 0) + 1
        word_count += 1

print(f"Word count: {word_count}")
print(f"Number of unique words: {len(dct)}")

if not os.path.exists("counts"):
    os.mkdir("counts")

with open(os.path.join("counts", "words.csv"), "w", encoding="utf-8") as f:
    f.write(f"word,count\n")
    for k, v in dct.most_common():
        f.write(f"{k},{v}\n")

Word count: 324879
Number of unique words: 19731


In [48]:
def create_word_count(domains):
    """
    Creates word counts for each domain
    """
    for domain in domains:
        word_count = 0
        count_dict = Counter()
        for root, dirs, files in os.walk(domain, topdown=True):
            for file in files:
                with open(os.path.join(root, file)) as f:
                    data = json.load(f)["data"]
                    cleaned_data = remove_punct(data)
                    words = cleaned_data.split()
                    for word in words:
                        word = word.strip()
                        bad_words = "।‘’“”'\""
                        word = word.translate(str.maketrans("", "", bad_words))
                        word = word.strip(digits).strip(".")
                        if has_devnagri(word):
                            count_dict[word] = count_dict.get(word, 0) + 1
                            word_count += 1
        if not os.path.exists("counts"):
            os.mkdir("counts")
        file_name = os.path.join("counts", domain.lower() + "_count.csv")
        print(f"Word count for {domain} is {word_count}")
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(f"word,count\n")
            for k, v in count_dict.most_common():
                f.write(f"{k},{v}\n")

In [49]:
create_word_count(["AGRICULTURE", "FINANCE", "GENERAL"])

Word count for AGRICULTURE is 88144
Word count for FINANCE is 64536
Word count for GENERAL is 169028


### Test: Check the number of files

In [50]:
# Check the number of files in the domains
def file_count(domains):
    links = []
    for domain in domains:
        file_count = 0
        for root, dirs, files in os.walk(domain):
            file_count += len(files)
            for file in files:
                with open(os.path.join(root, file)) as f:
                    links.append(json.load(f)["url"])
        print(f"{domain}: {file_count}")
    return links

In [51]:
fin_links = file_count(["FINANCE"])

FINANCE: 228


In [52]:
# Getting finance extra links
result = (
    service.spreadsheets()
    .values()
    .get(spreadsheetId=SPREADSHEET_ID, range=FIN_RANGE)
    .execute()
)
fins = result.get("values", [])
print(f"{len(fins)} rows retrieved.")

fin_links_sheet = []
for row in fins:
    for cell in row:
        if validators.url(cell):
            fin_links_sheet.append(cell)

415 rows retrieved.


In [53]:
print(len(fin_links), len(fin_links_sheet))

228 234


In [54]:
list(set(fin_links_sheet) - set(fin_links))

['https://36garhi.com/2021/05/26/%e0%a4%b8%e0%a5%8d%e0%a4%9f%e0%a5%80%e0%a4%b2-%e0%a4%89%e0%a4%a6%e0%a5%8d%e0%a4%af%e0%a5%8b%e0%a4%97-%e0%a4%ac%e0%a4%b0-%e0%a4%9b%e0%a4%a4%e0%a5%8d%e0%a4%a4%e0%a5%80%e0%a4%b8%e0%a4%97%e0%a5%9d/',
 'http://hanka.gurturgoth.com/%E0%A4%AE%E0%A5%81%E0%A4%82%E0%A4%97%E0%A5%87%E0%A4%B2%E0%A5%80-%E0%A4%95%E0%A5%87-%E0%A4%97%E0%A4%BE%E0%A4%82%E0%A4%B5-%E0%A4%AE%E0%A4%A8%E0%A4%95%E0%A5%80-%E0%A4%B8%E0%A5%8D%E0%A4%A5%E0%A4%BF/',
 'https://morchhattisgarhia.wordpress.com/2018/07/17/%E0%A4%B5%E0%A4%BF%E0%A4%95%E0%A4%BE%E0%A4%B8-%E0%A4%95%E0%A5%87-%E0%A4%AA%E0%A4%A5-%E0%A4%A4%E0%A5%87%E0%A4%9C%E0%A5%80-%E0%A4%B8%E0%A5%87-%E0%A4%A6%E0%A5%8C%E0%A5%9C%E0%A4%A4%E0%A5%87-%E0%A4%9B/',
 'https://morchhattisgarhia.wordpress.com/2018/05/10/%E0%A4%AC%E0%A5%87%E0%A4%B9%E0%A4%A4%E0%A4%B0-%E0%A4%B8%E0%A5%8D%E0%A4%B5%E0%A4%BE%E0%A4%B8%E0%A5%8D%E0%A4%A5%E0%A5%8D%E0%A4%AF-%E0%A4%B8%E0%A5%87%E0%A4%B5%E0%A4%BE%E0%A4%93%E0%A4%82-%E0%A4%B8/',
 'http://hanka.gurturgoth.com/%E0%A4%AE%E0%

## Separating Sentences

In [55]:
# Test
test_string = "बेमेतरा 19 मार्च 2021। कृषि विज्ञान केन्द्र, कृषि महाविद्यालय अउ अनुसंधान केन्द्र ढोलिया बेमेतरा के सापर तत्वधान म बिरस्पत 18 मार्च के जिला स्तरीय किसान मेला सह संगोष्ठी के आयोजन करे गइस। मेला के मुख्य उद्देश्य अलसी व दलहनी फसल अउ खरीफ/रबी फसल मन के बीज उत्पादन ल प्रोत्साहित करना रिहिन। (मेला अखिल भारतीय समन्वित अलसी अनुसंधान परियोजना, अखिल भारतीय समन्वित मुलार्प अनुसंधान परियोजना अउ राष्ट्रीय बीज परियोजना- मेगा सीड परियोजना डाहर ले प्रायोजित रिहिन।) कार्यक्रम म मुख्य अतिथि कृषि मंत्री श्री रविन्द्र चौबे  विशिष्ट अतिथि विधायक बेमेतरा श्री आशीष छाबड़ा, डा. एस. के. पाटील (कुलपति इंदिरागांधी कृषि विश्वविद्यालय रायपुर) डा. एस. सी. मुखर्जी निदेशक विस्तार सेवायें, इंदिरागांधी कृषि विश्वविद्यालय रायपुर, डा. आर. के. द्विवेदी अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र, कवर्धा, डाॅ. डी. एस. ठाकुर अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र, साजा, बंशी पटेल, श्रीमति प्रज्ञा निर्वाणी (जिला पंचायत सदस्य बेमेतरा) के संग जिला, जनपद अउ पंचायत के आने प्रतिनिधि मन के गरिमामय उपस्थिति रहीन। जिला प्रशासन से श्री दुर्गेश वर्मा एस.डी.एम., उपसंचालक कृषि श्री एम. डी. मानकर, डाॅ. के पी वर्मा अधिष्ठाता कृषि माहाविद्यालय ढोलिया (बेमेतरा), एस.डी.ओ. सोलंकी शर्मा अउ जम्मो  ब्लाक के एस.ए. डी.ओ./आर. ए. इ.ओ. उप संचालक उपस्थित रिहिन।  कृषि मंत्री श्री रविन्द्र चौबे डाहर ले कृषि विज्ञान केन्द्र, कृषि महाविद्यालय अउ अनुसंधान केन्द्र, बेमेतरा के काम—काज अउ उदीम मन ल सहराए गहस। संगे —संग वैज्ञानिक मन ले कृषि क्षेत्र में किसान मन ल उन्नत कृषि कोति ले जाए अउ कृषि के भरोसा सशक्तिकरण के बात कहे गईन। विधायक के द्वारा भी कृषि और कृषकों को कृषि विज्ञान केन्द्र से मिलने वाले लाभों की सराहना की। किसान मेला म कृषि उद्यानिकी, मत्स्य व पशु विभाग के सहयोग रिहिन अउ स्टाल तको लगाये गेहे रिहिन। ये बेरा सोयाबीन सीड हब-बीज भण्डार गृह के भूमि पूजन अउ एनएचएम-एमएडीएच अंतर्गत स्थापित लघु मातृ वाटिका (नान्हे नर्सरी इकाई) के लोकार्पण तको करे गहस।"

In [56]:
# Test
cleaned_test_string = remove_punct(test_string)
cleaned_test_string

'बेमेतरा 19 मार्च 2021। कृषि विज्ञान केन्द्र कृषि महाविद्यालय अउ अनुसंधान केन्द्र ढोलिया बेमेतरा के सापर तत्वधान म बिरस्पत 18 मार्च के जिला स्तरीय किसान मेला सह संगोष्ठी के आयोजन करे गइस। मेला के मुख्य उद्देश्य अलसी व दलहनी फसल अउ खरीफ रबी फसल मन के बीज उत्पादन ल प्रोत्साहित करना रिहिन। मेला अखिल भारतीय समन्वित अलसी अनुसंधान परियोजना अखिल भारतीय समन्वित मुलार्प अनुसंधान परियोजना अउ राष्ट्रीय बीज परियोजना मेगा सीड परियोजना डाहर ले प्रायोजित रिहिन। कार्यक्रम म मुख्य अतिथि कृषि मंत्री श्री रविन्द्र चौबे  विशिष्ट अतिथि विधायक बेमेतरा श्री आशीष छाबड़ा डा. एस. के. पाटील कुलपति इंदिरागांधी कृषि विश्वविद्यालय रायपुर डा. एस. सी. मुखर्जी निदेशक विस्तार सेवायें इंदिरागांधी कृषि विश्वविद्यालय रायपुर डा. आर. के. द्विवेदी अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र कवर्धा डाॅ. डी. एस. ठाकुर अधिष्ठाता कृषि महाविद्यालय अउ अनुसंधान केन्द्र साजा बंशी पटेल श्रीमति प्रज्ञा निर्वाणी जिला पंचायत सदस्य बेमेतरा के संग जिला जनपद अउ पंचायत के आने प्रतिनिधि मन के गरिमामय उपस्थिति रहीन। जिला प्रशासन से श्री दुर

In [57]:
def clean_sentences(contents):
    """
    Get a list of cleaned sentences
    """
    cleaned_sentences = []
    for sentence in re.split("[।\n]", contents):
        cleaned_sentence = remove_punct(sentence).strip()
        if 8 < len(cleaned_sentence) < 1024 and has_devnagri(cleaned_sentence):
            cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

In [58]:
if not os.path.exists("sentences"):
    os.mkdir("sentences")

In [60]:
agri_check = open(os.path.join("check", "agri_check.txt"), "r", encoding="utf-8").read()
agri_sent = clean_sentences(agri_check)

print(f"Number of lines in file: {len(agri_check)}")
print(f"Number of sentences in file: {len(agri_sent)}")

with open(os.path.join("sentences", "agri.txt"), "w", encoding="utf-8") as f:
    for sentence in agri_sent:
        f.write(f"{sentence}\n")

Number of lines in file: 483728
Number of sentences in file: 3939


In [61]:
fin_check = open(os.path.join("check", "fin_check.txt"), "r", encoding="utf-8").read()
fin_sent = clean_sentences(fin_check)

print(f"Number of lines in file: {len(fin_check)}")
print(f"Number of sentences in file: {len(fin_sent)}")

with open(os.path.join("sentences", "fin.txt"), "w", encoding="utf-8") as f:
    for sentence in fin_sent:
        f.write(f"{sentence}\n")

Number of lines in file: 356831
Number of sentences in file: 2786


In [62]:
gen_check = open(os.path.join("check", "gen_check.txt"), "r", encoding="utf-8").read()
gen_sent = clean_sentences(gen_check)

print(f"Number of lines in file: {len(gen_check)}")
print(f"Number of sentences in file: {len(gen_sent)}")

with open(os.path.join("sentences", "gen.txt"), "w", encoding="utf-8") as f:
    for sentence in gen_sent:
        f.write(f"{sentence}\n")

Number of lines in file: 975057
Number of sentences in file: 7991


In [63]:
all_check = open(
    os.path.join("check", "check_content.txt"), "r", encoding="utf-8"
).read()
all_sent = clean_sentences(all_check)

print(f"Number of lines in file: {len(all_check)}")
print(f"Number of sentences in file: {len(all_sent)}")

with open(os.path.join("sentences", "all.txt"), "w", encoding="utf-8") as f:
    for sentence in all_sent:
        f.write(f"{sentence}\n")

Number of lines in file: 1815620
Number of sentences in file: 14716
