In [1]:
# Execute the project's dev helper script inside this kernel's namespace.
# NOTE(review): presumably this is what registers the %reloadmypkg magic used in the next cell — confirm.
%run utils/devtools.py

In [24]:
# Reload the local `utils` package and its submodules (as the cell output reports)
# before importing from it.
%reloadmypkg utils

import pandas as pd
# NOTE(review): the star imports below hide where names come from. The helpers used later in
# this notebook (problem_url2platform, ProblemPlatform, standardise_urls_async, fetch_info,
# dump_df_to_json) are never imported explicitly, so they presumably come from these
# modules — confirm, and consider importing them by name.
from utils.url2platform import *
from utils.standardise_url import *
from utils.fetch_info import *
from openpyxl import load_workbook
from utils.dump_df_to_json import *

✅ Reloaded package 'utils' and its submodules.


In [3]:
# Input workbook, opened with load_workbook in the next cell (path is relative to the notebook's working directory).
file_path = "../raw-data/love-babbar/450.xlsx"
# Destination handed to dump_df_to_json in the final cell.
export_file = "../cleaned-data/lb/450.json"

In [4]:
# Scratch lists kept from the original cell; nothing else in the visible notebook reads them.
problems_urls_titles = []
tutorials_urls_titles = []

# Open the workbook and work on whichever sheet is marked active.
wb = load_workbook(file_path)
ws = wb.active

# Walk the second cell (index 1) of every row from sheet row 6 downwards and keep
# only cells whose value is a string other than the literal '<->'. Each kept cell
# becomes one record; a cell without a hyperlink gets url=None.
data = [
    {
        "original_title": title_cell.value,
        "url": title_cell.hyperlink.target if title_cell.hyperlink else None,
    }
    for title_cell in (sheet_row[1] for sheet_row in ws.iter_rows(min_row=6))
    if isinstance(title_cell.value, str) and title_cell.value != '<->'
]

df = pd.DataFrame(data)

In [7]:
# Sanity check: one row per kept title cell, two columns (original_title, url).
df.shape

(448, 2)

In [8]:
# Rows whose url is null, i.e. title cells that carried no hyperlink target.
missing_urls = df.loc[df["url"].isna()]

In [9]:
# How many rows have no URL (first element of the tuple).
missing_urls.shape

(3, 2)

In [10]:
# Hand-picked URLs keyed by row label in df.
# NOTE(review): presumably these are the three rows reported by missing_urls; the labels
# are hard-coded, so re-check them whenever the source sheet changes.
missing_url_fixes = {
    49: "https://www.scaler.com/topics/why-string-is-immutable-in-java/",
    150: "https://www.naukri.com/code360/library/is-it-possible-to-reverse-a-linked-list-in-less-than-o-n",
    151: "https://www.naukri.com/code360/library/why-is-quick-sort-preferred-for-arrays-and-merge-sort-for-linked-lists",
}
for row_label, fixed_url in missing_url_fixes.items():
    df.at[row_label, "url"] = fixed_url

# Row 151 also gets its title replaced.
df.at[151, "original_title"] = "Why is quick sort preferred for arrays and merge sort for linked lists?"

In [11]:
# Show the three hand-patched rows, one after another, to eyeball the result.
print(df.loc[49], df.loc[150], df.loc[151], sep="\n")

original_title                   Why strings are immutable in Java?
url               https://www.scaler.com/topics/why-string-is-im...
Name: 49, dtype: object
original_title     Can we reverse a linked list in less than O(n) ?
url               https://www.naukri.com/code360/library/is-it-p...
Name: 150, dtype: object
original_title    Why is quick sort preferred for arrays and mer...
url               https://www.naukri.com/code360/library/why-is-...
Name: 151, dtype: object


In [12]:
# URLs found in the sheet that are flagged as broken or worth replacing.
# NOTE(review): presumably these are the rows rewritten by the hard-coded patches in the next cell — confirm.
malformed_list = [
    "https://practice.geeksforgeeks.org/problems/merge-two-sorted-arrays5135/1",
    "https://practice.geeksforgeeks.org/problems/overlapping-intervals/0",
    "https://www.hackerearth.com/practice/algorithms/searching/binary-search/practice-problems/algorithm/bishu-and-soldiers/",
    "http://theoryofprogramming.com/2017/12/16/find-pivot-element-sorted-rotated-array/",
    "https://www.baeldung.com/java-sorting-arrays-with-repeated-entries",
    "https://stackoverflow.com/questions/45130465/inserting-at-the-end-of-stack",
    "https://www.tutorialspoint.com/javaexamples/data_stack.htm",
    "https://www.techiedelight.com/inorder-tree-traversal-iterative-recursive/",
    "https://www.techiedelight.com/preorder-tree-traversal-iterative-recursive/",
    "https://www.techiedelight.com/postorder-tree-traversal-iterative-recursive/",
    "https://1drv.ms/t/s!AqTOHFO77CqEiRua06v1PATyiFg5",
]

# Select every row whose URL is on the list; a URL can match more than one row.
malformed_urls = df.loc[df["url"].isin(malformed_list)]
malformed_urls.shape

(12, 2)

In [13]:
# Fix broken links/add better links in the sheet.
# Replacement URLs keyed by row label in df; the labels are hard-coded, so they
# must be re-checked whenever the source sheet changes.
broken_url_fixes = {
    11: "https://www.geeksforgeeks.org/problems/merge-two-sorted-arrays-1587115620/0",
    102: "https://www.geeksforgeeks.org/problems/merge-two-sorted-arrays-1587115620/0",
    107: "https://www.hackerearth.com/problem/algorithm/bishu-and-soldiers-227/",
    110: "https://leetcode.com/problems/search-in-rotated-sorted-array/",
    124: "https://www.naukri.com/code360/problems/partitioning-and-sorting-arrays-with-many-repeated-entries_1170515",
    166: "https://leetcode.com/problems/binary-tree-inorder-traversal/",
    167: "https://leetcode.com/problems/binary-tree-preorder-traversal/",
    168: "https://leetcode.com/problems/binary-tree-postorder-traversal/",
    272: "https://www.programiz.com/dsa/stack",
    284: "https://www.naukri.com/code360/problems/insert-an-element-at-its-bottom-in-a-given-stack_1171166",
    287: "https://www.geeksforgeeks.org/problems/overlapping-intervals--170633/0",
    328: "https://www.naukri.com/code360/problems/create-a-graph-and-print-it_1214551",
}
for row_label, fixed_url in broken_url_fixes.items():
    df.at[row_label, "url"] = fixed_url

In [14]:
# Collect every row whose URL occurs more than once.
# keep=False marks all members of a duplicate group, not just the repeats.

dupes_df = df.loc[df["url"].duplicated(keep=False)]

In [15]:
# Count rows per URL within the duplicated set; each entry is one URL that occurs more than once.
dupe_counts = dupes_df.groupby("url").size()
print(f"Total unique URLs with duplicates: {len(dupe_counts)}")

Total unique URLs with duplicates: 27


In [16]:
# Number of rows that share their URL with at least one other row.
dupes_df.shape

(57, 2)

In [17]:
# Re-check the full frame: the URL patches above write into existing cells,
# so the shape should be the same as right after loading.
df.shape

(448, 2)

In [18]:
# nest_asyncio patches asyncio so an event loop can be re-entered from inside the
# loop the notebook kernel is already running.
# NOTE(review): presumably required because standardise_urls_async drives the loop itself — confirm.
import nest_asyncio
nest_asyncio.apply()

# The second argument is the checkpoint file the helper reports deleting and then
# saving progress to (see the recorded cell output).
standardised_df = await standardise_urls_async(df, "lb-450-urls-expanded.json")

Filter list is empty. Trying to expand all 448 URLs.
Resume option is set to false. Deleted previous checkpoint file: lb-450-urls-expanded.json

Processing batch 1/3
Loaded 0 results from checkpoint
Processing 200 pending URLs in this batch


Expanding URLs: 100%|██████████| 200/200 [00:06<00:00, 33.03it/s]


Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------
Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------

Processing batch 2/3
Loaded 200 results from checkpoint
Processing 200 pending URLs in this batch


Expanding URLs: 100%|██████████| 200/200 [00:05<00:00, 37.02it/s]


Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------
Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------

Processing batch 3/3
Loaded 400 results from checkpoint
Processing 48 pending URLs in this batch


Expanding URLs: 100%|██████████| 48/48 [00:03<00:00, 15.98it/s]

Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------
Progress saved to lb-450-urls-expanded.json
----------------------------------------------------------------------

448/448 URLs expanded in 21.56 seconds.





In [19]:
# Shape of the frame returned by standardise_urls_async; the next cell reads its
# 'expanded_stripped_url' column.
standardised_df.shape

(448, 4)

In [20]:
# Adding type column: a URL is labelled "problem" when problem_url2platform returns
# anything other than ProblemPlatform.UNKNOWN, and "tutorial" otherwise.
standardised_df['type'] = [
    "problem" if problem_url2platform(link) != ProblemPlatform.UNKNOWN else "tutorial"
    for link in standardised_df['expanded_stripped_url']
]

In [21]:
# The 'type' column assigned in the previous cell should account for one extra column.
standardised_df.shape

(448, 5)

In [None]:
# Adding id_base, id, title, platform columns
# nest_asyncio is imported and applied again here, so this cell does not rely on the
# earlier cell having done it in the current kernel session.
import nest_asyncio
nest_asyncio.apply()

# "lb-450-info-added.json" is the checkpoint file named in the recorded output.
# NOTE(review): resume=True presumably makes the helper load already-fetched items from
# that file and fetch only the pending URLs, as the output suggests — confirm.
final_df = await fetch_info(standardised_df, "lb-450-info-added.json", resume=True)

Found 448 URLs.

Batch 1/3: items 0-199
Loaded 447 from checkpoint
Nothing new in this batch
Saved progress (447 items) to lb-450-info-added.json
------------------------------

Batch 2/3: items 200-399
Loaded 447 from checkpoint
Processing 1 URLs...


Fetching items:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching items: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]

Saved progress (448 items) to lb-450-info-added.json
------------------------------
Saved progress (448 items) to lb-450-info-added.json
------------------------------

Batch 3/3: items 400-447
Loaded 448 from checkpoint
Nothing new in this batch
Saved progress (448 items) to lb-450-info-added.json
------------------------------

Fetched 448/448 items in 0.8s.





In [28]:
# Shape of the frame returned by fetch_info, checked before exporting.
final_df.shape

(448, 9)

In [29]:
# Hand the final frame to the project's export helper, targeting export_file
# (set near the top of the notebook).
# NOTE(review): output format is presumably JSON, given the helper name and the .json path — confirm.
dump_df_to_json(final_df, export_file)