In [53]:
# Execute the local dev-tools script inside this kernel.
# NOTE(review): presumably this is what registers the %reloadmypkg magic used in the next cell — confirm.
%run utils/devtools.py

In [54]:
# Reload the local `utils` package before importing from it, so edits made to
# the helper modules are picked up without restarting the kernel.
%reloadmypkg utils

import json

import pandas as pd

from utils.url2platform import *
from utils.standardise_url import *
from utils.fetch_info import *
from utils.dump_df_to_json import *

✅ Reloaded package 'utils' and its submodules.


In [55]:
# Input: raw JSON response for the A2Z sheet (read by the loading cell).
sheet_path = "../raw-data/striver/a2z-sheet-response-data.json"
# Output: where the cleaned dataset is written by the final cell.
export_file = "../cleaned-data/striver/a2z.json"

In [56]:
# Read the raw sheet export. JSON is UTF-8 by specification, so state the
# encoding explicitly instead of relying on the platform's locale default.
with open(sheet_path, encoding="utf-8") as file:
    raw = json.load(file)

# Flatten the nested step -> sub-step -> topic hierarchy into one record per
# topic. The step and sub-step titles are read from the topic itself, so no
# information from the outer levels needs to be carried along.
flattened_data = [
    {
        "gfg": topic["gfg_link"],
        "c360": topic["cs_link"],
        "lc": topic["lc_link"],
        "original_title": topic["question_title"],
        "yt": topic["yt_link"],
        "post": topic["post_link"],
        "step_title": topic["step_title"],
        "sub_step_title": topic["sub_step_title"],
    }
    for step in raw["sheetData"]
    for substep in step["sub_steps"]
    for topic in substep["topics"]
]

df = pd.DataFrame(flattened_data)

In [57]:
def get_best_url(row):
    """Return the preferred link for one sheet row.

    The link columns are tried in priority order: ``lc``, ``gfg``, ``c360``,
    ``yt``, ``post``. The first one holding a non-blank string wins.

    Args:
        row: Mapping-like object (dict or DataFrame row) with those five keys.

    Returns:
        The first usable link, or ``None`` when the row has no link at all.
        Returning ``None`` (rather than an empty string or NaN) lets a later
        ``isna()`` check reliably find rows that still need a URL.
    """
    for column in ("lc", "gfg", "c360", "yt", "post"):
        candidate = row[column]
        # NaN is truthy in Python, so a bare `or` chain would return it as if
        # it were a link; only accept real, non-blank strings.
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    return None

In [58]:
# Resolve one link per row (row-wise apply of get_best_url) and store it as `url`.
df = df.assign(url=df.apply(get_best_url, axis=1))

In [59]:
# Keep only the resolved link and the descriptive columns; the per-platform
# link columns are dropped from here on.
kept_columns = ["url", "original_title", "step_title", "sub_step_title"]
df = df[kept_columns]

In [60]:
# Sanity check: (rows, columns) of the trimmed frame.
df.shape

(455, 4)

In [61]:
# Rows for which no link could be resolved.
url_is_missing = df["url"].isna()
missing_urls = df.loc[url_is_missing]
missing_urls.shape

(1, 4)

In [62]:
# Java Collections is the one entry without any link, so give it the playlist
# URL. The row is selected by condition rather than by a hard-coded row label,
# so the patch keeps working if the upstream sheet gains or reorders rows.
missing_url_mask = df["url"].isna()
# Guard against silently stamping this playlist onto unrelated rows
# (0 is allowed so the cell can be re-run after the patch was applied).
assert missing_url_mask.sum() <= 1, "expected at most one row without a URL"
df.loc[missing_url_mask, "url"] = "https://www.youtube.com/playlist?list=PLA3GkZPtsafZZsLj0Tybu3y0HVl-hp1ea"

In [63]:
# Rows whose link is LeetCode's login redirect for "find-the-celebrity".
login_redirect_url = "https://leetcode.com/accounts/login/?next=/problems/find-the-celebrity/"
incorrect_urls = df.loc[df["url"] == login_redirect_url]
incorrect_urls.shape

(1, 4)

In [64]:
# Replace the premium LeetCode problem (its link is only a login redirect) with
# a LintCode link. Rows are matched on the URL itself instead of a hard-coded
# row label, so the fix survives changes in row order and is safe to re-run.
premium_login_url = "https://leetcode.com/accounts/login/?next=/problems/find-the-celebrity/"
df.loc[df["url"] == premium_login_url, "url"] = "https://www.lintcode.com/problem/645/"

In [65]:
# Rows whose link contains the "implement-strstr" slug.
has_strstr_slug = df["url"].str.contains("implement-strstr")
redirect_urls = df.loc[has_strstr_slug]
redirect_urls.shape

(2, 4)

In [66]:
# Fix the redirecting "implement-strstr" links by pointing every matching row
# at the target problem URL. A single mask-based write replaces the per-row
# assignments, so it does not depend on specific row labels.
# `regex=False` treats the slug as plain text; `na=False` keeps the mask boolean
# even if a URL is missing.
strstr_mask = df["url"].str.contains("implement-strstr", regex=False, na=False)
df.loc[strstr_mask, "url"] = "https://leetcode.com/problems/find-the-index-of-the-first-occurrence-in-a-string/"

In [67]:
# Rows whose link contains the "coin-change-2" slug.
has_coin_change_2 = df["url"].str.contains("coin-change-2")
slightly_wrong_url = df.loc[has_coin_change_2]
slightly_wrong_url.shape

(1, 4)

In [68]:
# Fix the slightly wrong slug: the sheet links to "coin-change-2", which is
# replaced here with the "coin-change-ii" problem URL. Rows are matched on the
# slug rather than a hard-coded row label, so the fix survives row reordering.
# `regex=False` treats the slug as plain text; `na=False` keeps the mask boolean.
coin_change_mask = df["url"].str.contains("coin-change-2", regex=False, na=False)
df.loc[coin_change_mask, "url"] = "https://leetcode.com/problems/coin-change-ii/"

In [69]:
# Collect every row whose URL occurs more than once (all occurrences are kept).
url_is_repeated = df["url"].duplicated(keep=False)
dupes_df = df.loc[url_is_repeated]
dupes_df.shape

(48, 4)

In [70]:
# Number of rows sharing each duplicated URL; the length is the count of
# distinct URLs that appear more than once.
dupe_counts = dupes_df.groupby("url").size()
distinct_duplicated = len(dupe_counts)
print(f"Total unique URLs with duplicates: {distinct_duplicated}")

Total unique URLs with duplicates: 22


In [71]:
# NOTE(review): nest_asyncio is presumably applied so `await` on the project's
# async helpers works inside the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

# Optional filter list, currently disabled, so no filter is passed to the call below.
# filter_list = ["bit.ly", "codingninjas.com/codestudio", "codingninjas.com/studio", "//youtu.be"]
# Expand/standardise every URL in `df`. The second argument is the file the
# helper saves its progress to (see the "Progress saved to ..." log lines).
standardised_df = await standardise_urls_async(df, "striver-a2z-urls-expanded.json")

Filter list is empty. Trying to expand all 455 URLs.

Processing batch 1/3
Loaded 0 results from checkpoint
Processing 200 pending URLs in this batch


Expanding URLs: 100%|██████████| 200/200 [00:03<00:00, 58.00it/s] 


Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------
Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------

Processing batch 2/3
Loaded 200 results from checkpoint
Processing 200 pending URLs in this batch


Expanding URLs: 100%|██████████| 200/200 [00:03<00:00, 65.20it/s] 


Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------
Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------

Processing batch 3/3
Loaded 400 results from checkpoint
Processing 55 pending URLs in this batch


Expanding URLs: 100%|██████████| 55/55 [00:02<00:00, 18.39it/s]

Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------
Progress saved to striver-a2z-urls-expanded.json
----------------------------------------------------------------------

455/455 URLs expanded in 16.64 seconds.





In [72]:
# Sanity check: row count should be unchanged after URL standardisation.
standardised_df.shape

(455, 6)

In [73]:
# Label each row as a "problem" or a "tutorial" depending on whether its
# expanded, stripped URL is recognised by is_problem_url.
standardised_df["type"] = standardised_df["expanded_stripped_url"].map(
    lambda link: "problem" if is_problem_url(link) else "tutorial"
)

In [74]:
# Sanity check: one extra column (`type`) compared with the previous shape.
standardised_df.shape

(455, 7)

In [75]:
# Preview a single row to eyeball the newly added columns.
standardised_df.head(1)

Unnamed: 0,url,original_title,step_title,sub_step_title,expanded_url,expanded_stripped_url,type
0,https://practice.geeksforgeeks.org/problems/se...,User Input / Output,Learn the basics,Things to Know in C++/Java/Python or any language,https://www.geeksforgeeks.org/problems/search-...,https://www.geeksforgeeks.org/problems/search-...,problem


In [76]:
# Adding id_base, id, title, platform columns
# NOTE(review): nest_asyncio was already imported and applied in an earlier
# cell; repeating it here is presumably harmless and only keeps this cell
# runnable on its own — confirm.
import nest_asyncio
nest_asyncio.apply()

# The second argument is the file the helper saves its progress to
# (see the "Saved progress ... to ..." log lines).
final_df = await fetch_info(standardised_df, "striver-a2z-info-added.json")

Found 455 URLs.

Batch 1/3: items 0-199
Loaded 0 from checkpoint
Processing 200 URLs...


Fetching items: 100%|██████████| 200/200 [00:03<00:00, 59.46it/s] 


Saved progress (200 items) to striver-a2z-info-added.json
------------------------------
Saved progress (200 items) to striver-a2z-info-added.json
------------------------------

Batch 2/3: items 200-399
Loaded 200 from checkpoint
Processing 200 URLs...


Fetching items: 100%|██████████| 200/200 [00:03<00:00, 65.02it/s] 


Saved progress (400 items) to striver-a2z-info-added.json
------------------------------
Saved progress (400 items) to striver-a2z-info-added.json
------------------------------

Batch 3/3: items 400-454
Loaded 400 from checkpoint
Processing 55 URLs...


Fetching items: 100%|██████████| 55/55 [00:02<00:00, 26.89it/s]

Saved progress (455 items) to striver-a2z-info-added.json
------------------------------
Saved progress (455 items) to striver-a2z-info-added.json
------------------------------

Fetched 455/455 items in 8.7s.





In [77]:
# Sanity check: same row count, with the extra metadata columns added by fetch_info.
final_df.shape

(455, 11)

In [78]:
# Write the enriched frame to the cleaned-data location configured in `export_file`.
dump_df_to_json(final_df, export_file)