In [22]:
# Execute the project's dev helper script inside this kernel's namespace.
# NOTE(review): presumably this is what registers the %reloadmypkg magic used
# in the next cell — confirm against utils/devtools.py.
%run utils/devtools.py

In [23]:
# Custom line magic (not an IPython built-in): reloads the local `utils`
# package and its submodules, presumably so that edits to them are picked up
# by the imports below without restarting the kernel.
%reloadmypkg utils

import json

import pandas as pd

from utils.url2platform import *
from utils.standardise_url import *
from utils.fetch_info import *
from utils.dump_df_to_json import *

✅ Reloaded package 'utils' and its submodules.


In [24]:
# Input: raw sheet response (JSON) read by the loading cell.
# Output: destination handed to dump_df_to_json at the end of the notebook.
# Both paths are relative, so they resolve against the kernel's working directory.
sheet_path = "../raw-data/striver/79-sheet-response-data.json"
export_file = "../cleaned-data/striver/79.json"

In [25]:
# Load the raw sheet response.
# The encoding is pinned explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in a
# UTF-8 JSON file (e.g. on Windows).
with open(sheet_path, encoding="utf-8") as file:
    raw = json.load(file)

# Flatten the nested structure to one record per topic.
# raw["sheetData"] is iterated as a list of steps, each holding a "topics"
# list. Keys are accessed strictly so a schema change fails loudly here
# rather than producing silently incomplete rows.
flattened_data = [
    {
        "gfg": topic["gfg_link"],
        "c360": topic["cs_link"],
        "lc": topic["lc_link"],
        "original_title": topic["title"],
        "yt": topic["yt_link"],
        "post": topic["post_link"],
        "step_title": topic["head_step_no"],
    }
    for step in raw["sheetData"]
    for topic in step["topics"]
]

df = pd.DataFrame(flattened_data)

In [26]:
def get_best_url(row):
    """Return the preferred link for one topic row, or None if it has none.

    Link columns are tried in priority order: "lc", "gfg", "c360", "yt",
    "post". The first value that is a non-blank string wins.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) exposing
            the five link columns by key.

    Returns:
        The first non-blank link string, unchanged, or None when every
        link column is missing, NaN, empty, or whitespace-only.

    Raises:
        KeyError: If one of the link columns is absent from ``row``.
    """
    for column in ("lc", "gfg", "c360", "yt", "post"):
        candidate = row[column]
        # A plain `or` chain is not enough here: NaN is truthy and would be
        # returned as a "URL", and an all-empty row would yield "" — which a
        # later isna() check cannot detect. Only real, non-blank strings count.
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    return None

In [27]:
# Collapse the per-platform link columns into a single "url" column by
# applying get_best_url to every row (axis=1 passes one row at a time).
# Note: this adds the column to df in place.
df["url"] = df.apply(get_best_url, axis=1)

In [28]:
# Keep only the chosen link plus the two descriptive columns; the
# per-platform link columns are dropped from here on.
kept_columns = ["url", "original_title", "step_title"]
df = df.loc[:, kept_columns]

In [29]:
# Sanity check: (number of topics, number of kept columns).
df.shape

(79, 3)

In [30]:
# Rows that ended up without a usable URL.
# isna() alone only catches None/NaN; an empty or whitespace-only string is
# just as unusable, so it is counted as missing too.
url_column = df["url"]
url_is_blank = url_column.isna() | url_column.astype(str).str.strip().eq("")
missing_urls = df[url_is_blank]
missing_urls.shape

(0, 3)

In [31]:
# Show the rows whose URL contains the "implement-strstr" slug.
# regex=False: the needle is a plain literal, not a pattern.
# na=False: a missing URL counts as "no match"; otherwise the mask would
# contain NaN and boolean indexing would raise.
df[df["url"].str.contains("implement-strstr", regex=False, na=False)]

Unnamed: 0,url,original_title,step_title
77,https://leetcode.com/problems/implement-strstr/,Z-Function,String
78,https://leetcode.com/problems/implement-strstr/,KMP algo / LPS(pi) array,String


In [32]:
# Fix the redirect urls
# Rows are selected by the stale LeetCode path rather than by hard-coded row
# labels, so the fix keeps working if the sheet's row order changes and is
# safe to re-run (no rows match once they have been rewritten).
stale_path = "leetcode.com/problems/implement-strstr"
canonical_url = "https://leetcode.com/problems/find-the-index-of-the-first-occurrence-in-a-string/"

is_stale = df["url"].str.contains(stale_path, regex=False, na=False)
df.loc[is_stale, "url"] = canonical_url

In [33]:
# Find the duplicates

# keep=False flags every row of a repeated URL, not only the later repeats.
is_repeated_url = df["url"].duplicated(keep=False)
dupes_df = df[is_repeated_url]
dupes_df.shape

(2, 3)

In [34]:
# Number of rows per repeated URL; the length of this Series is therefore the
# number of distinct URLs that occur more than once.
dupe_counts = dupes_df.groupby("url").size()
print(f"Total unique URLs with duplicates: {len(dupe_counts)}")

Total unique URLs with duplicates: 1


In [35]:
# nest_asyncio patches asyncio so event loops can be nested.
# NOTE(review): presumably needed because the helper drives an event loop
# itself while Jupyter's loop is already running — confirm in utils.
import nest_asyncio
nest_asyncio.apply()

# Expand/standardise every URL in df. The second argument is the checkpoint
# file the helper reports saving its progress to (see the cell output).
standardised_df = await standardise_urls_async(df, "striver-79-urls-expanded.json")

Filter list is empty. Trying to expand all 79 URLs.
Resume option is set to false. Deleted previous checkpoint file: striver-79-urls-expanded.json

Processing batch 1/1
Loaded 0 results from checkpoint
Processing 79 pending URLs in this batch


Expanding URLs: 100%|██████████| 79/79 [00:01<00:00, 48.52it/s]

Progress saved to striver-79-urls-expanded.json
----------------------------------------------------------------------
Progress saved to striver-79-urls-expanded.json
----------------------------------------------------------------------

79/79 URLs expanded in 2.87 seconds.





In [36]:
# Sanity check: the row count should match df; the recorded output shows two
# extra columns compared with df.
standardised_df.shape

(79, 5)

In [37]:
# Adding type column
# Each expanded URL is labelled "problem" when is_problem_url accepts it and
# "tutorial" otherwise.
url_kinds = [
    "problem" if is_problem_url(link) else "tutorial"
    for link in standardised_df["expanded_stripped_url"]
]
standardised_df["type"] = url_kinds

In [38]:
# Sanity check: same rows as before, plus the new "type" column.
standardised_df.shape

(79, 6)

In [39]:
# Peek at a single row to eyeball the column layout.
standardised_df.head(1)

Unnamed: 0,url,original_title,step_title,expanded_url,expanded_stripped_url,type
0,https://leetcode.com/problems/next-permutation/,Next Permutation,Arrays and Hashing,https://leetcode.com/problems/next-permutation/,https://leetcode.com/problems/next-permutation/,problem


In [40]:
# Adding id, title, platform columns
# NOTE(review): nest_asyncio is already imported and applied in the
# URL-expansion cell; repeating it here is harmless but looks redundant under
# Restart & Run All — consider hoisting it to the import cell.
import nest_asyncio
nest_asyncio.apply()

# The second argument is the checkpoint file the helper reports saving its
# progress to (see the cell output).
final_df = await fetch_info(standardised_df, "striver-79-info-added.json")

Found 79 URLs.
Resume is disabled. Deleting old checkpoint...

Batch 1/1: items 0-78
Loaded 0 from checkpoint
Processing 79 URLs...


Fetching items: 100%|██████████| 79/79 [00:02<00:00, 34.95it/s]

Saved progress (79 items) to striver-79-info-added.json
------------------------------
Saved progress (79 items) to striver-79-info-added.json
------------------------------

Fetched 79/79 items in 2.3s.





In [41]:
# Sanity check: the row count should be unchanged; the recorded output shows
# three more columns than standardised_df.
final_df.shape

(79, 9)

In [42]:
# Hand the finished frame to the project's export helper, targeting export_file.
dump_df_to_json(final_df, export_file)