In [20]:
# Execute utils/devtools.py inside this notebook's interactive namespace.
# NOTE(review): presumably this is what registers the custom `%reloadmypkg`
# magic used by the next cell — confirm against utils/devtools.py.
%run utils/devtools.py

In [21]:
# Reload the local `utils` package and its submodules (see the confirmation
# message in the cell output) before importing from it, presumably so the
# imports that follow bind to the current code on disk.
%reloadmypkg utils

import json

import pandas as pd

from utils.url2platform import *
from utils.standardise_url import *
from utils.fetch_info import *
from utils.dump_df_to_json import *

✅ Reloaded package 'utils' and its submodules.


In [22]:
# Input: raw JSON response for the sheet. Output: destination of the cleaned
# JSON export. Both are relative paths, resolved against the working directory.
sheet_path = "../raw-data/striver/blind-75-response-data.json"
export_file = "../cleaned-data/striver/75.json"

In [23]:
# Read the raw sheet export. The encoding is pinned to UTF-8 (the encoding
# JSON interchange uses) instead of relying on the platform's locale default,
# which can mis-decode non-ASCII text on some systems.
with open(sheet_path, encoding="utf-8") as file:
    raw = json.load(file)

# Flatten the nested sheetData -> topics structure into one flat record per
# topic, renaming the raw link fields to the short column names used later.
flattened_data = [
    {
        "gfg": topic["gfg_link"],
        "c360": topic["cs_link"],
        "lc": topic["lc_link"],
        "original_title": topic["title"],
        "yt": topic["yt_link"],
        "post": topic["post_link"],
        # The raw field is called head_step_no but is stored as the step title.
        "step_title": topic["head_step_no"],
    }
    for step in raw["sheetData"]
    for topic in step["topics"]
]

df = pd.DataFrame(flattened_data)

In [24]:
def get_best_url(row):
    """Return the highest-priority usable link for one row.

    The link columns are tried in a fixed priority order:
    ``lc``, ``gfg``, ``c360``, ``yt``, ``post``.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed by the five column names above.

    Returns:
        The first value, in priority order, that is a non-blank string, or
        ``None`` when no column holds one.
    """
    for column in ("lc", "gfg", "c360", "yt", "post"):
        candidate = row[column]
        # Only a non-blank string counts as a link. A plain truthiness test
        # would accept NaN (a truthy float) and, when every column is empty,
        # would fall through to "" — a value that null checks do not flag.
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    return None

In [25]:
# Row-wise pass (axis=1): store the link chosen by get_best_url for each row
# in a new "url" column.
df["url"] = df.apply(get_best_url, axis=1)

In [26]:
# Keep only the chosen URL plus the two title columns; the per-platform link
# columns are dropped from here on.
df = df.loc[:, ["url", "original_title", "step_title"]]

In [27]:
# Sanity check: (rows, columns) after trimming to the three kept columns.
df.shape

(75, 3)

In [28]:
# A URL counts as missing when it is null OR blank. Empty-string links are
# not NaN/None, so isna() on its own would silently let them through.
missing_urls = df[df["url"].isna() | df["url"].astype(str).str.strip().eq("")]
missing_urls.shape

(0, 3)

In [29]:
# Collect every row whose URL appears more than once. keep=False flags all
# occurrences of a repeated URL, not just the ones after the first.
dupes_df = df.loc[df.duplicated(subset=["url"], keep=False)]
dupes_df.shape

(0, 3)

In [30]:
# Number of rows per repeated URL; the length of this Series is the number of
# distinct URLs that occur more than once.
dupe_counts = dupes_df.groupby("url").size()
print(f"Total unique URLs with duplicates: {len(dupe_counts)}")

Total unique URLs with duplicates: 0


In [31]:
# nest_asyncio patches asyncio so an already-running event loop can be
# re-entered.
# NOTE(review): presumably needed because the helper drives the event loop
# itself while the notebook's loop is running — confirm.
import nest_asyncio
nest_asyncio.apply()

# Expand and standardise every URL in df. The second argument is the
# checkpoint file named in the progress log below.
standardised_df = await standardise_urls_async(df, "striver-75-urls-expanded.json")

Filter list is empty. Trying to expand all 75 URLs.
Resume option is set to false. Deleted previous checkpoint file: striver-75-urls-expanded.json

Processing batch 1/1
Loaded 0 results from checkpoint
Processing 75 pending URLs in this batch


Expanding URLs: 100%|██████████| 75/75 [00:01<00:00, 54.01it/s]

Progress saved to striver-75-urls-expanded.json
----------------------------------------------------------------------
Progress saved to striver-75-urls-expanded.json
----------------------------------------------------------------------

75/75 URLs expanded in 2.57 seconds.





In [32]:
# Sanity check: same row count as df, with the expanded-URL columns added.
standardised_df.shape

(75, 5)

In [33]:
# Label each row by link kind: "problem" when is_problem_url accepts the
# expanded, stripped URL, otherwise "tutorial".
standardised_df["type"] = standardised_df["expanded_stripped_url"].map(
    lambda link: "problem" if is_problem_url(link) else "tutorial"
)

In [34]:
# Sanity check: one more column ("type") than before, row count unchanged.
standardised_df.shape

(75, 6)

In [35]:
# Preview a single row to eyeball the columns produced so far.
standardised_df.head(1)

Unnamed: 0,url,original_title,step_title,expanded_url,expanded_stripped_url,type
0,https://leetcode.com/problems/two-sum/,2Sum Problem,Array,https://leetcode.com/problems/two-sum/,https://leetcode.com/problems/two-sum/,problem


In [36]:
# Adding id, title, platform columns
# NOTE(review): nest_asyncio is already imported and applied in an earlier
# cell; repeating it here looks redundant — confirm before removing.
import nest_asyncio
nest_asyncio.apply()

# The second argument is the checkpoint file named in the progress log below.
final_df = await fetch_info(standardised_df, "striver-75-info-added.json")

Found 75 URLs.
Resume is disabled. Deleting old checkpoint...

Batch 1/1: items 0-74
Loaded 0 from checkpoint
Processing 75 URLs...


Fetching items: 100%|██████████| 75/75 [00:02<00:00, 34.46it/s]

Saved progress (75 items) to striver-75-info-added.json
------------------------------
Saved progress (75 items) to striver-75-info-added.json
------------------------------

Fetched 75/75 items in 2.2s.





In [37]:
# Sanity check: row count unchanged, three columns added by fetch_info.
final_df.shape

(75, 9)

In [38]:
# Write the final frame to the cleaned-data export path set in export_file.
dump_df_to_json(final_df, export_file)