## Common Crawl Collapse

In [1]:
import numpy as np
import pandas as pd
import janitor
from pathlib import Path
from IPython.display import display
from utilities import url_is_html

In [2]:
def combine_parquet_files_to_df(parquet_folder):
    """
    Read every ``*.parquet`` file directly inside a folder and stack them into one DataFrame.

    Files are read in sorted path order so the row order of the result is the
    same on every machine; downstream seeded sampling depends on that order.
    A file that fails to load is reported on stdout and skipped (best effort).

    Args:
        parquet_folder (str | os.PathLike): Path to the folder containing Parquet files.
            Only the top level of the folder is searched.

    Returns:
        pandas.DataFrame: All readable files concatenated row-wise with a fresh
        0..N-1 index. An empty DataFrame is returned when the folder contains
        no Parquet files or none of them could be read.
    """
    folder = Path(parquet_folder)
    # glob() yields entries in filesystem order, which is not stable across
    # platforms; sorting makes the concatenated row order deterministic.
    parquet_files = sorted(folder.glob("*.parquet"))
    print(f"Found {len(parquet_files)} Parquet files to process.")

    df_list = []
    for file in parquet_files:
        try:
            df_list.append(pd.read_parquet(file))
        except Exception as e:
            # Deliberate best effort: one corrupt shard should not abort the whole load.
            print(f"Error reading {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing loaded" explicitly.
    if not df_list:
        return pd.DataFrame()

    return pd.concat(df_list, ignore_index=True)

In [3]:
# Load every Parquet shard from the Common Crawl data folder into one frame,
# then show a small preview and the overall dimensions.
folder_path = "../data/common_crawl"
combined_df = combine_parquet_files_to_df(parquet_folder=folder_path)

preview_rows = combined_df.head(3)
display(preview_rows)
print(f"Combined DataFrame shape: {combined_df.shape}")

Found 166 Parquet files to process.


Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,languages,encoding,country,pattern,truncated,redirect
0,"mn,gov,aaib)/",20240725015042,http://aaib.gov.mn/,text/html,text/html,200,P2RZ6DWRMUESDBRSN3EZT2CJA3WDMXLH,7261,1185280,crawl-data/CC-MAIN-2024-30/segments/1720763518...,"mon,eng",UTF-8,Mongolia,*.gov.mn,,
1,"mn,gov,aaib)/?cat=21",20240725015116,http://aaib.gov.mn/?cat=21,text/html,text/html,200,PXG5YAFN53QYJEOKW5YIRKEXK5D4BW5I,7265,769967,crawl-data/CC-MAIN-2024-30/segments/1720763518...,"mon,eng",UTF-8,Mongolia,*.gov.mn,,
2,"mn,gov,aaib)/?cat=34",20240724233612,https://aaib.gov.mn/?cat=34,text/html,text/html,200,VNPQMWRUILOEFAVTE53LPQARSATHRWGR,7309,66511033,crawl-data/CC-MAIN-2024-30/segments/1720763518...,"mon,eng",UTF-8,Mongolia,*.gov.mn,,


Combined DataFrame shape: (27177089, 16)


In [4]:
# Flag each row by applying url_is_html (imported from utilities) to its URL.
# NOTE(review): presumably this returns True for URLs that point at HTML pages — confirm in utilities.
html_flags = combined_df["url"].map(url_is_html)
combined_df["is_html"] = html_flags

In [5]:
# Report how many rows were flagged as HTML and what share of the frame that is.
n_total = len(combined_df)
# Count matches by summing a boolean mask; this avoids materialising a filtered
# copy of the whole frame just to take its length. The comparison mirrors the
# "is_html == True" filter used elsewhere in the notebook.
n_html = int((combined_df["is_html"] == True).sum())  # noqa: E712
# Guard against an empty frame so the summary does not fail with ZeroDivisionError.
pct_html = n_html / n_total * 100 if n_total else 0.0

print(f"{n_html:,} rows are HTML ({pct_html:.1f}% of {n_total:,} total rows)")

25,221,114 rows are HTML (92.8% of 27,177,089 total rows)


In [6]:
# Build a per-country sample of HTML rows: at most n rows per country.
n = 1000
seed = 42
# Global NumPy seed; the sampling below does not rely on it because every
# .sample() call receives random_state explicitly.
np.random.seed(seed)

# Keep only rows flagged as HTML and drop the helper column before sampling.
html_rows = combined_df.query("is_html==True").drop(columns="is_html")

# Countries with at least n rows are down-sampled to exactly n; smaller
# countries are kept in full. Iterating over the groups (rather than using
# groupby.apply) keeps the "country" column in each piece and avoids the pandas
# deprecation of apply operating on grouping columns. Groups are visited in
# sorted key order, and rows with a missing country are excluded by groupby.
per_country = [
    rows.sample(n=n, random_state=seed) if len(rows) >= n else rows
    for _, rows in html_rows.groupby("country")
]

# pd.concat cannot take an empty list; fall back to an empty frame with the same columns.
df_sampled = (
    pd.concat(per_country, ignore_index=True)
    if per_country
    else html_rows.iloc[0:0].reset_index(drop=True)
)

print(df_sampled["country"].value_counts())

  return method(self._obj, *args, **kwargs)


country
Afghanistan      1000
Albania          1000
Algeria          1000
Angola           1000
Argentina        1000
                 ... 
Tajikistan        195
Djibouti          185
Denmark            65
Guinea-Bissau       5
Finland             2
Name: count, Length: 159, dtype: int64


  .apply(lambda x: x.sample(n=n, random_state=seed) if len(x) >= n else x)


In [9]:
# Write the sampled rows to CSV without the positional index column.
sample_csv_path = "../data/common_crawl_sample_n1000.csv"
df_sampled.to_csv(sample_csv_path, index=False)

In [8]:
# We had to remove row 6293 because GitHub flagged it as containing a committed key.