### Common Crawl Collapse

In [10]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

In [5]:
def combine_parquet_files_to_df(parquet_folder):
    """
    Iterates over all Parquet files in the specified folder and combines them into a single DataFrame.

    Args:
        parquet_folder (str): Path to the folder containing Parquet files.

    Returns:
        pandas.DataFrame: The combined DataFrame.
    """
    folder = Path(parquet_folder)
    parquet_files = list(folder.glob("*.parquet"))
    print(f"Found {len(parquet_files)} Parquet files to process.")

    df_list = []
    for file in parquet_files:
        try:
            df = pd.read_parquet(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

In [6]:
folder_path = "../data/common_crawl"
combined_df = combine_parquet_files_to_df(folder_path)
print(f"Combined DataFrame shape: {combined_df.shape}")

Found 167 Parquet files to process.
Combined DataFrame shape: (27190255, 16)


In [8]:
combined_df.columns

Index(['urlkey', 'timestamp', 'url', 'mime', 'mime-detected', 'status',
       'digest', 'length', 'offset', 'filename', 'country', 'pattern',
       'languages', 'encoding', 'redirect', 'truncated'],
      dtype='object')

In [11]:
np.random.seed(42)

df_sampled = (
    df.groupby('country', group_keys=False, include_groups=False)
      .apply(lambda x: x.sample(n=100, random_state=42) if len(x) >= 100 else x)
      .reset_index(drop=True)
)

print(df_sampled['country'].value_counts())

  .apply(lambda x: x.sample(n=100, random_state=42) if len(x) >= 100 else x)


country
Afghanistan        100
Peru               100
North Macedonia    100
Norway             100
Oman               100
                  ... 
Indonesia          100
Zimbabwe           100
Denmark             72
Guinea-Bissau        8
Finland              4
Name: count, Length: 159, dtype: int64


In [14]:
df_sampled.to_csv("../data/common_crawl_sample.csv", index = False)

In [13]:
combined_df.head()

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,country,pattern,languages,encoding,redirect,truncated
0,"gm,gov)/judiciary",20240720034606,http://www.gov.gm/judiciary/,text/html,text/html,404,2ZF25EIJD3NGU5JS5PWANKTQRE5XTYPY,650,1528852,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Gambia,*.gov.gm,,,,
1,"gm,gov)/robots.txt",20240720034606,http://www.gov.gm/robots.txt,text/html,text/html,404,2ZF25EIJD3NGU5JS5PWANKTQRE5XTYPY,653,399344,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Gambia,*.gov.gm,,,,
2,"gm,gov,digitaladdressing)/",20240718115104,https://digitaladdressing.gov.gm/,text/html,text/html,200,K7MSSRWHMMGTZHLDDBX5NREGYSUGDC5X,30224,186502637,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Gambia,*.gov.gm,eng,UTF-8,,
3,"gm,gov,digitaladdressing)/",20240718212536,https://digitaladdressing.gov.gm/,text/html,text/html,200,GSRB2ESUXBQRZR7CKAOKNFU7V7VN2MZK,30222,178330155,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Gambia,*.gov.gm,eng,UTF-8,,
4,"gm,gov,digitaladdressing)/board-of-directors",20240718113824,https://digitaladdressing.gov.gm/board-of-dire...,text/html,text/html,404,HXDMPBWAFVWOUBAZ7BIEKLZCLFFVJVM4,20234,6818213,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Gambia,*.gov.gm,,,,
