In [None]:
import numpy as np
import pandas as pd
import re
from os import listdir
from os.path import isfile, join
import warnings
warnings.simplefilter(action = "ignore", category = Warning)

In [None]:
# Sampling frequency of the raw data; it selects the raw_data/ and clean_data/
# sub-directories used by the functions in this notebook.
frequency = "daily"
# Alphabetically sorted names of the regular ".csv" files (extension matched
# case-insensitively) directly inside the raw-data directory.
files = sorted(
    entry
    for entry in listdir(f"raw_data/{frequency}/")
    if entry.lower().endswith(".csv") and isfile(join(f"raw_data/{frequency}/", entry))
)

In [None]:
def find_header_row(full_path):
    """Return the 0-based index of the first header-like line in a text file.

    A line counts as a header when, after stripping surrounding whitespace, it
    starts with a comma followed by at least one space, word character or
    hyphen (i.e. the first column name is empty).

    Returns None when no line matches. Any exception raised while opening or
    reading the file is printed and also results in None.
    """
    header_re = re.compile(r"^,[ \w-]+.*$")
    try:
        with open(full_path, "r") as handle:
            # First matching line wins; the default covers "no header found".
            return next(
                (row for row, text in enumerate(handle) if header_re.match(text.strip())),
                None,
            )
    except Exception as e:
        print("Failed to find header row in", full_path)
        print(e)
    return None

In [None]:
def clean_data(df, daily = False):
    """Convert a raw table with a packed date column into an all-numeric frame.

    The date column is looked up as "Unnamed: 0", falling back to " ". Its
    values are read as text: characters 0-3 become "Year", 4-5 become "Month"
    and, when ``daily`` is true, everything from position 6 on becomes "Day".

    Rows are kept only if the stripped date text is longer than 6 characters
    (daily) or 4 characters (otherwise). Every remaining column is converted
    with ``pd.to_numeric(errors="coerce")`` and any row that still contains a
    missing value is dropped, which also removes rows whose date text is not
    numeric.

    The input frame is not modified; the original index labels of the
    surviving rows are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the date column described above.
    daily : bool, default False
        Whether dates carry a day component.

    Returns
    -------
    pandas.DataFrame
        The data columns followed by integer "Year", "Month" (and "Day").

    NOTE(review): nothing here separates several tables stacked in one file;
    rows from all of them that pass the filters are kept -- confirm intended.
    """
    date_col = "Unnamed: 0" if "Unnamed: 0" in df.columns else " "

    # Work on a derived Series so the caller's frame is never written to.
    dates = df[date_col].astype(str).str.strip()
    min_len = 6 if daily else 4
    keep = dates.str.len() > min_len
    dates = dates[keep]

    # Explicit copy: the columns added below must not be written into a view.
    cleaned = df.loc[keep].drop(columns = [date_col]).copy()
    cleaned["Year"] = dates.str[:4]
    cleaned["Month"] = dates.str[4:6]
    if daily:
        cleaned["Day"] = dates.str[6:]

    # Non-numeric cells become NaN and are removed together with their row.
    for col in cleaned.columns:
        cleaned[col] = pd.to_numeric(cleaned[col], errors = "coerce")
    cleaned = cleaned.dropna()

    date_parts = ["Year", "Month", "Day"] if daily else ["Year", "Month"]
    return cleaned.astype({part: int for part in date_parts})

In [None]:
def get_clean_data(data_path):
    """Load one raw CSV from ``raw_data/<frequency>/`` and return it cleaned.

    The rows before the header located by ``find_header_row`` are skipped and
    the values -99.99 and -999 (as text or numbers) are read as missing. The
    parsed table is then passed through ``clean_data``, in daily mode when the
    module-level ``frequency`` equals "daily".

    Returns the cleaned DataFrame, or None after printing the error when
    reading or cleaning raises.
    """
    full_path = f"raw_data/{frequency}/" + data_path
    header_row = find_header_row(full_path)
    is_daily = frequency == "daily"
    try:
        raw_table = pd.read_csv(
            full_path,
            skiprows = header_row,
            na_values = ["-99.99", "-999", -99.99, -999],
        )
        return clean_data(raw_table, is_daily)
    except Exception as e:
        print("Failed to read CSV for", data_path, "expected header row was", header_row)
        print(e)
    return None

In [None]:
def _region_prefix(name, kind):
    """Return "<region>_<kind>" for the first region token found in ``name``, else None."""
    # Order matters: "asia_pacific" is tested before "japan" because names such
    # as "asia_pacific_ex_japan_..." contain both tokens and belong to Asia Pacific.
    region_tokens = (
        ("ex_us", "dev_exUS"),
        ("europe", "euro"),
        ("asia_pacific", "asia"),
        ("japan", "jap"),
    )
    for token, region in region_tokens:
        if token in name:
            return f"{region}_{kind}"
    return None


def get_df_name_prefix(name):
    """Map a raw data-set name to the prefix used for its cleaned output files.

    Matching is case-insensitive and based on substrings of ``name``:

    * "industry_portfolios": "us_<N>", where N is the leading number of the name.
    * "25_portfolios": "na_100" when the name is exactly "25_portfolios",
      "dev_100" for "developed_25_portfolios", otherwise a regional "*_100".
    * "100_portfolios": "us_100".
    * "data_factors": "us_ff3".
    * "5_factors": "us_ff5" ("f-f"), "na_ff5" ("america"), "dev_ff5"
      ("developed_5_factors"), otherwise a regional "*_ff5".

    Regional prefixes are "dev_exUS", "euro", "asia" and "jap".

    Returns None when the name is not recognised, including an industry name
    without a leading number and a 25-portfolio or 5-factor name without a
    known region token.
    """
    name = name.lower()
    if "industry_portfolios" in name:
        count = re.match(r"^(\d+)_", name)
        # No leading count: report "unrecognised" instead of raising AttributeError.
        return "us_" + count.group(1) if count else None
    elif "25_portfolios" in name:
        if name == "25_portfolios":
            return "na_100"
        elif "developed_25_portfolios" in name:
            return "dev_100"
        return _region_prefix(name, "100")
    elif "100_portfolios" in name:
        return "us_100"
    elif "data_factors" in name:
        return "us_ff3"
    elif "5_factors" in name:
        if "f-f" in name:
            return "us_ff5"
        elif "america" in name:
            return "na_ff5"
        elif "developed_5_factors" in name:
            return "dev_ff5"
        return _region_prefix(name, "ff5")
    return None

In [None]:
def split_and_save(df, prefix, start_years, end_year = 2019):
    """Write one CSV per start year holding the rows of that year window.

    For each value in ``start_years`` the rows whose "Year" lies between that
    value and ``end_year`` (both inclusive) are written to
    ``clean_data/<frequency>/<prefix>_<start_year>.csv``.
    """
    for start_year in start_years:
        in_window = df["Year"].between(start_year, end_year)
        window_rows = df[in_window]
        window_rows.to_csv(f"clean_data/{frequency}/{prefix}_{start_year}.csv")

In [None]:
def _save_25_combo(data_path):
    """Write the accumulated 25-portfolio frame (if any) and reset the accumulator."""
    global curr_25_combo
    global curr_25_name

    if curr_25_combo is not None:
        prefix = get_df_name_prefix(curr_25_name)
        assert prefix is not None, f"oopsie, {data_path}"
        split_and_save(curr_25_combo, prefix, [1991, 2008])
    curr_25_combo = None
    curr_25_name = None


def process_data(data_path):
    """Clean one raw file and either accumulate it or write it out.

    Files whose name contains "25_Portfolios" are merged column-wise into the
    module-level accumulator ``curr_25_combo``; ``curr_25_name`` remembers the
    part of the file name up to and including "25_Portfolios". The accumulator
    is written (start years 1991 and 2008) as soon as a file of a different
    25-portfolio group or a non-momentum, non-25-portfolio file is seen.

    Any other file whose name contains neither "MOM" nor "Momentum" is cleaned
    and written on its own (start years 1964, 1991 and 2008). Momentum files
    are ignored here.

    Frames are joined on every date column ("Year", "Month", "Day") that both
    sides have, so daily data is matched day by day instead of pairing every
    day of a month with every other day of that month.

    Raises AssertionError when no output prefix is known for a data set.
    """
    global curr_25_combo
    global curr_25_name

    combo_25_pattern = r"^(.*?25_Portfolios)"
    match_25 = re.search(combo_25_pattern, data_path)
    if match_25:
        group_name = match_25.group(1)
        # A new group starts: write the previous one instead of merging
        # unrelated data sets under the new group's name.
        if curr_25_combo is not None and group_name != curr_25_name:
            _save_25_combo(data_path)
        curr_25_name = group_name
        # Text between "25_Portfolios" and the extension labels the columns
        # this file contributes when their names clash with existing ones.
        suffix_pattern = r"25_Portfolios(.*?)(?:\d+x\d+[_]*)?(?:\.csv|\.CSV)$"
        suffix = re.search(suffix_pattern, data_path).group(1)
        if suffix == "_":
            suffix = "5x5"
        if curr_25_combo is None:
            curr_25_combo = get_clean_data(data_path)
        else:
            df = get_clean_data(data_path)
            merge_keys = [
                key for key in ("Year", "Month", "Day")
                if key in curr_25_combo.columns and key in df.columns
            ]
            curr_25_combo = pd.merge(curr_25_combo, df, on = merge_keys, suffixes = (None, suffix))
    elif ("MOM" not in data_path) and ("Momentum" not in data_path):
        _save_25_combo(data_path)
        df = get_clean_data(data_path)
        prefix = get_df_name_prefix(data_path)
        assert prefix is not None, f"whoopsie, {data_path}"
        split_and_save(df, prefix, [1964, 1991, 2008])

In [None]:
# Accumulator shared with process_data: the merged 25-portfolio frame built so
# far and the name of the group it belongs to (both None when nothing is pending).
curr_25_combo = None
curr_25_name = None

def process_loop():
    """Run process_data over every raw file, in the order given by ``files``.

    The accumulator is cleared before the first file so that calling this
    function again never merges into state left over from an earlier
    (possibly interrupted) run. A 25-portfolio group that is still pending
    after the last file is written out with start years 1991 and 2008, the
    same way process_data writes a completed group.
    """
    global curr_25_combo
    global curr_25_name

    curr_25_combo = None
    curr_25_name = None
    for file in files:
        process_data(file)

    # Without this, a group at the very end of the file list would be dropped.
    if curr_25_combo is not None:
        prefix = get_df_name_prefix(curr_25_name)
        assert prefix is not None, f"oopsie, {curr_25_name}"
        split_and_save(curr_25_combo, prefix, [1991, 2008])
    curr_25_combo = None
    curr_25_name = None

In [None]:
def combine_ff5_and_mom(clean_files):
    """Join each raw momentum file onto the matching cleaned factor files.

    Every entry of the module-level ``files`` whose lower-cased name contains
    "mom" is cleaned and mapped to a factor tag (for example "us_ff5"). Each
    name in ``clean_files`` that contains this tag is read from
    ``clean_data/<frequency>/``, merged with the momentum data and written to
    ``clean_data/<frequency>/<tag>_mom_<year>.csv``, where <year> is the last
    four characters of the factor file name before ".csv".

    Parameters
    ----------
    clean_files : list of str
        File names (not paths) present in ``clean_data/<frequency>/``.

    Notes
    -----
    * Frames are joined on every date column ("Year", "Month", "Day") that
      both sides have, so daily data is matched day by day.
    * Files that are themselves "<tag>_mom_<year>.csv" outputs are skipped, so
      running this again does not merge momentum into them a second time.
    * Momentum files without a recognised region token are reported and skipped.
    """
    # Checked in order; "asia_pacific" precedes "japan" because names such as
    # "asia_pacific_ex_japan_..." contain both tokens and belong to Asia Pacific.
    tag_by_token = (
        ("f-f", "us_ff5"),
        ("america", "na_ff5"),
        ("developed_mom_factor", "dev_ff5"),
        ("ex_us", "dev_exUS_ff5"),
        ("europe", "euro_ff5"),
        ("asia_pacific", "asia_ff5"),
        ("japan", "jap_ff5"),
    )
    for file in files:
        name = file.lower()
        if "mom" not in name:
            continue
        tag = next((label for token, label in tag_by_token if token in name), None)
        if tag is None:
            # Previously this reused the tag of the preceding file (or raised).
            print("Skipping momentum file with unrecognised region:", file)
            continue
        momentum_df = get_clean_data(file)
        factors_data = [f for f in clean_files if tag in f and "_mom_" not in f]
        for f in factors_data:
            factors_df = pd.read_csv(f"clean_data/{frequency}/" + f)
            year = f.replace(".csv", "")[-4:]
            merge_keys = [
                key for key in ("Year", "Month", "Day")
                if key in factors_df.columns and key in momentum_df.columns
            ]
            combined = pd.merge(factors_df, momentum_df, on = merge_keys, suffixes = (None, "mom"))
            combined.to_csv(f"clean_data/{frequency}/{tag}_mom_{year}.csv")

In [None]:
# Clean every raw file listed in `files` and write the results under clean_data/<frequency>/.
process_loop()

In [None]:
# Alphabetically sorted names of the regular ".csv" files (extension matched
# case-insensitively) currently present in the cleaned-data directory.
clean_files = sorted(
    entry
    for entry in listdir(f"clean_data/{frequency}/")
    if entry.lower().endswith(".csv") and isfile(join(f"clean_data/{frequency}/", entry))
)
# Attach the momentum factor to the cleaned factor files found above.
combine_ff5_and_mom(clean_files)

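Large files: pushing the cleaned CSVs to GitHub produced the size warnings and errors below, so the next cell splits each listed file into smaller parts.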
```
remote: warning: File clean_data/dev_100_1991.csv is 52.62 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
remote: warning: File clean_data/dev_exUS_100_1991.csv is 52.37 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
remote: warning: File clean_data/euro_100_1991.csv is 51.17 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
remote: warning: File clean_data/jap_100_1991.csv is 50.88 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
remote: error: Trace: 5422aa00d681579329a15dff13d01d53b484fcbcb4550009bfc124675159c91d
remote: error: See https://gh.io/lfs for more information.
remote: error: File clean_data/na_100_2008.csv is 290.15 MB; this exceeds GitHub's file size limit of 100.00 MB
remote: error: File clean_data/na_100_1991.csv is 698.29 MB; this exceeds GitHub's file size limit of 100.00 MB
```

In [None]:
# Oversized cleaned files mapped to the number of parts each is cut into.
# NOTE(review): these paths have no frequency sub-directory, unlike the files
# written by split_and_save -- confirm they exist at these locations.
large_files = {
    "clean_data/dev_100_1991.csv": 2,
    "clean_data/dev_exUS_100_1991.csv": 2,
    "clean_data/euro_100_1991.csv": 2,
    "clean_data/jap_100_1991.csv": 2,
    "clean_data/na_100_2008.csv": 6,
    "clean_data/na_100_1991.csv": 14
}
for lf, num_splits in large_files.items():
    file_prefix = lf.replace("clean_data/", "").replace(".csv", "")
    df = pd.read_csv(lf)
    rows_per_split = len(df) // num_splits
    # Parts are numbered from 1; each holds `rows_per_split` consecutive rows,
    # except the last one, which also takes whatever rows remain.
    # NOTE(review): the file is read without index_col and written with the
    # default index, so each part carries an additional index column -- confirm
    # that consumers expect this.
    for part in range(1, num_splits + 1):
        start = (part - 1) * rows_per_split
        stop = None if part == num_splits else part * rows_per_split
        split_df = df.iloc[start:stop]
        split_df.to_csv(f"clean_data/{file_prefix}_{part}.csv")