In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_data(env, maple):
    """Read the environment and maple CSV files and prepare them for merging.

    Parameters
    ----------
    env : str or path-like
        Path to the environment CSV. Its "DATE" column (if present) is
        renamed to "YEAR".
    maple : str or path-like
        Path to the maple CSV. Its "STATE" column is dropped.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(env_df, maple_df, df)`` where ``df`` is an empty frame whose
        columns are the union of both frames' columns: maple columns first,
        followed by the env-only columns, each in file order.

    Raises
    ------
    KeyError
        If the maple CSV has no "STATE" column.
    """
    # rename/drop return new frames, so no inplace mutation or extra copy is needed
    env_df = pd.read_csv(env).rename(columns={"DATE": "YEAR"})
    maple_df = pd.read_csv(maple).drop(columns="STATE")

    # De-duplicate while keeping first-seen order. A set would give an
    # arbitrary column order, and newer pandas versions refuse a set as
    # `columns` altogether.
    merged_columns = list(dict.fromkeys([*maple_df.columns, *env_df.columns]))
    df = pd.DataFrame(columns=merged_columns)

    return env_df, maple_df, df

def merge_data(env, maple, merged):
    """Join maple rows with environment rows on (county, year).

    For every (COUNTY, YEAR) pair found in ``maple``, the county name is
    capitalized and looked up together with the year in ``env``. When env has
    data for that exact pair, one merged row is produced from the first env
    row and the first maple row of the pair. Maple values are written last,
    so columns present in both frames (e.g. "COUNTY", "YEAR") keep the maple
    value.

    Parameters
    ----------
    env : DataFrame
        Environment data with "COUNTY" and "YEAR" columns.
    maple : DataFrame
        Maple data with "COUNTY" and "YEAR" columns.
    merged : DataFrame
        Frame to extend with the merged rows (typically empty, carrying only
        the desired column order).

    Returns
    -------
    DataFrame
        ``merged`` extended by the new rows. If nothing matches, ``merged``
        itself is returned unchanged.
    """
    # group each dataframe by county and year
    maple_groupby = maple.groupby(["COUNTY", "YEAR"])
    env_groupby = env.groupby(["COUNTY", "YEAR"])
    # (county, year) pairs that actually exist in env. Checking the pair,
    # rather than county and year separately, avoids a KeyError when both
    # values occur in env but never together.
    env_keys = env_groupby.groups

    records = []
    for county in maple["COUNTY"].unique():
        if pd.isna(county):
            # a missing county cannot be matched against env
            continue
        env_county = county.capitalize()
        county_rows = maple[maple["COUNTY"] == county]
        for year in county_rows["YEAR"].unique():
            if (env_county, year) not in env_keys:
                continue
            env_group = env_groupby.get_group((env_county, year))
            maple_group = maple_groupby.get_group((county, year))

            # env values first, then maple values override shared columns
            record = {col: env_group[col].values[0] for col in env.columns}
            record.update({col: maple_group[col].values[0] for col in maple.columns})
            records.append(record)

    if not records:
        return merged

    # Build the new rows in one go instead of appending row by row
    # (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
    new_rows = pd.DataFrame(records)
    if merged.empty:
        # keep merged's column order, then any columns only the new rows have
        ordered = list(merged.columns) + [
            col for col in new_rows.columns if col not in merged.columns
        ]
        return new_rows.reindex(columns=ordered)
    return pd.concat([merged, new_rows], ignore_index=True)

def reorder_columns(df):
    """Return ``df`` restricted to the fixed output columns, in output order.

    Identification columns come first, then the maple figures, then the
    environment measurements. Columns not listed are left out; a listed
    column missing from ``df`` raises ``KeyError``.
    """
    column_order = [
        'YEAR', 'COUNTY',
        'OPERATIONS WITH TAPS', 'OPERATIONS WITH SALES', 'NUMBER OF TAPS',
        'SALES', 'PRODUCTION',
        'TAVG', 'FZF9', 'HTDD', 'DX32', 'FZF5', 'EMXP', 'SNOW', 'TMIN',
        'DX70', 'FZF0', 'EMNT', 'FZF4', 'FZF6', 'FZF2', 'DX90', 'EMXT',
        'CDSD', 'PRCP', 'FZF7', 'TMAX', 'FZF8', 'DT32', 'DT00', 'CLDD',
        'FZF3', 'FZF1',
    ]
    return df[column_order]


In [3]:
# Input CSVs, relative to the notebook's working directory
env_data = "../../data/cleaned/environment/environment predictions.csv"
maple_data = "../../data/cleaned/maple/cleaned county data.csv"

# Load both sources, join them per (county, year), then fix the column order
merged_data = reorder_columns(
    merge_data(*load_data(env_data, maple_data))
)

# Persist the merged table without the positional index
merged_data.to_csv("../../data/cleaned/merged data.csv", index=False)