## Exercise with Data Wrangling

### Things I did to make the code faster
- I use `read_fwf` instead of `read_csv` to read the fixed-width data files.
- I got rid of all the `for` loops except one (`for` loops are slow in Python).
- I check the data for gaps *before* analyzing it and adding it to the master dataframe.
- The entire code now runs in **XX minutes XX seconds** (original code runs in **XX minutes XX seconds** on my PC).

In [None]:
import numpy as np
import pandas as pd
import glob

In [None]:
%%timeit -n 1 -r 1

fnames = glob.glob("./weafiles/*/*")

# list of breakpoints and column names from ISH_Manual.PDF
# (read_fwf colspecs are zero-based, half-open [start, end) character ranges)
# NOTE(review): (87, 91) reads 4 characters for "temp"; the public ISD format
# document appears to list air temperature as a 5-character signed field, in
# which case the last digit is dropped here -- confirm this is intended.
colnames = ["time", "M_D_H", "temp", "precip"]
colspecs = [(15,27), (19, 25), (87,91), (105, 1000000)]

crit_rows = 3 # Maximum allowed run of consecutive missing rows
# Month-day labels of the growing season (the year 2000 only serves to
# enumerate the calendar days between May 1 and October 31).
growseason = pd.date_range(start='2000-05-01', end='2000-10-31').strftime('%m-%d')


def _extract_aa1(extra):
    """Return characters 5-7 of the text following the first "AA1" marker.

    Gives NaN when *extra* is not a string or does not contain "AA1".
    """
    # NOTE(review): the [5:8] offsets are kept from the original code; per the
    # ISD "AA1" layout (2-char period, 4-char depth, condition, quality) the
    # depth digits would presumably sit at [2:6] -- confirm against the manual.
    if not isinstance(extra, str):
        return np.nan
    parts = extra.split("AA1")
    return parts[1][5:8] if len(parts) > 1 else np.nan


def _read_growing_season(path):
    """Read one data file and return its growing-season rows, one per hour.

    The returned frame has the columns in ``colnames``: "time" and "M_D_H" as
    strings (leading zeros preserved), "temp" as float with the 9999
    placeholder turned into NaN, and "precip" as the extracted AA1 text or NaN.
    """
    df = pd.read_fwf(path, names=colnames, colspecs=colspecs, header=None,
                     encoding="latin_1", dtype={'time': object, 'M_D_H': object})

    # Keep only rows where month and day are in growing season. An explicit
    # format avoids per-value format guessing on the YYYYMMDDHHMM strings.
    stamps = pd.to_datetime(df["time"], format="%Y%m%d%H%M")
    df = df[stamps.dt.strftime('%m-%d').isin(growseason)]

    # Remove duplicate hours, keep only the first measurement per hour.
    # Reassigning on a copy avoids mutating a filtered slice in place.
    df = df.drop_duplicates(subset="M_D_H", keep="first").copy()

    # Get precipitation data (or NaN if AA1 is not in extra data section).
    # A single per-element pass works whatever dtype the column was read as.
    df["precip"] = df["precip"].map(_extract_aa1)

    # Replace placeholder 9999 with NaN values. The result must be assigned
    # back: calling replace(inplace=True) on an astype() copy changes nothing.
    # NOTE(review): with the 4-character temp field above, the placeholder may
    # never read as 9999 -- confirm against real data.
    df["temp"] = df["temp"].astype(float).replace(9999.0, np.nan)
    return df


def _gaps_within_limit(df, limit):
    """Return True if no NaN survives a forward fill of at most *limit* rows.

    The first *limit* rows are ignored because a forward fill cannot repair
    NaNs that have no earlier value to copy from.
    """
    filled = df.ffill(limit=limit)
    return not filled.iloc[limit:].isnull().to_numpy().any()


def _year_site(path):
    """Build the "<last>_<second to last>" label from the "-"-separated path."""
    parts = path.split("-")
    return parts[-1] + "_" + parts[-2]


def _combine_on_time(columns):
    """Align the per-file Series on their time index into one DataFrame.

    The result has a "time" column followed by one column per Series, with
    rows sorted by time. With no Series it is an empty frame holding only
    the "time" column.
    """
    if not columns:
        return pd.DataFrame(columns=["time"])
    combined = pd.concat(columns, axis=1).sort_index()
    return combined.rename_axis("time").reset_index()


temp_columns = []
precip_columns = []

for name in fnames:
    df = _read_growing_season(name)

    # Skip files that have gaps bigger than crit_rows
    if not _gaps_within_limit(df, crit_rows):
        continue

    # Get the year and site name from the filename
    year_site = _year_site(name)

    # int64 is spelled out because 12-digit timestamps do not fit in 32 bits
    hours = pd.Index(df["time"].astype("int64"), name="time")

    # Name the precipitation and temperature data by year and ID
    temp_columns.append(pd.Series(df["temp"].to_numpy(), index=hours, name=year_site))
    precip_columns.append(
        pd.Series(df["precip"].astype(float).to_numpy(), index=hours, name=year_site))

# Align everything once at the end; merging inside the loop re-copies the
# growing master dataframe for every file.
df_temp_all = _combine_on_time(temp_columns)
df_precip_all = _combine_on_time(precip_columns)