## Exercise with Data Wrangling
The entire code now runs in **76.97 seconds** (the original code runs in **2 minutes 49 seconds** on my PC).

### Things I did to make the code faster
- I used `read_fwf` instead of `read_csv`.
- I got rid of all the `for` loops except one (`for` loops are slow in Python).
- I check the data for gaps *before* analyzing it and adding it to the master dataframe.

In [1]:
import numpy as np
import pandas as pd
import glob
import time as ti

In [None]:
# Build two wide tables (temperature and precipitation), one column per
# station-year file, keyed on an hourly "time" string for the growing season.
t0 = ti.time()

fnames = glob.glob("./data/*/*")

# list of breakpoints and column names from ISH_Manual.PDF
colnames = ["time", "temp", "precip"]
# NOTE(review): the temp field (87, 91) is only four characters wide; confirm
# against the manual that the 9999 placeholder replaced below can actually
# appear in it (e.g. if the field carries a leading sign character).
colspecs = [(15, 27), (87, 91), (105, 8193)]

crit_rows = 3  # Maximum allowed missing hours
# NOTE(review): the end stamp resolves to midnight, so only the first hour of
# October 31 is included -- confirm that is intended.
season_start, season_end = '05-01-', '10-31-'

df_temp_all = pd.DataFrame(columns=["time"])
df_precip_all = pd.DataFrame(columns=["time"])

for name in fnames:
    # Read in data file
    df = pd.read_fwf(name, names=colnames, colspecs=colspecs, header=None,
                     encoding="latin_1",
                     dtype={'time': str, 'temp': int, 'precip': str})

    # An empty file has no first row to take the year from; skip it instead
    # of aborting the whole run.
    if df.empty:
        continue

    # Remove duplicate hours, keep only the first measurement per hour
    # (characters 4-10 of the time stamp are MMDDHH).
    hour_key = df["time"].astype(str).str.slice(4, 10)
    df = df[~hour_key.duplicated(keep="first")]

    # Add in missing time values (corrects for leap years) and keep only
    # growing season.
    # NOTE(review): observations whose minutes are not 00 do not match the
    # on-the-hour grid and are dropped by the reindex -- confirm the input
    # files always report on the hour, or floor the stamps first.
    year = df["time"].iloc[0][:4]
    stamps = pd.to_datetime(df["time"], format="%Y%m%d%H%M")
    df = df.set_index(pd.DatetimeIndex(stamps))
    season = pd.date_range(season_start + year, season_end + year, freq="h")
    df = df.reindex(season)
    df["time"] = df.index.astype(str)
    # drop=True: keeping the old index as an "index" column would make every
    # merge below accumulate suffixed "index_x"/"index_y" copies.
    df = df.reset_index(drop=True)

    # Get precipitation data (or NaN if AA1 is not in extra data section).
    # Rows without "AA1" split into a single piece, so .get(1) is already NaN.
    # NOTE(review): confirm that characters 5-8 after "AA1" are the intended
    # sub-field of the precipitation group.
    df["precip"] = df["precip"].str.split("AA1").str.get(1).str.slice(5, 8)

    # Replace placeholder 9999 with NaN values (assigned back rather than
    # replaced in place on a column selection, which may not reach `df`).
    df["temp"] = df["temp"].replace(9999, np.nan)

    # Skip the file if, after forward-filling at most crit_rows consecutive
    # hours, any value is still missing. The first crit_rows rows are ignored
    # because a forward fill cannot reach a gap at the very start.
    filled = df.ffill(limit=crit_rows)
    if filled.iloc[crit_rows:].isnull().values.any():
        continue

    # Get the year and site name from the filename
    name_parts = name.split("-")
    year_site = name_parts[-1] + "_" + name_parts[-2]

    # Rename the precipitation and temperature data by year and ID
    temp = df.drop(columns="precip").rename(columns={"temp": year_site})
    precip = df.drop(columns="temp").rename(columns={"precip": year_site})

    # Merge the data onto the master dataframes
    df_temp_all = temp.merge(df_temp_all, how="outer", on="time", sort=False)
    df_precip_all = precip.merge(df_precip_all, how="outer", on="time", sort=False)

print(ti.time()-t0, "seconds to run this code block.")

In [None]:
# Preview the first rows of the merged temperature table.
df_temp_all.head()

In [None]:
# Preview the first rows of the merged precipitation table.
df_precip_all.head()