## Exercise with Data Wrangling
The entire code now runs in **18.669 seconds** (original code runs in **166.24 seconds** on my PC).

### Things I did to make the code faster
- Use `read_fwf` instead of `read_csv`.
- I got rid of all the `for` loops except for one (`for` loops are slow in Python).  
- I check the data for gaps *before* analyzing it and adding it to the master dataframe.

In [1]:
import numpy as np
import pandas as pd
import glob
import time as ti

In [2]:
t0 = ti.time()

In [3]:
# list of breakpoints and column names from ISH_Manual.PDF
years = np.arange(1961, 1966)
dateparse = lambda dates: [pd.datetime.strptime(d, "%Y%m%d%H") for d in dates]

colnames = ["time", "temp", "precip"]
colspecs = [(15,25), (87,91), (105, 8193)]

crit_rows = 3 # Maximum allowed missing hours
season_start, season_end = '05-01-', '10-31-'

df_temp_all = pd.DataFrame(columns=["time"])
df_precip_all = pd.DataFrame(columns=["time"])

for year in years:
    times = pd.date_range(season_start+str(year), season_end+str(year), freq="1H")
    fnames = glob.glob("./data/"+str(year)+"/*")
    for name in fnames:
        # Read in data file
        df = pd.read_fwf(name, names=colnames, colspecs=colspecs, header=None, index_col="time",
                         encoding="latin_1", dtype={'temp':int, 'precip':str}, 
                         parse_dates=True, date_parser=dateparse)

        # Remove duplicate hours, keep only the first measurement per hour
        df = df[df.index.duplicated(keep="first") == False]
        
        # Add in missing time values (corrects for leap years) and keep only growing season
        df = df.reindex(times, fill_value=np.nan)
        
        # Get precipitation data (or NaN if AA1 is not in extra data section)
        df["precip"] = df[df['precip'].str.find("AA1")!=-1]['precip'].str.split("AA1").str.get(1).str.slice(5, 8)

        # Replace placeholder 9999 with NaN values
        df["temp"].replace({9999: np.nan}, inplace=True)

        # If there are no gaps bigger than crit_rows, then process data
        if df.replace(np.nan, 'X', limit=crit_rows).iloc[crit_rows:].isnull().sum().sum() == 0:

            # Get the year and site name from the filename
            year_site = name.split("-")[-1]+"_"+name.split("-")[-2]    

            # Rename the precipitation and temperature data by year and ID
            temp = pd.DataFrame({year_site:df["temp"].astype(float), "time":df.index.values}) 
            precip = pd.DataFrame({year_site:df["precip"].astype(float), "time":df.index.values}) 

            # Merge the data onto the master dataframes
            df_temp_all = temp.merge(df_temp_all, how="outer", on="time", sort=False)
            df_precip_all = precip.merge(df_precip_all, how="outer", on="time", sort=False)

In [4]:
print(ti.time()-t0, "seconds to run this code.")

18.669329404830933 seconds to run this code.


In [5]:
df_temp_all.head()

Unnamed: 0,1965_13962,time,1965_23129,1965_25713,1965_13722,1965_25308,1965_26411,1965_13723,1965_26615,1965_13889,...,1961_12924,1961_13959,1961_23044,1961_23034,1961_26411,1961_13962,1961_23023,1961_25308,1961_03940,1961_25501
0,25.0,1965-05-01 00:00:00,20.0,2.0,20.0,,3.0,21.0,,21.0,...,,,,,,,,,,
1,,1965-05-01 01:00:00,,,17.0,10.0,,,1.0,20.0,...,,,,,,,,,,
2,,1965-05-01 02:00:00,,,16.0,,,,,18.0,...,,,,,,,,,,
3,17.0,1965-05-01 03:00:00,15.0,2.0,15.0,,6.0,17.0,,17.0,...,,,,,,,,,,
4,,1965-05-01 04:00:00,,,13.0,9.0,,,1.0,17.0,...,,,,,,,,,,


In [6]:
df_precip_all.head()

Unnamed: 0,1965_13962,time,1965_23129,1965_25713,1965_13722,1965_25308,1965_26411,1965_13723,1965_26615,1965_13889,...,1961_12924,1961_13959,1961_23044,1961_23034,1961_26411,1961_13962,1961_23023,1961_25308,1961_03940,1961_25501
0,95.0,1965-05-01 00:00:00,,395.0,95.0,,95.0,95.0,,95.0,...,,,,,,,,,,
1,,1965-05-01 01:00:00,,,95.0,95.0,,,,95.0,...,,,,,,,,,,
2,,1965-05-01 02:00:00,,,95.0,,,,,95.0,...,,,,,,,,,,
3,95.0,1965-05-01 03:00:00,,395.0,95.0,,95.0,95.0,,95.0,...,,,,,,,,,,
4,,1965-05-01 04:00:00,,,95.0,95.0,,,,95.0,...,,,,,,,,,,
