In [19]:
import pandas as pd
import numpy as np

In [None]:
# Load the NYC temperature readings: a single column, no header row.
temps = pd.read_csv("../data/nyc-temps.txt", header=None).squeeze()
print(temps.count())
# The hour-of-day label cycles 0, 3, ..., 21 (assumes one reading every three
# hours, starting at midnight). It is derived from the row position so the
# column always has exactly one label per row, whatever the number of
# readings, instead of hard-coding a single leftover midnight reading.
hour_of_day = [3 * (position % 8) for position in range(len(temps))]
df = pd.DataFrame({"temp": temps, "hour": hour_of_day})
df.describe()

729


Unnamed: 0,temp,hour
count,729.0,729.0
mean,-1.050754,10.485597
std,5.022904,6.884855
min,-14.0,0.0
25%,-4.0,3.0
50%,0.0,9.0
75%,2.0,15.0
max,12.0,21.0


In [None]:
# Set the 3am and 6am readings to NaN.
# `Series.mask` is used rather than `df.loc[...] = np.nan`: if the temperature
# column is integer-typed, assigning NaN into it in place relies on silent
# upcasting, which recent pandas deprecates (PDEP-6). `mask` returns a new,
# float-capable column with NaN wherever the condition is True.
is_dropped_hour = df["hour"].isin([3, 6])
df["temp"] = df["temp"].mask(is_dropped_hour)
df["temp"].describe()

count    547.000000
mean      -1.049360
std        5.023328
min      -14.000000
25%       -4.000000
50%        0.000000
75%        2.000000
max       12.000000
Name: temp, dtype: float64

In [22]:
# Fill the missing readings by linear interpolation (the pandas default,
# spelled out here for clarity) and summarise the filled frame.
df2 = df.interpolate(method="linear")
df2.describe()

Unnamed: 0,temp,hour
count,729.0,729.0
mean,-1.050754,10.485597
std,5.022904,6.884855
min,-14.0,0.0
25%,-4.0,3.0
50%,0.0,9.0
75%,2.0,15.0
max,12.0,21.0


# Extension questions
1. How does the behaviour of interpolate change if you use `method='nearest'`?
2. Assume the equipment works fine around the clock but fails to record readings at -1 degrees and below. Are the interpolated values similar to the real (missing) values they replace? Why or why not?
3. A cheap solution to interpolation is to replace `NaN` values with the column's mean. Do this (again treating readings of -1 degrees and below as missing), and compare the new mean and median with the original ones. Again, why are (or aren't) these values similar to the original ones?

In [None]:
# 1. Changing interpolation method to nearest: each NaN takes the value of the
#    closest non-missing row instead of a linear blend of its neighbours.
df2 = df.interpolate(method="nearest")
df2.describe()
# Original observation: doesn't seem to change at all for this data.
# NOTE(review): the recorded summary is identical to the summary of the
# unmodified data, so `df` may not have contained any NaNs when this cell was
# last run. Re-run after the cell that sets the 3am/6am readings to NaN before
# relying on this conclusion.

Unnamed: 0,temp,hour
count,729.0,729.0
mean,-1.050754,10.485597
std,5.022904,6.884855
min,-14.0,0.0
25%,-4.0,3.0
50%,0.0,9.0
75%,2.0,15.0
max,12.0,21.0


In [26]:
# 2. Simulate equipment that fails to record readings of -1 degrees and below.
# The data is re-read so this frame starts from the untouched readings rather
# than from `df`, whose 3am/6am values were already blanked out.
temps = pd.read_csv("../data/nyc-temps.txt", header=None).squeeze()
# Hour-of-day label cycles 0, 3, ..., 21 (assumes one reading every three
# hours, starting at midnight); derived from row position so it always has
# one label per row regardless of how many readings there are.
hour_of_day = [3 * (position % 8) for position in range(len(temps))]
dfneg = pd.DataFrame({"temp": temps, "hour": hour_of_day})
# `Series.mask` replaces the matching readings with NaN without relying on the
# silent int -> float upcast that in-place NaN assignment needs (deprecated in
# recent pandas, PDEP-6).
dfneg["temp"] = dfneg["temp"].mask(dfneg["temp"] <= -1)
dfneg.describe()

Unnamed: 0,temp,hour
count,377.0,729.0
mean,2.763926,10.485597
std,2.691589,6.884855
min,0.0,0.0
25%,1.0,3.0
50%,2.0,9.0
75%,5.0,15.0
max,12.0,21.0


In [None]:
# Fill the gaps in `dfneg` by linear interpolation and summarise the result.
dfneg_linear = dfneg.interpolate(method="linear")
dfneg_linear.describe()
# This is a poor fit for this failure mode: every surviving reading is above
# -1, and a linear blend of such values can never fall to -1 or below, so the
# filled-in values are systematically warmer than the readings they replace.
# Note that NaNs before the first valid reading are left unfilled by default.

Unnamed: 0,temp,hour
count,721.0,729.0
mean,2.022191,10.485597
std,2.345483,6.884855
min,0.0,0.0
25%,0.209524,3.0
50%,1.0,9.0
75%,3.0,15.0
max,12.0,21.0


In [None]:
# 3. Replace every NaN with its column's mean and summarise the result.
# This is biased for the same reason as interpolation: the mean is computed
# only from the surviving (warmer) readings, so all the cold readings that
# went missing are replaced by one warm value.
column_means = dfneg.mean()
dfneg.fillna(column_means).describe()

Unnamed: 0,temp,hour
count,729.0,729.0
mean,2.763926,10.485597
std,1.934359,6.884855
min,0.0,0.0
25%,2.0,3.0
50%,2.763926,9.0
75%,2.763926,15.0
max,12.0,21.0
