<a href="https://colab.research.google.com/github/yaobviously/perkins_temp/blob/main/perkins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [95]:
# Locations of the two input CSVs under /content/drive/MyDrive (presumably the
# mounted Google Drive in Colab - adjust when running elsewhere).
# load_transform_data() reads both files.
pop_csv = '/content/drive/MyDrive/Temperature_Perkins/Population Data.csv'
temp_csv = '/content/drive/MyDrive/Temperature_Perkins/Temperature Data.csv'

In [98]:
def load_transform_data(temp_path=None, pop_path=None):
  """
  Prepare the temperature and population datasets for visualization and
  analysis.

  Reads both CSVs, joins every temperature reading to its city's population
  row, adds a placeholder row for each station on every date it has no
  reading for, derives running monthly statistics, and forward-fills the
  remaining gaps within each city.

  Parameters
  ----------
    temp_path : str, path or file-like, optional
      Source of the temperature CSV; defaults to the notebook-level `temp_csv`.
    pop_path : str, path or file-like, optional
      Source of the population CSV; defaults to the notebook-level `pop_csv`.

  Returns
  -------
    DataFrame: indexed by date (unnamed index, chronological order), values
    rounded to 2 decimals. On top of the merged input columns it carries
    `month`, `year`, `avg_daily_mean_month`, `avg_daily_max_month`,
    `avg_daily_min_month` (running means per city and calendar month) and
    `this_month_high_for_loc`, `this_month_low_for_loc` (running max / min
    per city within each year-month).
  """

  # explicit sources win; otherwise fall back to the notebook-level paths
  temp_path = temp_csv if temp_path is None else temp_path
  pop_path = pop_csv if pop_path is None else pop_path

  temp_df = pd.read_csv(temp_path, parse_dates=['location_date'])

  pop_df = pd.read_csv(pop_path)

  # case normalizing pop columns
  pop_df.columns = pop_df.columns.str.lower()

  # dropping redundant columns
  col_drop = ['country_name', 'country_code', 'continent']
  temp_df = temp_df.drop(col_drop, axis=1)

  # renaming columns
  temp_df = temp_df.rename(columns={'temp_mean_c': 'mean_temp',
                                    'temp_min_c' : 'min_temp',
                                    'temp_max_c' : 'max_temp'})

  # making it easier to merge later: keep only the text before the first '/'
  temp_df['name'] = temp_df['name'].str.split('/').str[0]
  temp_df = temp_df.rename(columns={'name':'city'})

  # creating a city dictionary to find matching city pops. Albany is mapped to
  # Syracuse because it seems closest in terms of pop (for weighting),
  # comparable in terms of long/lat, and we are restricted to this dataset

  city_dict = {'St Louis' : 'St. Louis', 'NYC' : 'New York', "Chicago O'Hare" : 'Chicago',
               'Covington': 'Cincinnati', 'Wash DC' : 'Washington', 'Windsor Locks' : 'Hartford',
               'Albany' : 'Syracuse'
               }
  temp_df['city'] = temp_df['city'].map(city_dict).fillna(temp_df['city'])

  # merging the dataframes and undoing the Albany hack above
  # NOTE(review): this also relabels any station whose own city is Syracuse -
  # confirm the temperature data has none

  merged_df = pd.merge(temp_df, pop_df, how='left', on='city')
  merged_df = merged_df.set_index('location_date').sort_index()

  merged_df['city'] = merged_df['city'].replace('Syracuse', 'Albany')

  # finding the dates with no reading for each station and adding a
  # placeholder row (city / station_code / state only) for each of them

  days = merged_df.index.unique()

  def find_miss_days(station_code):

    station_rows = merged_df[merged_df['station_code'] == station_code]
    dates_missing = days.difference(station_rows.index)

    # .iloc[0] is explicit positional access; plain [0] on a date-indexed
    # Series depends on a deprecated integer fallback
    val_dict = {'city' : station_rows['city'].iloc[0],
                'station_code' : station_code,
                'state' : station_rows['state'].iloc[0]
                }

    return pd.DataFrame(val_dict, index=dates_missing)

  gap_frames = [find_miss_days(code) for code in merged_df['station_code'].unique()]
  # stations without gaps produce empty frames; keep them out of the concat
  gap_frames = [frame for frame in gap_frames if not frame.empty]

  # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
  df = pd.concat([merged_df] + gap_frames)

  # the date index has always been returned unnamed; keep it that way so
  # reset_index() output does not change for callers
  df.index = df.index.rename(None)

  # sort chronologically BEFORE the running statistics: the placeholder rows
  # sit at the end after the concat, and expanding over them in that position
  # would fold later readings into earlier dates
  df = df.sort_index(kind='mergesort')

  # extracting month/year from index to compute monthly expanding averages for each
  # location
  df['month'] = pd.DatetimeIndex(df.index).month
  df['year'] = pd.DatetimeIndex(df.index).year

  df['avg_daily_mean_month'] = (df
                               .groupby(['city', 'month'])['mean_temp']
                               .transform(lambda x: x.expanding().mean()))

  df['avg_daily_max_month'] = (df
                              .groupby(['city', 'month'])['max_temp']
                              .transform(lambda x: x.expanding().mean()))

  df['avg_daily_min_month'] = (df
                              .groupby(['city', 'month'])['min_temp']
                              .transform(lambda x: x.expanding().mean()))

  df['this_month_high_for_loc'] = (df
                                  .groupby(['city', 'year', 'month'])['max_temp']
                                  .transform(lambda x: x.expanding().max()))

  df['this_month_low_for_loc'] = (df
                                  .groupby(['city', 'year', 'month'])['min_temp']
                                  .transform(lambda x: x.expanding().min()))

  # filling in missing values with the city's previous row - more advanced
  # options are possible
  df = df.set_index('city', append=True)

  df = df.groupby(level=1).ffill()
  df = df.reset_index(level=1)

  df = df.round(2)
  return df


In [99]:
# Build the analysis frame once; the plotting / estimate helpers defined in
# later cells read this module-level `df`.
df = load_transform_data()

# Quick sanity check on the size of the result (rows, columns).
print("The shape of the new dataframe is:", df.shape)

The shape of the new dataframe is: (94353, 16)


In [103]:
def monthly_details(city='Sacramento'):
  """
  Plot one city's running monthly average of the daily mean temperature
  (the `avg_daily_mean_month` column of the global `df`) against date.

  Parameters
  ----------
    city : str
      Value to match in the `city` column.

  Returns
  -------
    The Axes object returned by `sns.lineplot`.
  """

  df_ = df[df['city'] == city]

  # lineplot can only resolve a column name when it is given the frame that
  # holds it, and it needs an x variable as well; the dates live in the index,
  # so expose them as a regular column first
  plot_df = df_.assign(date=df_.index)

  return sns.lineplot(data=plot_df, x='date', y='avg_daily_mean_month')

ValueError: ignored

In [113]:
def plot_city(city = 'Sacramento', duration='W'):
  """
  Chart a city's mean, max and min temperatures, each averaged over
  consecutive periods of the requested length.

  Parameters
  ----------
    city : str
      Value to match in the `city` column of the global `df`.
    duration : str
      Resampling frequency passed to `DataFrame.resample`
      (e.g. "D", "W", "M", "Y").

  Returns
  -------
    The object returned by `DataFrame.plot`, with one line per temperature
    column.
  """

  temp_cols = ['mean_temp', 'max_temp', 'min_temp']

  # rows of the requested city, temperature columns only
  city_temps = df.loc[df['city'] == city, temp_cols]

  # average every column within each resampling period
  period_means = city_temps.resample(duration).mean()

  return period_means.plot(figsize=(24,5), grid=True, legend=True)

In [None]:
# Smoke test: call plot_city with its defaults (Sacramento, weekly averages)
# to show that it works

plot_city()

In [82]:
def estimate_pop_(date = '2020-05-01', temp = 25):
  """
  Add up the population of every row in the global `df` that falls on the
  given date and has a maximum temperature at or above the threshold.

  Parameters
  ----------
    date: value compared for equality against the date index
          (e.g. a 'YYYY-MM-DD' string)
    temp: number, inclusive lower bound on `max_temp`

  Returns
  -------
    total population: sum of the `population` column over the matching rows
  """

  on_date = df.index == date
  warm_enough = df['max_temp'] >= temp

  matching_pop = df.loc[on_date & warm_enough, 'population']
  return matching_pop.sum()
