In [1]:
# Import dependencies and set up pathlib for loading the CSV files
import pandas as pd
from pathlib import Path
import random
import matplotlib.pyplot as plt
import scipy.stats as st

In [8]:
# Location of the IRAM dataset, relative to the notebook's working directory
load_iram_data = Path("Resources/iram_data.csv")

# Read the CSV into a DataFrame.
# index_col=False tells pandas not to use the first column of the file as the index.
iram_df = pd.read_csv(filepath_or_buffer=load_iram_data, index_col=False)

# Show the loaded DataFrame
iram_df

Unnamed: 0,indicator_id,region_id,date,value,region_type,region,City,State
0,IRAM,845172,2023-08-31,153.0,metro,"Winfield, KS",Winfield,KS
1,IRAM,845172,2023-07-31,143.0,metro,"Winfield, KS",Winfield,KS
2,IRAM,845172,2023-06-30,132.0,metro,"Winfield, KS",Winfield,KS
3,IRAM,845172,2023-05-31,117.0,metro,"Winfield, KS",Winfield,KS
4,IRAM,845172,2023-04-30,103.0,metro,"Winfield, KS",Winfield,KS
...,...,...,...,...,...,...,...,...
60915,IRAM,394297,2018-07-31,294.0,metro,"Aberdeen, SD",Aberdeen,SD
60916,IRAM,394297,2018-06-30,263.0,metro,"Aberdeen, SD",Aberdeen,SD
60917,IRAM,394297,2018-05-31,231.0,metro,"Aberdeen, SD",Aberdeen,SD
60918,IRAM,394297,2018-04-30,205.0,metro,"Aberdeen, SD",Aberdeen,SD


In [9]:
# The United States Census Bureau divides the USA into 4 Regions.
# Source: https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

# Northeast Region = 9 States
region_northeast = ('CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT')

# Midwest Region = 12 States
region_midwest = ('IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MO', 'ND', 'NE', 'OH', 'SD', 'WI')

# South Region = 17 entries (16 States plus the District of Columbia, i.e. DC)
region_south = ('AL', 'AR', 'DC', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'OK', 'SC', 'TN', 'TX', 'VA', 'WV')

# West Region = 13 States
region_west = ('AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY')

# Region label -> state abbreviations belonging to that region
census_regions = {
    "Northeast": region_northeast,
    "Midwest": region_midwest,
    "South": region_south,
    "West": region_west,
}

# Invert into a flat lookup: state abbreviation -> region label
state_to_region = {
    state: region_name
    for region_name, states in census_regions.items()
    for state in states
}

# Tag every row with its Census Region.
# A 'State' value that appears in none of the tuples above is left as NaN.
iram_df['Census Region'] = iram_df['State'].map(state_to_region)

iram_df

Unnamed: 0,indicator_id,region_id,date,value,region_type,region,City,State,Census Region
0,IRAM,845172,2023-08-31,153.0,metro,"Winfield, KS",Winfield,KS,Midwest
1,IRAM,845172,2023-07-31,143.0,metro,"Winfield, KS",Winfield,KS,Midwest
2,IRAM,845172,2023-06-30,132.0,metro,"Winfield, KS",Winfield,KS,Midwest
3,IRAM,845172,2023-05-31,117.0,metro,"Winfield, KS",Winfield,KS,Midwest
4,IRAM,845172,2023-04-30,103.0,metro,"Winfield, KS",Winfield,KS,Midwest
...,...,...,...,...,...,...,...,...,...
60915,IRAM,394297,2018-07-31,294.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest
60916,IRAM,394297,2018-06-30,263.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest
60917,IRAM,394297,2018-05-31,231.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest
60918,IRAM,394297,2018-04-30,205.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest


In [14]:
# Seasons follow the meteorological definition (whole calendar months)
# Source: https://www.timeanddate.com/calendar/aboutseasons.html

# Parse 'date' into datetimes so the '.dt' accessor can be used below
iram_df["date"] = pd.to_datetime(iram_df["date"])

# Season label -> month numbers that make up that season
season_months = {
    "Spring": (3, 4, 5),
    "Summer": (6, 7, 8),
    "Autumn": (9, 10, 11),
    "Winter": (12, 1, 2),
}

# Invert into a flat lookup: month number -> season label
month_to_season = {
    month: season
    for season, months in season_months.items()
    for month in months
}

# Tag every row with the season of its month
iram_df["Season"] = iram_df["date"].dt.month.map(month_to_season)

# 'Year' is the calendar year of the date, so one winter (Dec-Feb) spans two 'Year' values
iram_df["Year"] = iram_df["date"].dt.year

iram_df

Unnamed: 0,indicator_id,region_id,date,value,region_type,region,City,State,Census Region,Season,Year
0,IRAM,845172,2023-08-31,153.0,metro,"Winfield, KS",Winfield,KS,Midwest,Summer,2023
1,IRAM,845172,2023-07-31,143.0,metro,"Winfield, KS",Winfield,KS,Midwest,Summer,2023
2,IRAM,845172,2023-06-30,132.0,metro,"Winfield, KS",Winfield,KS,Midwest,Summer,2023
3,IRAM,845172,2023-05-31,117.0,metro,"Winfield, KS",Winfield,KS,Midwest,Spring,2023
4,IRAM,845172,2023-04-30,103.0,metro,"Winfield, KS",Winfield,KS,Midwest,Spring,2023
...,...,...,...,...,...,...,...,...,...,...,...
60915,IRAM,394297,2018-07-31,294.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest,Summer,2018
60916,IRAM,394297,2018-06-30,263.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest,Summer,2018
60917,IRAM,394297,2018-05-31,231.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest,Spring,2018
60918,IRAM,394297,2018-04-30,205.0,metro,"Aberdeen, SD",Aberdeen,SD,Midwest,Spring,2018


In [18]:
# Group the main DataFrame by 'Season' (this is a GroupBy object, reused below)
groupby_seasons_df = iram_df.groupby("Season")

# pandas aggregation name -> heading shown in the Summary Statistics table
summary_headings = {
    "mean": "Active Property Sales Mean",
    "median": "Active Property Sales Median",
    "var": "Active Property Sales Variance",
    "std": "Active Property Sales Std. Dev.",
    "sem": "Active Property Sales Std. Err.",
}

# For each season, summarise the 'value' column:
# Mean / Median / Variance / Std Deviation / Std Error of Mean.
# '.agg()' runs all of the listed aggregations in a single call; the resulting
# columns are then renamed from the aggregation names to the headings above.
seasons_summary = (
    groupby_seasons_df["value"]
    .agg(list(summary_headings))
    .rename(columns=summary_headings)
)

# Display Summary Statistics DataFrame
seasons_summary

Unnamed: 0_level_0,Active Property Sales Mean,Active Property Sales Median,Active Property Sales Variance,Active Property Sales Std. Dev.,Active Property Sales Std. Err.
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Autumn,1475.165566,377.0,21186000.0,4602.825592,38.992997
Spring,1169.649508,303.0,14742890.0,3839.647041,29.915191
Summer,1307.332726,347.0,17564780.0,4191.035564,32.665795
Winter,1275.860793,308.0,16620290.0,4076.799059,34.392654
