In [1]:
# load packages 
import os 
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from data/year_prediction.csv.
# The CSV's `label` column is renamed to `year` for the rest of the notebook.
timbre_path = os.path.join('data', 'year_prediction.csv')
timbre = (
    pd.read_csv(timbre_path)
    .rename(columns={'label': 'year'})
)
timbre.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,...,TimbreCovariance69,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [3]:
# preprocess
# Keep only the year plus the 12 average-timbre features.
timbre_cols = [f"TimbreAvg{i}" for i in range(1, 13)]

# 1923 is missing between 1922 and 1924, so the exploration starts at 1924.
# An explicit lower bound (instead of `!= 1922`) matches that intent for any
# earlier year, and `.copy()` gives an independent frame so columns can be
# added to it later without writing into a slice of `timbre`.
timbre_avgs = timbre.loc[timbre["year"] >= 1924, ["year"] + timbre_cols].copy()

# Mean of every timbre feature per year, with `year` restored as a column.
timbre_avgs_by_year = timbre_avgs.groupby("year").mean().reset_index()
timbre_avgs_by_year.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12
0,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835
1,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128
2,1926,32.57212,-126.464657,88.917274,7.843321,-5.841375,13.394603,-31.141998,9.859703,-17.412415,4.974054,0.624522,-2.053934
3,1927,31.229186,-110.711278,59.213804,4.108315,-9.53837,9.985781,-21.01109,13.410433,-25.118038,4.396292,-0.394336,-2.645019
4,1928,31.499581,-94.417082,27.262832,-8.419641,-16.803719,9.830962,-34.226483,8.103334,-2.779955,7.653416,-0.274048,8.57516


## Heatmap: high level trend of Timbre usage across years

In [4]:
# Reshape the per-year averages from wide to long form for the heatmap:
# one row per (year, TimbreType) pair with its value in "Average Values".
timbre_avgs_long = timbre_avgs_by_year.melt(
    id_vars="year",
    var_name="TimbreType",
    value_name="Average Values",
)

In [5]:
# Heatmap of the average timbre features: one column per year, one row per
# feature (rows ordered as in timbre_cols), colour = average value.
alt.Chart(timbre_avgs_long).mark_rect().encode(
    alt.X('year:O'),
    alt.Y('TimbreType:O', sort=timbre_cols),
    alt.Color('Average Values:Q'),
    tooltip=['year', 'TimbreType', 'Average Values'],
).properties(title='Average Timbre Features by Years')

# TODO: explore what Timbre2 and Timbre3 are in particular
# TODO: list a couple more events to explore (like 9/11 yes! this would be great)

### Heatmap Conclusion 
* Probably not the most organized visualization, but it does show some interesting general trends
* Observation 1: Timbre1 is consistently at a relatively high value
* Observation 2: 
    * Timbre2 was very low from the 1920s to the 1950s, but gradually stabilizes
    * Timbre3 was very high from the 1920s to the 1950s, but also gradually stabilizes, at a value very similar to Timbre2's
* Observation 3: with the exception of Timbre1, which stays at a relatively high value, all other timbre features become more and more uniform as time goes on
    * correlate to stability of society??
    * surprisingly: nothing significant for 2001 (9/11) --- maybe look more into the specific year to explore


## Decade-wise Distribution

#### Dataset Filtering: 1930 ~ 2010

In [6]:
# compute decades
def transform_year_to_decade(year):
    """Return a decade label of the form 'd<digit>0' for a four-digit year.

    The digit is the third character of ``str(year)`` (the tens digit of a
    four-digit year), e.g. 1987 -> 'd80' and 2001 -> 'd00'. The century is
    not part of the label.
    """
    tens_digit = str(year)[2]
    return f"d{tens_digit}0"

# Label every row with its decade, then build the long-form decade averages for plotting.
# `assign` rebinds timbre_avgs to a new frame that carries the `decade` column
# rather than setting a column in place on a frame obtained by filtering
# `timbre`, which can raise pandas' SettingWithCopyWarning.
# `Series.map` applies the converter element-wise.
timbre_avgs = timbre_avgs.assign(decade=timbre_avgs['year'].map(transform_year_to_decade))

# Mean of each timbre feature per decade (`year` is dropped so it is not averaged).
timbre_avgs_by_decade = timbre_avgs.drop(columns=['year']).groupby('decade').mean().reset_index()

# One row per (decade, TimbreType) pair with its value in "Average Values".
timbre_avgs_by_decade_long = timbre_avgs_by_decade.melt("decade", var_name='TimbreType', value_name='Average Values')

In [7]:
# ### create dataset that shows decade information
# decade_dict = dict()
# # d20 = timbre_avgs[(timbre_avgs["label"] >= 1920) & (timbre_avgs["label"] < 1930)].drop("label", axis = 1).sum()
# d30 = timbre_avgs[(timbre_avgs["label"] >= 1930) & (timbre_avgs["label"] < 1940)].drop("label", axis = 1).mean()
# d40 = timbre_avgs[(timbre_avgs["label"] >= 1940) & (timbre_avgs["label"] < 1950)].drop("label", axis = 1).mean()
# d50 = timbre_avgs[(timbre_avgs["label"] >= 1950) & (timbre_avgs["label"] < 1960)].drop("label", axis = 1).mean()
# d60 = timbre_avgs[(timbre_avgs["label"] >= 1960) & (timbre_avgs["label"] < 1970)].drop("label", axis = 1).mean()
# d70 = timbre_avgs[(timbre_avgs["label"] >= 1970) & (timbre_avgs["label"] < 1980)].drop("label", axis = 1).mean()
# d80 = timbre_avgs[(timbre_avgs["label"] >= 1980) & (timbre_avgs["label"] < 1990)].drop("label", axis = 1).mean()
# d90 = timbre_avgs[(timbre_avgs["label"] >= 1990) & (timbre_avgs["label"] < 2000)].drop("label", axis = 1).mean()
# d00 = timbre_avgs[(timbre_avgs["label"] >= 2000) & (timbre_avgs["label"] < 2010)].drop("label", axis = 1).mean()
# # decade_dict["20s"] = d20
# decade_dict["30s"] = d30
# decade_dict["40s"] = d40
# decade_dict["50s"] = d50
# decade_dict["60s"] = d60
# decade_dict["70s"] = d70
# decade_dict["80s"] = d80
# decade_dict["90s"] = d90
# decade_dict["00s"] = d00

In [8]:
# decade_tim = pd.DataFrame(decade_dict).transpose().reset_index().melt("index", var_name = "TimbreType", value_name = "Average Values")
# # decade_tim

In [9]:
# Heatmap of the average timbre features per decade.
# Explicit x-axis order: d20 ... d90 first, then d00 and d10, so the labels
# from the 2000s and 2010s are placed after d90 instead of sorting first.
decade_seq = [f'd{tens}0' for tens in [*range(2, 10), 0, 1]]
alt.Chart(timbre_avgs_by_decade_long).mark_rect().encode(
    alt.X('decade:O', sort=decade_seq),
    alt.Y('TimbreType:O', sort=timbre_cols),
    alt.Color('Average Values:Q'),
    tooltip=['decade', 'TimbreType', 'Average Values'],
).properties(title='Average Timbre Features by Decade')

In [10]:
# TODO: add observations 