In [23]:
# load packages 
import os 
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
from altair_saver import save

In [24]:
# load data
timbre_path = os.path.join('data', 'year_prediction.csv')
timbre = pd.read_csv(timbre_path).rename(columns={'label': 'year'})  # rename column label into year
timbre.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,...,TimbreCovariance69,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


## Heatmap: high level trend of Timbre usage across years

In [25]:
# preprocess 
# extract interesting columns 
timbre_cols = [f"TimbreAvg{i}" for i in range(1, 13)]
timbre_avgs = timbre[["year"] + timbre_cols] 

# since 1923 is missing between 1922 and 1924, we will just start our exploration from year 1924
timbre_avgs = timbre_avgs[timbre_avgs["year"] != 1922]
timbre_avgs_by_year = timbre_avgs.groupby("year").mean().reset_index()
timbre_avgs_by_year.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12
0,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835
1,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128
2,1926,32.57212,-126.464657,88.917274,7.843321,-5.841375,13.394603,-31.141998,9.859703,-17.412415,4.974054,0.624522,-2.053934
3,1927,31.229186,-110.711278,59.213804,4.108315,-9.53837,9.985781,-21.01109,13.410433,-25.118038,4.396292,-0.394336,-2.645019
4,1928,31.499581,-94.417082,27.262832,-8.419641,-16.803719,9.830962,-34.226483,8.103334,-2.779955,7.653416,-0.274048,8.57516


In [26]:
# combine timbre_avgs_by_year into a long dataset for Heatmap creation
timbre_avgs_long = timbre_avgs_by_year.melt("year", var_name = "TimbreType", value_name = "Average Values")

In [27]:
# heatmap, plot average timbre features by year 
yearly_timbre = alt.Chart(timbre_avgs_long).mark_rect().encode(
    x='year:O',
    y=alt.Y('TimbreType:O', sort=timbre_cols), 
    color='Average Values:Q',
    tooltip=['year', 'TimbreType', 'Average Values']
).properties(
    title='Average Timbre Features by Years',
    width=2000, height=300
)


In [28]:
# TODO: explore what are Timbre2 and Timbre 3 in particular (I think I know?; link in parent README.md)
    # first dimension represents the average loudness of the segment; 
    # second emphasizes brightness; 
    # third is more closely correlated to the flatness of a sound
    # fourth to sounds with a stronger attack

In [29]:
# TODO: how to save altair plot in high resolution? 
    # pip install aitair_saver
    # https://altair-viz.github.io/user_guide/saving_charts.html
    # not to sure why this is not working
# yearly_timbre.save("yearly_timbre.png", scale_factor=2.0)

### Heatmap Conclusion 
* Probably not the most organized visualization, but it does show some interesting general trends
* Observation 1: Timbre1 is consistently at a relatively high value
* Observation 2: 
    * Timbre2 was very low from 1920s to 1950s, but gradually stabalizes 
    * Timbre3 was very high from 1920s to 1950s, but also gradually stabalizes, to a very similar value as Timbre2 actaully
* Observation 3: with the exception of Timbre1, which is at the relatively high value, all other timbre is becoming more and more uniformly distributed as time period goes on
    * correlate to stability of society??
    * surprisingly: nothing significant for 2001 (9/11) --- maybe look more into the specific year to explore


## Decade-wise Distribution

#### Dataset Filtering: 1930 ~ 2010

In [30]:
# compute decades 
def transform_year_to_decade(year):
    """ convert year into decade """
    return 'd' + str(year)[2] + '0'

# transform and obtain long form data for plotting 
timbre_avgs['decade'] = timbre_avgs.year.transform(transform_year_to_decade)
timbre_avgs_by_decade = timbre_avgs.drop(columns=['year']).groupby('decade').mean().reset_index()
timbre_avgs_by_decade_long = timbre_avgs_by_decade.melt("decade", var_name='TimbreType', value_name='Average Values')

In [31]:
# plot timbre feature averages by decades 
decade_seq = [f'd{s}0' for s in [2, 3, 4, 5, 6, 7, 8, 9, 0, 1]]
alt.Chart(timbre_avgs_by_decade_long).mark_rect().encode(
    x=alt.X('decade:O', sort=decade_seq),
    y=alt.Y('TimbreType:O', sort=timbre_cols),
    color='Average Values:Q',
    tooltip=['decade', 'TimbreType', 'Average Values']
).properties(
    title='Average Timbre Features by Decade'
)

In [12]:
# TODO: add observations to the decade scale  




#### Heatmap of Covariance

In [32]:
# TODO: add a heatmap plotting the covariance of the 12 columns (using the year scale, not the decade scale; pick either lower triangular or upper)
covar = pd.concat([timbre[["year"]], timbre.iloc[:, 13:len(timbre.columns)]], axis = 1)    # dataframe of all covariances
covar.head()


Unnamed: 0,year,TimbreCovariance1,TimbreCovariance2,TimbreCovariance3,TimbreCovariance4,TimbreCovariance5,TimbreCovariance6,TimbreCovariance7,TimbreCovariance8,TimbreCovariance9,...,TimbreCovariance69,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78
0,2001,10.20556,611.10913,951.0896,698.11428,408.98485,383.70912,326.51512,238.11327,251.42414,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,44.38997,2056.93836,605.40696,457.41175,777.15347,415.6488,746.47775,366.4532,317.82946,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,7.46586,699.54544,1016.00954,594.06748,355.73663,507.39931,387.6991,287.15347,112.37152,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,37.64085,2174.08189,697.43346,459.24587,742.78961,229.30783,387.89697,249.06662,245.8987,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,47.32082,894.28471,809.86615,318.78559,435.04497,341.61467,334.30734,322.99589,190.61921,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


#### Ridgeline Plot for TimbreAvg2 & TimbreAvg3

In [36]:
# TODO: add a ridgeline plot for TimbreAvg2 and TimbreAvg3 

timbre_avgs

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12,decade
0,2001,49.94357,21.47114,73.07750,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,-2.46783,3.32136,-2.31521,d00
1,2001,48.73215,18.42930,70.32679,12.94636,-10.32437,-24.83777,8.76630,-0.92019,18.76548,4.59210,2.21920,0.34006,d00
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.54940,-3.27872,-2.35035,16.07017,1.39518,2.73553,0.82804,d00
3,2001,48.24750,-1.89837,36.29772,2.58776,0.97170,-26.21683,5.05097,-10.34124,3.55005,-6.36304,6.63016,-3.35142,d00
4,2001,50.97020,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,0.93609,1.60923,2.19223,d00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515340,2006,51.28467,45.88068,22.19582,-5.53319,-3.61835,-16.36914,2.12652,5.18160,-8.66890,2.67217,0.45234,2.51380,d00
515341,2006,49.87870,37.93125,18.65987,-3.63581,-27.75665,-18.52988,7.76108,3.56109,-2.50351,2.20175,-0.58487,-9.78657,d00
515342,2006,45.12852,12.65758,-38.72018,8.80882,-29.29985,-2.28706,-18.40424,-22.28726,-4.52429,-11.46411,3.28514,1.99943,d00
515343,2006,44.16614,32.38368,-3.34971,-2.49165,-19.59278,-18.67098,8.78428,4.02039,-12.01230,-0.74075,-1.26523,-4.41983,d00


In [44]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [49]:
# step = 20
# overlap = 1

# alt.Chart(timbre_avgs, height=step).mark_line(
#     interpolate="monotone", fillOpacity=0.8, stroke="lightgray", strokeWidth=0.5
# ).encode(
#     alt.X(
#         "TimbreAvg2:Q",
#         bin=True,      
#         title="Average Value of Timbre 2",
#     ),
#     alt.Y(
#         "count(TimbreAvg2):Q",
#         scale=alt.Scale(range=[step, -step * overlap]),
#         impute=alt.ImputeParams(value=0),
#         axis=None
#     ),
#     alt.Fill(
#         "mean(TimbreAvg2):Q",
#         legend=None,
#         scale=alt.Scale(domain=[40, 5], scheme="redyellowblue")
#     )
# ).facet(
#     row=alt.Row(
#         'decade:O',
#         title=None,
#         header=alt.Header(labelAngle=0, labelAlign='right', format='%B')
#     )
# ).properties(
#     bounds="flush", title="Timbre2 Average"
# ).configure_facet(
#     spacing=0
# ).configure_view(
#     stroke=None
# ).configure_title(
#     anchor="end"
# )