In [1]:
# load packages 
import os 
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
from altair_saver import save
import seaborn as sns

In [2]:
# Read the raw year-prediction table; its 'label' column is exposed as 'year' for the rest of the notebook.
timbre_path = os.path.join('data', 'year_prediction.csv')
timbre_avg = (
    pd.read_csv(timbre_path)
    .rename(columns=dict(label='year'))
)
timbre_avg.head(5)

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,...,TimbreCovariance69,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


## Heatmap: high level trend of Timbre usage across years

In [3]:
# Names of the 12 TimbreAvg columns in numeric order; reused below to fix the axis order of the charts.
timbre_cols = ['TimbreAvg' + str(idx) for idx in range(1, 13)]

# Load the by-year table saved under preprocess/ instead of recomputing it in this notebook.
timbre_avg_by_year = pd.read_csv('preprocess/timbre_avg_by_year.csv')
timbre_avg_by_year.head(5)

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12,decade
0,1922,41.563847,13.908623,14.684642,24.427253,18.752738,11.17328,-5.309713,0.136818,-2.16483,-4.444308,-1.621852,-5.560913,d20
1,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835,d20
2,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128,d20
3,1926,32.57212,-126.464657,88.917274,7.843321,-5.841375,13.394603,-31.141998,9.859703,-17.412415,4.974054,0.624522,-2.053934,d20
4,1927,31.229186,-110.711278,59.213804,4.108315,-9.53837,9.985781,-21.01109,13.410433,-25.118038,4.396292,-0.394336,-2.645019,d20


In [5]:
# Reshape the by-year table from wide to long (one row per year / timbre column pair) for the heatmap.
# 'decade' is dropped first so it does not get melted into the value column.
timbre_avg_long = (
    timbre_avg_by_year
    .drop(columns='decade')
    .melt(id_vars='year', var_name='TimbreType', value_name='Average Values')
)

In [6]:
# Heatmap with one rectangle per (year, timbre column), colored by the value in 'Average Values'.
# The y axis follows timbre_cols so the rows appear in numeric order (TimbreAvg1 ... TimbreAvg12).
yearly_timbre = (
    alt.Chart(timbre_avg_long)
    .mark_rect()
    .encode(
        alt.X('year:O'),
        alt.Y('TimbreType:O', sort=timbre_cols),
        alt.Color('Average Values:Q'),
        tooltip=['year', 'TimbreType', 'Average Values'],
    )
    .properties(
        title='Average Timbre Features by Years',
        width=2000,
        height=300,
    )
)
yearly_timbre

### Heatmap Conclusion 
* Probably not the most organized visualization, but it does show some interesting general trends
* Observation 1: Timbre1 is consistently at a relatively high value
* Observation 2: 
    * Timbre2 was very low from the 1920s to the 1950s, but gradually stabilizes
    * Timbre3 was very high from the 1920s to the 1950s, but also gradually stabilizes, to a value very similar to Timbre2's, actually
* Observation 3: with the exception of Timbre1, which stays at a relatively high value, all the other timbre features become more and more uniformly distributed as time goes on
    * correlate to stability of society??
    * surprisingly: nothing significant for 2001 (9/11) --- maybe look more into the specific year to explore


## Decade-wise Distribution

#### Dataset Filtering: 1930 ~ 2010

In [8]:
# Load the by-decade table saved under preprocess/ and reshape it to long form
# (one row per decade / column pair), mirroring the by-year heatmap input.
timbre_avg_by_decade = pd.read_csv('preprocess/timbre_avg_by_decade.csv')
timbre_avg_by_decade_long = timbre_avg_by_decade.melt(
    id_vars='decade',
    var_name='TimbreType',
    value_name='Average Values',
)

In [9]:
# Decade labels in chronological order; 'd00' and 'd10' come last so they are not sorted before 'd20'.
decade_seq = ['d20', 'd30', 'd40', 'd50', 'd60', 'd70', 'd80', 'd90', 'd00', 'd10']

# Heatmap with one rectangle per (decade, timbre column), colored by the value in 'Average Values'.
(
    alt.Chart(timbre_avg_by_decade_long)
    .mark_rect()
    .encode(
        alt.X('decade:O', sort=decade_seq),
        alt.Y('TimbreType:O', sort=timbre_cols),
        alt.Color('Average Values:Q'),
        tooltip=['decade', 'TimbreType', 'Average Values'],
    )
    .properties(title='Average Timbre Features by Decade')
)

In [10]:
# TODO: add observations to the decade scale  (less urgent)

#### Heatmap of Correlation (TimbreAvg features)

In [11]:
# TODO: add a heatmap plotting the covariance of the 12 columns (using the year scale, not the decade scale; pick either lower triangular or upper)
# Note from an earlier draft: a frame holding all covariance columns would be 'year' plus every column from position 13 onward.
# For now this cell computes pairwise correlations (not covariances) of the 12 columns at
# positions 1-12 of the raw table, then moves the row labels into an 'index' column for melting.
audio = timbre_avg.iloc[:, slice(1, 13)]
corr = (
    audio
    .corr()
    .reset_index()
)
corr.head(5)

Unnamed: 0,index,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12
0,TimbreAvg1,1.0,0.561747,0.245415,0.02227,-0.283635,-0.26782,0.170659,-0.057822,0.218736,0.103704,0.063742,-0.135719
1,TimbreAvg2,0.561747,1.0,0.018479,0.126845,-0.184864,-0.112388,0.115468,0.108088,0.126086,0.371357,-0.087246,-0.058013
2,TimbreAvg3,0.245415,0.018479,1.0,0.152488,-0.131741,0.040808,-0.057642,0.080881,0.043885,-0.089869,0.041447,-0.020443
3,TimbreAvg4,0.02227,0.126845,0.152488,1.0,0.034988,0.321159,0.282162,0.031649,-0.037905,0.165343,0.309592,-0.103036
4,TimbreAvg5,-0.283635,-0.184864,-0.131741,0.034988,1.0,0.015561,-0.10311,-0.013265,-0.217551,-0.104766,0.016072,0.030181


In [12]:
# Long form of the correlation matrix: one row per (row feature, column feature) pair.
timbre_covs_long = corr.melt(
    id_vars='index',
    var_name='TimbreCorr',
    value_name='Corr Values',
)
timbre_covs_long.head(5)

Unnamed: 0,index,TimbreCorr,Corr Values
0,TimbreAvg1,TimbreAvg1,1.0
1,TimbreAvg2,TimbreAvg1,0.561747
2,TimbreAvg3,TimbreAvg1,0.245415
3,TimbreAvg4,TimbreAvg1,0.02227
4,TimbreAvg5,TimbreAvg1,-0.283635


In [13]:
# Heatmap of the pairwise correlations between the TimbreAvg columns (full square matrix).
# Both axes follow timbre_cols so rows and columns share the same numeric order.
(
    alt.Chart(timbre_covs_long)
    .mark_rect()
    .encode(
        alt.X('index:O', sort=timbre_cols),
        alt.Y('TimbreCorr:O', sort=timbre_cols),
        alt.Color('Corr Values:Q'),
        tooltip=['index', 'TimbreCorr', 'Corr Values'],
    )
    .properties(title='Average Timbre Correlations')
)

#### Ridgeline Plot for TimbreAvg2 & TimbreAvg3

In [14]:
# Turn off Altair's max-rows check so the charts below can be built from larger data frames.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [19]:
from preprocess import transform_year_to_decade

# Tag every row with a decade label via the project helper. This adds a column to
# timbre_avg in place; it is not used in this cell but later cells may read it.
timbre_avg['decade'] = timbre_avg.year.transform(transform_year_to_decade)

# Sample used by the ridgeline plot: rows with year strictly between 1970 and 1980,
# keeping only 'year' and 'TimbreAvg2', sorted by year.
# - random_state pins the draw so Restart & Run All reproduces the same rows.
# - the sample size is capped at the number of matching rows so a smaller filtered
#   set does not raise.
test_set = (
    timbre_avg.loc[
        (1970 < timbre_avg['year']) & (timbre_avg['year'] < 1980),
        ['year', 'TimbreAvg2'],
    ]
    .pipe(lambda rows: rows.sample(n=min(5000, len(rows)), random_state=42))
    .sort_values('year')
)
test_set

Unnamed: 0,year,TimbreAvg2
77524,1971,-1.80602
274520,1971,10.22412
492161,1971,-35.15962
244180,1971,77.16227
132453,1971,-91.04304
...,...,...
236433,1979,-10.22308
441952,1979,78.71596
491561,1979,32.80122
135996,1979,-2.44962


In [20]:
# Ridgeline plot: one binned-count area of TimbreAvg2 per year, stacked as facet rows
# that are allowed to overlap.
step = 30     # height of each facet row
overlap = 1   # how far an area may extend above its own row, in multiples of `step`

alt.Chart(test_set, height=step).transform_joinaggregate(
    # per-year mean, attached to every row; drives the fill color of each ridge
    mean_timbre2='mean(TimbreAvg2)', groupby=['year']
).transform_bin(
    ['bin_max', 'bin_min'], 'TimbreAvg2'
).transform_aggregate(
    # number of sampled tracks per (year, bin)
    value='count()', groupby=['year', 'mean_timbre2', 'bin_min', 'bin_max']
).transform_impute(
    # fill bins with no tracks with 0 so each area is continuous
    impute='value', groupby=['year', 'mean_timbre2'], key='bin_min', value=0
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='lightgray',
    strokeWidth=0.5
).encode(
    alt.X('bin_min:Q', bin='binned', title='Timbre 2 Average Over the Decades'),
    alt.Y(
        'value:Q',
        # negative upper bound lets each area spill into the row above
        scale=alt.Scale(range=[step, -step * overlap]),
        axis=None
    ),
    alt.Fill(
        'mean_timbre2:Q',
        legend=None,
        scale=alt.Scale(domain=[10, -50], scheme='redyellowblue')  # adjust color
    )
).facet(
    row=alt.Row(
        # 'year' holds plain integers (1971, 1972, ...). A temporal (:T) encoding would
        # read them as milliseconds since the epoch, and with a '%B' month format every
        # row header would render as "January". Facet rows accept ordinal fields, so
        # encode the year as ordinal and show its value directly.
        'year:O',
        title=None,
        header=alt.Header(labelAngle=0, labelAlign='right')
    )
).properties(
    title='Timbre Avg by Year (Ridgeline)',
    bounds='flush'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).configure_title(
    anchor='end'
)