In [115]:
# load packages 
import os 
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
from altair_saver import save
import seaborn as sns
import datetime
from vega_datasets import data

In [116]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
# Read the song-level table and expose its `label` column under the name `year`.
timbre_path = os.path.join('data', 'year_prediction.csv')
timbre_avg = pd.read_csv(timbre_path)
timbre_avg = timbre_avg.rename(columns={'label': 'year'})
timbre_avg.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,...,TimbreCovariance69,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


## Heatmap: high level trend of Timbre usage across years

In [118]:
# Names of the twelve timbre-average columns in numeric order (reused to sort chart axes).
timbre_cols = ['TimbreAvg' + str(n) for n in range(1, 13)]
# Load the per-year table stored under preprocess/.
timbre_avg_by_year = pd.read_csv('preprocess/timbre_avg_by_year.csv')
timbre_avg_by_year.head()

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12,decade
0,1922,41.563847,13.908623,14.684642,24.427253,18.752738,11.17328,-5.309713,0.136818,-2.16483,-4.444308,-1.621852,-5.560913,d20
1,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835,d20
2,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128,d20
3,1926,32.57212,-126.464657,88.917274,7.843321,-5.841375,13.394603,-31.141998,9.859703,-17.412415,4.974054,0.624522,-2.053934,d20
4,1927,31.229186,-110.711278,59.213804,4.108315,-9.53837,9.985781,-21.01109,13.410433,-25.118038,4.396292,-0.394336,-2.645019,d20


In [119]:
# Reshape to long form (one row per year/feature pair) for the heatmap;
# the `decade` column is dropped first so it is not melted as a feature.
timbre_avg_long = pd.melt(
    timbre_avg_by_year.drop(columns=['decade']),
    id_vars="year",
    var_name="TimbreType",
    value_name="Average Values",
)

In [120]:
# Heatmap: one column per year, one row per timbre feature, colour = average value.
yearly_timbre = (
    alt.Chart(timbre_avg_long)
    .mark_rect()
    .encode(
        alt.X('year:O'),
        alt.Y('TimbreType:O', sort=timbre_cols),   # keep TimbreAvg1..12 in numeric order
        alt.Color('Average Values:Q'),
        tooltip=['year', 'TimbreType', 'Average Values'],
    )
    .properties(
        title='Average Timbre Features by Years',
        width=2000,
        height=300,
    )
)
yearly_timbre

### Heatmap Conclusion 
* Probably not the most organized visualization, but it does show some interesting general trends
* Observation 1: Timbre1 is consistently at a relatively high value
* Observation 2: 
    * Timbre2 was very low from the 1920s to the 1950s, but gradually stabilizes
    * Timbre3 was very high from the 1920s to the 1950s, but also gradually stabilizes, to a value very similar to Timbre2's, actually
* Observation 3: with the exception of Timbre1, which stays at a relatively high value, all other timbre features become more and more uniformly distributed as time goes on
    * does this correlate with the stability of society??
    * surprisingly: nothing significant for 2001 (9/11) --- maybe look more into the specific year to explore


## Decade-wise Distribution

#### Dataset Filtering: 1930 ~ 2010

In [121]:
# Load the per-decade table and reshape it to long form (one row per decade/feature pair).
timbre_avg_by_decade = pd.read_csv('preprocess/timbre_avg_by_decade.csv')
timbre_avg_by_decade_long = pd.melt(
    timbre_avg_by_decade,
    id_vars="decade",
    var_name='TimbreType',
    value_name='Average Values',
)

In [122]:
# Heatmap of the timbre feature averages per decade.
# Explicit x-axis order: 'd20' ... 'd90' first, then 'd00' and 'd10'.
decade_seq = ['d' + str(tens) + '0' for tens in (2, 3, 4, 5, 6, 7, 8, 9, 0, 1)]
(
    alt.Chart(timbre_avg_by_decade_long)
    .mark_rect()
    .encode(
        alt.X('decade:O', sort=decade_seq),
        alt.Y('TimbreType:O', sort=timbre_cols),
        alt.Color('Average Values:Q'),
        tooltip=['decade', 'TimbreType', 'Average Values'],
    )
    .properties(title='Average Timbre Features by Decade')
)

In [9]:
# TODO: add observations to the decade scale  (less urgent)

#### Heatmap of Correlation

In [123]:
# Pairwise correlation (pandas' default method) between the 12 timbre-average
# columns, computed over the song-level table.
# Columns are selected by name rather than by position so the result does not
# silently change if the column order of the CSV changes.
# TODO: plot only the lower (or upper) triangle, since the matrix is symmetric.
audio = timbre_avg[timbre_cols]
corr = audio.corr().reset_index()   # reset_index exposes the row labels as an `index` column for melting
corr.head()

Unnamed: 0,index,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12
0,TimbreAvg1,1.0,0.561747,0.245415,0.02227,-0.283635,-0.26782,0.170659,-0.057822,0.218736,0.103704,0.063742,-0.135719
1,TimbreAvg2,0.561747,1.0,0.018479,0.126845,-0.184864,-0.112388,0.115468,0.108088,0.126086,0.371357,-0.087246,-0.058013
2,TimbreAvg3,0.245415,0.018479,1.0,0.152488,-0.131741,0.040808,-0.057642,0.080881,0.043885,-0.089869,0.041447,-0.020443
3,TimbreAvg4,0.02227,0.126845,0.152488,1.0,0.034988,0.321159,0.282162,0.031649,-0.037905,0.165343,0.309592,-0.103036
4,TimbreAvg5,-0.283635,-0.184864,-0.131741,0.034988,1.0,0.015561,-0.10311,-0.013265,-0.217551,-0.104766,0.016072,0.030181


In [124]:
# Long form of the correlation matrix: one row per (feature, feature) pair.
timbre_covs_long = pd.melt(
    corr,
    id_vars="index",
    var_name="TimbreCorr",
    value_name="Corr Values",
)
timbre_covs_long.head()

Unnamed: 0,index,TimbreCorr,Corr Values
0,TimbreAvg1,TimbreAvg1,1.0
1,TimbreAvg2,TimbreAvg1,0.561747
2,TimbreAvg3,TimbreAvg1,0.245415
3,TimbreAvg4,TimbreAvg1,0.02227
4,TimbreAvg5,TimbreAvg1,-0.283635


In [125]:
# Heatmap of the pairwise correlations between the timbre averages.
(
    alt.Chart(timbre_covs_long)
    .mark_rect()
    .encode(
        alt.X('index:O', sort=timbre_cols),
        alt.Y('TimbreCorr:O', sort=timbre_cols),
        alt.Color('Corr Values:Q'),
        tooltip=['index', 'TimbreCorr', 'Corr Values'],
    )
    .properties(title='Average Timbre Correlations')
)

#### Ridgeline Plot for TimbreAvg2 & TimbreAvg3

In [126]:
# Remove Altair's row limit on embedded datasets so charts built from large
# DataFrames do not raise MaxRowsError.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [127]:
def year_to_decade_datetime(year):
    """Return the first year of the decade containing `year`, as a string.

    The result (e.g. ``"1920"`` for 1922) is a plain year string, so it can be
    parsed with the ``"%Y"`` format used by `transform_date`.

    Parameters
    ----------
    year : int, float or str
        Calendar year; anything accepted by `int()`.

    Returns
    -------
    str
        Start year of the decade, e.g. ``"1920"`` or ``"2000"``.
    """
    # Integer arithmetic instead of keeping the first three characters:
    # slicing is only correct when str(year) is exactly four digits long.
    return str(int(year) // 10 * 10)

def transform_date(year):
    """Parse `year` with the "%Y" format and return the resulting datetime.

    Month and day are not part of the format, so they default to January 1st.
    """
    year_text = str(year)
    parsed = datetime.datetime.strptime(year_text, "%Y")
    return parsed

In [128]:
# Replace the `decade` labels with the decade's start year as a string
# (the form `transform_date` can parse).
timbre_avg_by_year["decade"] = timbre_avg_by_year["year"].transform(year_to_decade_datetime)

In [129]:
# Rename the `year` column to `music_year` to avoid confusion with datetime objects.
timbre_avg_by_year = timbre_avg_by_year.rename(columns={"year": "music_year"})

In [130]:
# Preview the frame after the `decade` rewrite and the `year` -> `music_year` rename.
timbre_avg_by_year.head(3)

Unnamed: 0,music_year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12,decade
0,1922,41.563847,13.908623,14.684642,24.427253,18.752738,11.17328,-5.309713,0.136818,-2.16483,-4.444308,-1.621852,-5.560913,1920
1,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835,1920
2,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128,1920


In [133]:
# Ridgeline plot helper: one ridge (small area chart) per value of the time field.
def plot_ridgeline(input_df, time_unit_field, value_col, color_domain=(20, -50)):
    """Build a ridgeline plot of `value_col`, one ridge per year of `time_unit_field`.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source table. It is copied before the time column is converted, so the
        caller's frame is left untouched.
    time_unit_field : str
        Column whose values separate the ridges. Every value is parsed with
        `transform_date`, i.e. it must be a year such as "1920".
    value_col : str
        Column whose binned distribution is drawn in each ridge.
    color_domain : sequence of two numbers, optional
        Domain of the fill colour scale, which is driven by each ridge's mean
        of `value_col`. Defaults to ``(20, -50)``, the previously hard-coded
        domain.

    Returns
    -------
    The faceted and configured Altair chart.
    """
    df = input_df.copy()
    df[time_unit_field] = df[time_unit_field].apply(transform_date)

    step = 30     # height of each ridge
    overlap = 1   # how far a ridge may extend above its own row, in multiples of `step`

    ridgeline = alt.Chart(df, height=step).transform_timeunit(
        # truncate the parsed date to its year; stored under the name "Decade"
        as_="Decade", timeUnit="year", field=time_unit_field
    ).transform_joinaggregate(
        # per-ridge mean, used only for the fill colour
        mean_val=f'mean({value_col})', groupby=["Decade"]
    ).transform_bin(
        ['bin_max', 'bin_min'], value_col
    ).transform_aggregate(
        # number of rows per bin within each ridge
        value='count()', groupby=["Decade", 'mean_val', 'bin_min', 'bin_max']
    ).transform_impute(
        # bins with no rows get a count of 0
        impute='value', groupby=['Decade', 'mean_val'], key='bin_min', value=0
    ).mark_area(
        interpolate='monotone',
        fillOpacity=0.8,
        stroke='lightgray',
        strokeWidth=0.5
    ).encode(
        # axis title follows the plotted column instead of always saying "Timbre 2"
        alt.X('bin_min:Q', bin='binned', title=f'{value_col} Average By Decades'),
        alt.Y(
            'value:Q',
            scale=alt.Scale(range=[step, -step * overlap]),
            axis=None
        ),
        alt.Fill(
            'mean_val:Q',
            legend=None,
            scale=alt.Scale(domain=list(color_domain), scheme='redyellowblue')
        )
    ).facet(
        row=alt.Row(
            'Decade:T',    # only accepts T type: convert things to T type first
            title=None,
            header=alt.Header(labelAngle=0, labelAlign='right', format='%Ys')
        )
    ).properties(
        title=f'{value_col} by Decade (Ridgeline)',
        bounds='flush'
    ).configure_facet(
        spacing=0
    ).configure_view(
        stroke=None
    ).configure_title(
        anchor='end'
    )
    return ridgeline

    
    

In [134]:
# Ridgeline of the per-year TimbreAvg2 averages, one ridge per decade.
plot_ridgeline(timbre_avg_by_year, "decade", "TimbreAvg2")

In [121]:
from preprocess import transform_year_to_decade

# Tag every song with its decade label, computed by the project helper.
timbre_avg['decade'] = timbre_avg['year'].transform(transform_year_to_decade)
# Alternative input for the ridgeline: a 5000-song sample from a single decade.
# test_set = timbre_avg[['year', 'TimbreAvg2']].loc[
#     (1970 < timbre_avg['year']) & (timbre_avg['year'] < 1980)].sample(5000).sort_values('year')# .rename(columns={'year':'label'})
# test_set
timbre_avg.head(3)

Unnamed: 0,year,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,...,TimbreCovariance70,TimbreCovariance71,TimbreCovariance72,TimbreCovariance73,TimbreCovariance74,TimbreCovariance75,TimbreCovariance76,TimbreCovariance77,TimbreCovariance78,decade
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327,d00
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061,d00
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345,d00


In [127]:
# Prototype: per-year ridgeline of TimbreAvg2 for songs released between 1970 and 1980.
# The sample is built here so the cell runs on a fresh kernel; `test_set` and its
# `yearly` field were previously never defined by any executed code.
step = 30   # height of each ridge
overlap = 1

seventies = timbre_avg.loc[
    (1970 < timbre_avg['year']) & (timbre_avg['year'] < 1980),
    ['year', 'TimbreAvg2']
]
test_set = (
    seventies
    # fixed seed so the figure is reproducible; cap n so a small table cannot raise
    .sample(n=min(5000, len(seventies)), random_state=0)
    .sort_values('year')
    # the timeunit transform below reads a date field, so parse the year
    .assign(yearly=lambda frame: frame['year'].apply(transform_date))
)

alt.Chart(test_set, height=step).transform_timeunit(
    as_ = "Year", timeUnit="year", field="yearly"
).transform_joinaggregate(
    # per-ridge mean, used only for the fill colour
    mean_timbre2='mean(TimbreAvg2)', groupby=['Year']
).transform_bin(
    ['bin_max', 'bin_min'], 'TimbreAvg2'
).transform_aggregate(
    value='count()', groupby=['Year', 'mean_timbre2', 'bin_min', 'bin_max']
).transform_impute(
    # bins with no rows get a count of 0
    impute='value', groupby=['Year', 'mean_timbre2'], key='bin_min', value=0
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='lightgray',
    strokeWidth=0.5
).encode(
    alt.X('bin_min:Q', bin='binned', title='Timbre 2 Average By Year'),
    alt.Y(
        'value:Q',
        scale=alt.Scale(range=[step, -step * overlap]),
        axis=None
    ),
    alt.Fill(
        'mean_timbre2:Q',
        legend=None,
        scale=alt.Scale(domain=[10, -20], scheme='redyellowblue')  # adjust color
    )
).facet(
    row=alt.Row(
        'Year:T',    # only accepts T type: convert things to T type first
        title=None,
        # rows are single years here, so no trailing "s" on the label
        header=alt.Header(labelAngle=0, labelAlign='right', format='%Y')
    )
).properties(
    title='Timbre Avg by Year (Ridgeline)',
    bounds='flush'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).configure_title(
    anchor='end'
)

In [49]:
# Load the Seattle weather sample dataset from vega_datasets.
# NOTE(review): no timbre cell visible here uses `source`; this looks like leftover
# scratch work - confirm and remove.
source = data.seattle_weather()

source.head()

In [50]:
# Display the frame (the rendered output is truncated to its first and last rows).
source

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,fog
1457,2015-12-28,1.5,5.0,1.7,1.3,fog
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun
