In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from a CSV file in the current working directory.
# Later cells read its `label` column (used as the year) and TimbreAvg1..TimbreAvg12.
timbre = pd.read_csv("year_prediction.csv")

In [3]:
# Keep the year label plus the twelve timbre-average columns, then drop the
# 1922 rows (1923 is absent from the data, so the exploration starts at 1924).
timbre_avgs = (
    timbre
    .loc[:, ["label", *(f"TimbreAvg{i}" for i in range(1, 13))]]
    .loc[lambda frame: frame["label"] != 1922]
)

In [4]:
# Mean of every timbre column per year. The 1922 rows were removed in the
# previous cell (1923 is missing from the data), so the first group is 1924.
timbre_avgs_by_year = timbre_avgs.groupby("label", as_index=False).mean()
timbre_avgs_by_year.head()

Unnamed: 0,label,TimbreAvg1,TimbreAvg2,TimbreAvg3,TimbreAvg4,TimbreAvg5,TimbreAvg6,TimbreAvg7,TimbreAvg8,TimbreAvg9,TimbreAvg10,TimbreAvg11,TimbreAvg12
0,1924,36.945466,-136.050156,108.086914,12.35161,-7.730282,1.872898,-32.485724,3.362986,14.375366,8.708374,4.782966,8.06835
1,1925,34.359443,-128.216647,120.177701,19.37404,18.185651,17.01122,-56.728221,7.616783,3.783979,8.838653,6.741246,6.06128
2,1926,32.57212,-126.464657,88.917274,7.843321,-5.841375,13.394603,-31.141998,9.859703,-17.412415,4.974054,0.624522,-2.053934
3,1927,31.229186,-110.711278,59.213804,4.108315,-9.53837,9.985781,-21.01109,13.410433,-25.118038,4.396292,-0.394336,-2.645019
4,1928,31.499581,-94.417082,27.262832,-8.419641,-16.803719,9.830962,-34.226483,8.103334,-2.779955,7.653416,-0.274048,8.57516


## Heatmap: high level trend of Timbre usage across years

In [5]:
# Reshape the per-year averages from wide to long form (one row per
# year / timbre column pair) so the heatmap can encode them directly.
timbre_avgs_long = pd.melt(
    timbre_avgs_by_year,
    id_vars="label",
    var_name="TimbreType",
    value_name="Average Values",
)

In [6]:
# Heatmap of the yearly timbre averages: one column per year, one row per timbre feature.
# An ordinal axis sorts its string values lexicographically by default
# (TimbreAvg1, TimbreAvg10, TimbreAvg11, TimbreAvg12, TimbreAvg2, ...),
# so the numeric order is spelled out explicitly.
timbre_order = [f"TimbreAvg{i}" for i in range(1, 13)]

alt.Chart(timbre_avgs_long).mark_rect().encode(
    x=alt.X('label:O', title='Year'),
    y=alt.Y('TimbreType:O', sort=timbre_order, title='Timbre feature'),
    color=alt.Color('Average Values:Q', title='Average value'),
).properties(title='Average timbre values by year')

### Heatmap Conclusion 
* Probably not the most organized visualization, but it does show some interesting general trends
* Observation 1: Timbre1 is consistently at a relatively high value
* Observation 2: 
    * Timbre2 was very low from the 1920s to the 1950s, but gradually stabilizes
    * Timbre3 was very high from the 1920s to the 1950s, but also gradually stabilizes, actually to a value very similar to Timbre2's
* Observation 3: with the exception of Timbre1, which stays at a relatively high value, the other timbre averages become more and more uniform as time goes on
    * Could this correlate with the stability of society?
    * Surprisingly, nothing significant shows up for 2001 (9/11) --- maybe look more closely at that specific year


## Decade-wise Distribution

#### Dataset Filtering: 1930 ~ 2009 (1930s through 2000s)

In [100]:
### create dataset that shows decade information
def decade_mean(frame, start_year):
    """Return the column-wise mean of one decade's rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``label`` column holding the year; every other
        column is averaged.
    start_year : int
        First year of the decade. Rows with
        ``start_year <= label < start_year + 10`` are included.

    Returns
    -------
    pandas.Series
        Mean of each non-``label`` column over the selected rows.
    """
    in_decade = (frame["label"] >= start_year) & (frame["label"] < start_year + 10)
    return frame[in_decade].drop("label", axis=1).mean()


# Decade name -> first year of that decade, in chronological order.
# The 1920s are excluded; this section covers the 1930s through the 2000s.
decade_starts = {
    "30s": 1930,
    "40s": 1940,
    "50s": 1950,
    "60s": 1960,
    "70s": 1970,
    "80s": 1980,
    "90s": 1990,
    "00s": 2000,
}

decade_dict = {
    name: decade_mean(timbre_avgs, start_year)
    for name, start_year in decade_starts.items()
}

# The per-decade Series stay bound to their original names in case other cells use them.
d30, d40, d50, d60, d70, d80, d90, d00 = decade_dict.values()

In [101]:
# One row per decade (decade name ends up in the `index` column), then
# reshaped to long form: one row per decade / timbre column pair.
decade_tim = (
    pd.DataFrame(decade_dict)
    .T
    .reset_index()
    .melt(id_vars="index", var_name="TimbreType", value_name="Average Values")
)

In [102]:
# Heatmap of the per-decade timbre averages.
# Ordinal axes sort string values lexicographically by default, which would put
# "00s" before "30s" and "TimbreAvg10" before "TimbreAvg2"; both orders are
# therefore given explicitly (decades chronologically, timbre features numerically).
decade_order = ["30s", "40s", "50s", "60s", "70s", "80s", "90s", "00s"]
timbre_order = [f"TimbreAvg{i}" for i in range(1, 13)]

alt.Chart(decade_tim).mark_rect().encode(
    x=alt.X('index:O', sort=decade_order, title='Decade'),
    y=alt.Y('TimbreType:O', sort=timbre_order, title='Timbre feature'),
    color=alt.Color('Average Values:Q', title='Average value'),
).properties(title='Average timbre values by decade')