### Bibliothèques

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Let pandas display up to 999 columns, so the wide one-hot encoded frames
# built below are shown without column truncation.
pd.set_option('display.max_columns', 999)

### Lecture des données

In [None]:
# Both input files are read as line-delimited JSON (one record per line),
# hence lines=True.
df_labels = pd.read_json(path_or_buf='deezer_data/album_genres_new_releases.json', lines=True)
df_data = pd.read_json(path_or_buf='deezer_data/sampled_geoloc_counters_new_releases.json', lines=True)

### Pré-traitement des données

#### 1) Table Labels

In [None]:
# One-hot encode the genre: every distinct genre_name becomes its own column,
# named exactly after the genre (empty prefix and separator).
binarized_df_labels = pd.get_dummies(
    data=df_labels,
    columns=['genre_name'],
    prefix='',
    prefix_sep='',
)
# Collapse to one row per album by summing the indicator columns, which
# yields a multi-label genre vector for each album_id.
df_multi_labels = binarized_df_labels.groupby('album_id', as_index=False).sum()
df_multi_labels.head(5)

#### 2) Table des données spatio-temporelles

In [None]:
# Drop the time dimension: total streams and users per (album, city) pair.
# The columns are selected with a list (double brackets): the tuple form
# ['nstreams', 'nusers'] was deprecated in pandas 1.0 and raises in pandas 2.0.
df_data_ignoreTime = (
    df_data
    .groupby(['album_id', 'loc_city'], as_index=False)[['nstreams', 'nusers']]
    .sum()
)
# One-hot encode the city, then collapse to one row per album: the counters
# are summed over cities, and each city column ends up 1 when the album has
# rows for that city and 0 otherwise (each (album, city) pair is unique here).
df_data_ignoreTime_binarized = (
    pd.get_dummies(df_data_ignoreTime, columns=['loc_city'], prefix='', prefix_sep='')
    .groupby(['album_id'], as_index=False)
    .sum()
)
df_data_ignoreTime_binarized.head()

#### 3) Jointure

In [None]:
# Join the per-album features with the per-album genre vectors on album_id.
# merge defaults to an inner join, so albums missing from either table are dropped.
final_df = df_data_ignoreTime_binarized.merge(df_multi_labels, on='album_id')
print(final_df.shape)
final_df.head()

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Drop the first column (album_id): it is an identifier, not a feature.
data = final_df.values[:, 1:]

# final_df holds the feature-table columns first and the genre columns after.
# The split point is the feature table's width minus one, because that table
# also counted the album_id column removed above.
features, labels = np.split(data, [df_data_ignoreTime_binarized.shape[1] - 1], axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, random_state=1234, test_size=0.2
)

In [None]:
# random_state is fixed so that the fitted tree is reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=1234, criterion='entropy')
clf.fit(X_train, y_train)
# Report the score on the held-out split.
print("accuracy of a decision Tree ", clf.score(X_test, y_test))

In [None]:
# Peek at the raw counters ordered by album, then by the 'd' column.
df_data.sort_values(['album_id', 'd']).head(5)

# Analyse des données

1/ Le nombre moyen d'écoutes d'un album par genre

In [None]:
# Attach genre_name to every counter row through a left join on album_id.
# An album that appears several times in df_labels produces one output row
# per matching label row.
df_labelized_data = df_data.join(
    other=df_labels.set_index('album_id'),
    on='album_id',
    how='left',
)

In [None]:
# Total the numeric columns per (genre, album).
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns implicitly: without it, string columns would be
# concatenated per group and a later mean() over them would fail.
df_nusers_per_album = (
    df_labelized_data
    .groupby(['genre_name', 'album_id'])
    .sum(numeric_only=True)
)

In [None]:
# Average the per-album totals within each genre, sorted by mean users
# in ascending order.
# The two counters are selected and renamed by name instead of deleting 'd'
# and overwriting .columns positionally: the result no longer depends on the
# column order of the input, nor on which other columns survived the sum.
df_meanusers_per_genre = (
    df_nusers_per_album[['nstreams', 'nusers']]
    .groupby(level='genre_name')
    .mean()
    .reset_index()
    .sort_values(by='nusers')
    .rename(columns={'nstreams': 'meanstreams', 'nusers': 'meanusers'})
)

In [None]:
# Horizontal bar chart of the mean number of users per genre, drawn on a
# tall 10x15 figure.
plt.figure(num=1, figsize=(10, 15))
plt.barh(
    y=df_meanusers_per_genre['genre_name'],
    width=df_meanusers_per_genre['meanusers'],
)
plt.show()

In [None]:
# Last five rows of the per-genre means.
df_meanusers_per_genre.tail(5)

In [None]:
# First five rows of the per-genre means.
df_meanusers_per_genre.head(5)

2/ Les top genres par âge

In [None]:
# Total the counters per (age group, genre).
# numeric_only=True keeps the aggregation to numeric columns on pandas >= 2.0,
# where non-numeric columns are no longer dropped implicitly.
# Summed 'd' and 'album_id' values are meaningless, so they are removed.
df_nusers_per_agegroup = (
    df_labelized_data
    .groupby(['age_group', 'genre_name'])
    .sum(numeric_only=True)
    .drop(columns=['d', 'album_id'])
)

In [None]:
# Five genres with the most users within the '15-18' age group.
best_1518_genres = (
    df_nusers_per_agegroup
    .loc['15-18']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

In [None]:
# Display the top genres computed for the '15-18' age group.
best_1518_genres

In [None]:
# Five genres with the most users within the '>55' age group.
best_more55_genres = (
    df_nusers_per_agegroup
    .loc['>55']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

In [None]:
# Display the top genres computed for the '>55' age group.
best_more55_genres

3/ Le top des âges par genre

In [None]:
# Total the counters per (genre, age group), i.e. the same aggregation as
# per (age group, genre) but indexed by genre first.
# numeric_only=True keeps the aggregation to numeric columns on pandas >= 2.0,
# where non-numeric columns are no longer dropped implicitly.
# Summed 'd' and 'album_id' values are meaningless, so they are removed.
df_nusers_per_genre_age = (
    df_labelized_data
    .groupby(['genre_name', 'age_group'])
    .sum(numeric_only=True)
    .drop(columns=['d', 'album_id'])
)

In [None]:
# Age groups with the most users for the 'Rap/Hip Hop' genre (top five).
(
    df_nusers_per_genre_age
    .loc['Rap/Hip Hop']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

In [None]:
# Age groups with the most users for the 'Pop latine' genre (top five).
(
    df_nusers_per_genre_age
    .loc['Pop latine']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

4/ Top genres par ville

In [None]:
# Total the counters per (city, genre).
# numeric_only=True keeps the aggregation to numeric columns on pandas >= 2.0,
# where non-numeric columns are no longer dropped implicitly.
# Summed 'd' and 'album_id' values are meaningless, so they are removed.
df_nusers_per_genre_ville = (
    df_labelized_data
    .groupby(['loc_city', 'genre_name'])
    .sum(numeric_only=True)
    .drop(columns=['d', 'album_id'])
)

In [None]:
# Five genres with the most users in 'Paris'.
(
    df_nusers_per_genre_ville
    .loc['Paris']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

In [None]:
# Five genres with the most users in 'Marseille'.
(
    df_nusers_per_genre_ville
    .loc['Marseille']
    .sort_values(by='nusers', ascending=False)
    .head(5)
)

# Transformation des données

In [None]:
def transformAlbumData(data, nbTemps, nbJours):
    """Transform album data (not implemented yet).

    The original definition had no body, which is a syntax error. This stub
    keeps the definition valid and fails loudly if it is ever called.

    Args:
        data: Input data to transform.
        nbTemps: Presumably a number of time steps -- TODO confirm.
        nbJours: Presumably a number of days -- TODO confirm.

    Raises:
        NotImplementedError: Always, until the transformation is written.
    """
    raise NotImplementedError("transformAlbumData is not implemented yet")

In [None]:
# Sanity check of the date arithmetic: parse a YYYYMMDD string and shift it
# by two weeks.
# errors='ignore' was removed: it is deprecated since pandas 2.2, and on a
# parse failure it returned the raw input, so the addition below would have
# failed with an unrelated TypeError instead of a clear parsing error.
pd.to_datetime('20160520', format='%Y%m%d') + pd.Timedelta(days=14)

In [None]:
# Parse the 'd' column as dates in a single vectorized call.
# The previous loop handed each whole row to to_datetime with
# errors='ignore', which returns its input unchanged when parsing fails,
# and it printed every row index along the way.
# NOTE(review): assumes 'd' holds YYYYMMDD values (e.g. 20160520) -- confirm
# against the data before relying on this conversion.
df_data['d'] = pd.to_datetime(df_data['d'], format='%Y%m%d')

In [None]:
# Display df_data to inspect the result of the previous cell.
df_data