## Number of Users
### 1. How many users are in the dataset?
### 2. How many users are active users?

In [117]:
import pandas as pd
import matplotlib.pyplot as plt

def _read_tsv(path):
    """Read a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Load the three source tables.
df_user = _read_tsv('./users.tsv')
df_listen = _read_tsv('./listens.tsv')
df_artist = _read_tsv('./artists.tsv')

# Inner join on profile_id: only users that appear in the listens table survive.
df_join = df_user.merge(df_listen, on='profile_id')

# Distinct profile ids before and after the join.
total_users = len(df_user['profile_id'].unique())
total_active_users = len(df_join['profile_id'].unique())

print("Total Users:" + str(total_users))
print("Total Active Users:" + str(total_active_users))





Total Users:161803
Total Active Users:153115


In [148]:
# Data profiling: number of distinct values in each column of the listens table
for profiled_column in ('artist_seed', 'profile_id', 'tracks_listened_to'):
    print(len(df_listen[profiled_column].unique()))



500
153115
505


## User Age
### 1. Average age of active listeners

In [110]:
# Active users are those whose profile_id appears in the listens table.
active_mask = df_user['profile_id'].isin(df_listen['profile_id'])
# mean() ignores missing ages by default, so no explicit dropna() is needed.
X1 = df_user.loc[active_mask, 'age'].mean()
print(X1)

35.9748424968


### 2. Average age of inactive listeners

In [109]:
# Inactive users are those whose profile_id never appears in the listens table.
inactive_mask = ~df_user['profile_id'].isin(df_listen['profile_id'])
# mean() ignores missing ages by default, so no explicit dropna() is needed.
X2 = df_user.loc[inactive_mask, 'age'].mean()
print(X2)

39.2501940994



### 3. Significance of the Difference in Mean Age

In [189]:
from numpy import sqrt, abs, round
from scipy.stats import norm
from scipy import stats


# Build the active/inactive split once and derive every statistic from it,
# so the membership filter is evaluated a single time.
is_active = df_user['profile_id'].isin(df_listen['profile_id'])

# two samples: ages of active and inactive users, missing values removed
x1_array = df_user.loc[is_active, 'age'].dropna()
x2_array = df_user.loc[~is_active, 'age'].dropna()

# standard deviations
sd1 = x1_array.std()
sd2 = x2_array.std()
# sample sizes (number of non-missing ages in each group)
n1 = x1_array.count()
n2 = x2_array.count()
# means
mu1 = x1_array.mean()
mu2 = x2_array.mean()

# Normality check on each sample. The two sets of data are not normally
# distributed (p-values are ~0). However, as both samples are large, the
# sampling distribution of the mean is approximately normal (central limit
# theorem), so comparing the means with a t-test is still reasonable.
print(stats.normaltest(x1_array))
print(stats.normaltest(x2_array))

# Two-sample t-test on the group means.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test). The
# groups are very different in size; if sd1 and sd2 differ noticeably, Welch's
# test (equal_var=False) would be the safer choice -- confirm before relying
# on the exact statistic.
ttest = stats.ttest_ind(x1_array, x2_array)
print(ttest)

# Our null hypothesis states that there is no difference between the mean ages
# of the two populations. The p-value is very close to zero, below the
# significance level (0.05), so we can reject the null hypothesis.



NormaltestResult(statistic=5172.3018044162272, pvalue=0.0)
NormaltestResult(statistic=168.09726520934271, pvalue=3.1487823926918672e-37)
Ttest_indResult(statistic=-20.090121748002499, pvalue=1.3646342218184833e-89)


## Demographic Visualization

In [144]:
# Top 10 genres per gender, ranked by the number of joined user/listen rows.
# Attach the artist table to every user/listen row via the seed artist id.
df1 = pd.merge(df_join, df_artist, left_on='artist_seed', right_on='artist_id')
x = df1.groupby(['gender', 'genre']).size()
# Group on the first index level (gender) and keep the ten largest counts in
# each group; chain .to_csv("gender.csv") onto this expression to export it.
x.groupby(level=0).nlargest(10)



gender  gender  genre     
FEMALE  FEMALE  Pop           241714
                R&B           174232
                Rap           125931
                Country       103443
                Rock           94083
                Latin          27172
                Religious      24696
                Electronic     11704
                Reggae          3446
                Folk            3281
MALE    MALE    Rock          170009
                Rap           151763
                Country        87988
                Pop            83580
                R&B            62350
                Latin          17710
                Electronic     12860
                Religious      11899
                Reggae          4175
                Blues           2321
dtype: int64

In [145]:
# The code below retrieves top genre by age group
# Decade-wide bin edges; pd.cut builds right-closed intervals (0, 10], (10, 20], ...
# Ages that fall outside (0, 70] land in no bin and become NaN.
bins = [0, 10, 20, 30, 40, 50, 60, 70]
# The binned result replaces the numeric 'age' column of df1. Bin only while
# the column has not been converted yet: pd.cut expects numeric input, so
# re-running this cell on the already-binned column would otherwise fail.
if df1['age'].dtype.name != 'category':
    df1['age'] = pd.cut(df1['age'], bins)
# Count rows per (age group, genre) pair.
y = df1.groupby(['age', 'genre']).size()
# Group on the first index level (age group) and keep the ten largest counts.
y.groupby(level=0).nlargest(10)




age       age       genre     
(10, 20]  (10, 20]  Pop           24851
                    Rap           18836
                    R&B           12837
                    Rock          12084
                    Country        9889
                    Latin          2532
                    Electronic     1399
                    Religious       989
                    Folk            328
                    Reggae          265
(20, 30]  (20, 30]  Rap           85430
                    Pop           76409
                    R&B           59964
                    Rock          55177
                    Country       46168
                    Latin         13400
                    Electronic     7229
                    Religious      5563
                    Reggae         1684
                    Folk           1102
(30, 40]  (30, 40]  Pop           78973
                    Rap           71710
                    R&B           60838
                    Rock          56556
         