In [1]:
import json
from collections import Counter
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import feature_selection
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.cross_validation import ShuffleSplit, train_test_split
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import f1_score
import sklearn.learning_curve as curves
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from time import time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import normalize

In [2]:
def load_data():
    with open("song.json", 'rt') as inf:
        data = json.load(inf)
    
    data = pd.DataFrame(data)
    data.set_index('id', inplace = True)
    data['time_played'] = pd.to_datetime(data['time_played'])
    data['user_sign_up_date'] = pd.to_datetime(data['user_sign_up_date'])
    
    return data

data = load_data()

In [3]:
data.head()

Unnamed: 0_level_0,song_played,time_played,user_id,user_sign_up_date,user_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


### Q1:

In [4]:
data.groupby('user_state')['user_id'].nunique().nlargest(3)

user_state
New York      23
California    21
Texas         15
Name: user_id, dtype: int64

In [5]:
data.groupby('user_state')['user_id'].nunique().nsmallest(3)

user_state
Arizona        1
Connecticut    1
Idaho          1
Name: user_id, dtype: int64

### Q2:

In [6]:
def  count_hour(df):
    """average play event per hour' as a metric"""
    total_played = df.shape[0]
    first_play_dt = df['time_played'].min()
    last_play_dt = df['time_played'].max()
    duration = last_play_dt - first_play_dt
    duration_hours = duration.total_seconds() / 60.
    hr_average = total_played / duration_hours
    return pd.Series([first_play_dt, last_play_dt, duration, duration_hours, total_played, hr_average],
                    index = ['first_play_dt', 'last_play_dt', 'duration', 'duration_hours', 'total_played', 'hr_average'])


In [7]:
print "top 3 states in user engagement: \n", data.groupby('user_state').apply(count_hour)['hr_average'].nlargest(3)

top 3 states in user engagement: 
user_state
New York      0.011783
California    0.010699
Texas         0.005788
Name: hr_average, dtype: float64


In [8]:
print "bottom 3 states in user engagement: \n", data.groupby('user_state').apply(count_hour)['hr_average'].nsmallest(3)

bottom 3 states in user engagement: 
user_state
Kansas         0.000255
New Mexico     0.000432
Connecticut    0.000511
Name: hr_average, dtype: float64


### Q3:

In [9]:
for key, value in data.groupby('user_state'):
    print key, set(value['user_id'][value['user_sign_up_date'] == min(value['user_sign_up_date'])])

Alabama set([5])
Alaska set([106])
Arizona set([105])
Arkansas set([78])
California set([44, 39])
Colorado set([173, 166])
Connecticut set([127])
Florida set([41, 43])
Georgia set([16, 20])
Idaho set([165])
Illinois set([45])
Indiana set([102])
Iowa set([178])
Kansas set([177])
Kentucky set([34])
Louisiana set([50])
Maryland set([18])
Massachusetts set([15])
Michigan set([13])
Minnesota set([8, 21])
Mississippi set([26, 23])
Missouri set([85])
Nebraska set([134])
New Jersey set([6])
New Mexico set([4])
New York set([27, 10, 19, 12, 22])
North Carolina set([2])
North Dakota set([135])
Ohio set([3])
Oklahoma set([119])
Oregon set([1])
Pennsylvania set([11])
Rhode Island set([174])
South Carolina set([64])
Tennessee set([70])
Texas set([7])
Utah set([29])
Virginia set([142])
Washington set([125])
West Virginia set([60])
Wisconsin set([32])


### Q4

In [10]:
def count_by_song(df):
    return pd.Series(Counter(df['user_id']))

counts_by_songs = data.groupby('song_played').apply(count_by_song)
counts_by_songs = counts_by_songs.unstack(fill_value = 0)

counts_by_songs.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def similarity(df):
    df_normed = normalize(df, axis = 1)
    df_similarity = df_normed.dot(df_normed.T)
    return pd.DataFrame(df_similarity, index = df.index, columns = df.index)

songs_similarity = similarity(counts_by_songs)
songs_similarity.head()

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,1.0,0.235702,0.074536,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.037268,0.318198,0.35322,0.087841,0.0
A Hard Day's Night,0.235702,1.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.074536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.119523,0.0,0.0,1.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.183942,0.0,0.0,0.146845,0.0,0.0
Across The Universe,0.212132,0.1,0.0,0.0,1.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.043437,0.0,0.0


In [12]:
def most_similar_songs(s, topk):
    similar_ones = s.sort_values(ascending = False)[1: topk + 1].index.values
    return pd.Series(similar_ones, index = ['similar # {}'.format(i) for i in xrange(1, topk + 1)])
songs_similarity.apply(most_similar_songs, topk = 1, axis = 1)

Unnamed: 0_level_0,similar # 1
song_played,Unnamed: 1_level_1
A Day In The Life,Come Together
A Hard Day's Night,Come Together
A Saturday Club Xmas/Crimble Medley,GIRL
ANYTIME AT ALL,Can't Buy Me Love
Across The Universe,Revolution
All My Loving,Let It Be
All You Need Is Love,A Day In The Life
And Your Bird Can Sing,All My Loving
BAD BOY,Hey Jude
BALLAD OF JOHN AND YOKO,Golden Slumbers
