# Classical Music Kaggle Competition

Author: Zach Xing

In [1]:
import pandas as pd
import numpy as np


### Description
We have received several data files, described below, to use in our model. Our training dataset in train.csv contains account IDs of patrons along with a target label (0 or 1) indicating whether they purchased a subscription for the 2014-15 concert season. Our objective is to predict whether the patrons listed in the test.csv file will purchase subscriptions or not. Be sure that your output file exactly matches the format of the sample_submission.csv file, including the same column headings. You should submit your "soft predictions" (your probabilistic prediction that they will purchase a subscription), NOT discrete 0/1 predictions, since our scoring metric for this competition is AUROC.



### Data

train.csv - the training set containing target labels indicating whether the patrons have purchased a 2014-15 subscription or not

test.csv - the test set of accounts for which we are to make a prediction

sample_submission.csv - a sample submission file in the correct format

account.csv - location info for each patron and donation history

tickets_all.csv - previously purchased tickets by account

subscriptions.csv - previously purchased subscriptions by account

concerts.csv - previous concerts by season.

concerts_2014-15.csv - list of planned concert sets for the 2014-15 season

zipcodes.csv - location and demographic information for zipcodes

In [2]:
# Labelled training accounts and the unlabelled accounts to be scored.
df_train, df_test = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))

# account.csv is read with an explicit Latin-1 encoding
# (presumably it contains characters that are not valid UTF-8 -- confirm).
df_account = pd.read_csv('account.csv', encoding='latin-1')

# Purchase history, concert listings and the zip-code reference table,
# all read with pandas' default settings.
(
    df_tickets,
    df_sub,
    df_concerts,
    df_concerts1415,
    df_zipcodes,
) = (
    pd.read_csv(name)
    for name in (
        'tickets_all.csv',
        'subscriptions.csv',
        'concerts.csv',
        'concerts_2014-15.csv',
        'zipcodes.csv',
    )
)

In [3]:
# Per-account subscription history: for every account.id, count the non-null
# entries in each remaining column of df_sub. The result is indexed by
# account.id; its 'season' column (number of subscription rows with a season
# recorded) is the one used as a model feature later in the notebook.
df_sub_counts = df_sub.groupby('account.id').count()

In [4]:
# Attach the account-level columns to each labelled training row.
# A left join keeps every row of df_train even when no account record matches.
df_train_1 = pd.merge(df_train, df_account, how='left', on='account.id')


In [5]:
# Add the per-account subscription counts to the account-level training frame.
#
# The frame is rebuilt from df_train rather than merged into the existing
# df_train_1, which makes this cell idempotent: merging df_sub_counts into a
# frame that already contains its columns would yield '_x'/'_y' suffixed
# duplicates and drop the plain 'season' column that later cells select.
# Both joins are left joins, so accounts without any subscription history are
# kept and simply get NaN counts.
df_train_1 = (
    df_train
    .merge(df_account, on='account.id', how='left')
    .merge(df_sub_counts, on='account.id', how='left')
)


In [6]:
# Keep only the identifier, the target and the three model features, then
# replace every missing value with 0 (e.g. accounts with no subscription rows).
df_train_1 = df_train_1.loc[
    :,
    ['account.id', 'label', 'amount.donated.2013', 'amount.donated.lifetime', 'season'],
].fillna(value=0)

In [16]:
# split the train data into train and validation
# 20% of the rows are held out as df_val; random_state=18 fixes the shuffle so
# the partition is repeatable.
# NOTE(review): df_train_1 is overwritten with its own training portion, so
# running this cell a second time splits the already-reduced frame again.
# Re-run the preceding feature-building cells before re-executing this one.
from sklearn.model_selection import train_test_split
df_train_1, df_val = train_test_split(df_train_1, test_size=0.2, random_state=18)

In [17]:
# Fit a logistic-regression classifier on the training split only and score it
# on the held-out validation split, using AUROC as the metric.
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, roc_curve

# The same three feature columns are used for fitting and for validation.
feature_cols = ['amount.donated.2013', 'amount.donated.lifetime', 'season']

X = df_train_1[feature_cols]
y = df_train_1['label']

# C is the inverse regularisation strength, so C=1e5 means the model is
# only very weakly regularised.
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X, y)

# "Soft predictions": take the second column of predict_proba, i.e. the
# probability assigned to the second class (label 1 for a 0/1 target).
X_val = df_val[feature_cols]
y_pred = logreg.predict_proba(X_val)[:, 1]

# Validation AUROC -- left as the last expression so the notebook displays it.
roc_auc_score(df_val['label'], y_pred)


0.8558263971462546

In [23]:
# The test file names its key column 'ID'; rename it so it joins on 'account.id'.
df_test = df_test.rename(columns={'ID': 'account.id'})

# Rebuild the same features as for training: account columns, then
# subscription counts, keep the id plus the model inputs, and fill gaps with 0.
feature_names = ['amount.donated.2013', 'amount.donated.lifetime', 'season']
df_test_1 = (
    df_test
    .merge(df_account, on='account.id', how='left')
    .merge(df_sub_counts, on='account.id', how='left')
    .loc[:, ['account.id'] + feature_names]
    .fillna(0)
)

# Soft predictions from the fitted model (second column of predict_proba).
test_scores = logreg.predict_proba(df_test_1[feature_names])[:, 1]

# Reduce to the two submission columns and give them the submission headings.
df_test_1 = (
    df_test_1
    .assign(label=test_scores)
    .loc[:, ['account.id', 'label']]
    .rename(columns={'account.id': 'ID', 'label': 'Predicted'})
)
df_test_1.to_csv('submission_test.csv', index=False)

In [13]:
# Inspect the validation-set soft predictions (predict_proba output) held in y_pred.
y_pred

array([0.01839754, 0.01385252, 0.02460675, ..., 0.01385252, 0.01850354,
       0.01850354])

In [10]:
# Inspect the raw subscription-history table loaded from subscriptions.csv.
df_sub

Unnamed: 0,account.id,season,package,no.seats,location,section,price.level,subscription_tier,multiple.subs
0,001i000000LhyR3,2009-2010,Quartet,2,San Francisco,Premium Orchestra,1.0,1.0,no
1,001i000000NuOeY,2000-2001,Full,2,San Francisco,Orchestra,2.0,2.0,no
2,001i000000NuNvb,2001-2002,Full,2,Berkeley Saturday,Balcony Front,3.0,2.0,no
3,001i000000NuOIz,1993-1994,Quartet,1,Contra Costa,Orchestra,2.0,0.5,no
4,001i000000NuNVE,1998-1999,Full,2,Berkeley Sunday,Balcony Rear,4.0,2.0,no
...,...,...,...,...,...,...,...,...,...
28622,001i000000NuOE8,1994-1995,Full,3,Santa Rosa,Balcony,4.0,3.0,no
28623,001i000000NuPnA,2006-2007,Full,2,Peninsula,Balcony Front,3.0,2.0,no
28624,001i000000Lhyc6,2009-2010,Full,4,San Francisco,Dress Circle,3.0,4.0,no
28625,001i000000NuOhT,1995-1996,Full,2,Santa Rosa,Balcony,4.0,2.0,no
