In [43]:
%load_ext autoreload
%autoreload

## Cell-local imports for the display tweaks and the run banner
from IPython.display import display, HTML
import sys
import datetime as dt

## Widen the notebook container to 90% and let tall output areas scroll
display(
    HTML("<style>.container { width:90% !important; }</style>"),
    HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""),
)

## Report the interpreter version used for this run
print("Python: {0}".format(sys.version))

## Record and report when this run started (`start` stays available to later cells)
start = dt.datetime.now()
print("Notebook Last Run Initiated: " + format(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.10.9 | packaged by conda-forge | (main, Jan 11 2023, 15:15:40) [MSC v.1916 64 bit (AMD64)]
Notebook Last Run Initiated: 2023-03-13 19:39:37.515809


In [44]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import sklearn.metrics as sklearn_metrics
from sklearn import mixture

import math

import matplotlib.pyplot as plt

from kneed import KneeLocator

import pandas as pd
import numpy as np

In [45]:
def silhouette_score(X, labels):
    """Silhouette score that tolerates a labelling with a single distinct label.

    Args:
        X: Sample data, forwarded unchanged to scikit-learn.
        labels: Iterable of (hashable) cluster labels, one per sample.

    Returns:
        ``float("NaN")`` when ``labels`` contains exactly one distinct value
        (scikit-learn's own function raises for that input); otherwise the
        value returned by ``sklearn_metrics.silhouette_score(X, labels)``.
    """
    distinct_labels = set(labels)

    # Anything other than exactly one distinct label is delegated untouched,
    # so scikit-learn's own validation still applies to those inputs.
    if len(distinct_labels) != 1:
        return sklearn_metrics.silhouette_score(X, labels)

    return float("NaN")

In [46]:
# Identifier columns; combined with object_features when the categorical
# frame is built later in the notebook.
Keys = [
    'accountid',
    'userid',
]

# Categorical / label columns selected for the exported categorical frame.
# Note: 'ratio_open30' (no second underscore) is the derived label column,
# distinct from the raw numeric 'ratio_open_30'.
object_features = [
    'postal_code',
    'city',
    'county',
    'state',
    'rural_code',
    'key_zips_ind',
    'gender',
    'marital_status',
    'age',
    'usercreationdate',
    'engaged',
    'camera_count',
    'firsttransaction',
    'ratio_open30',
]

# Numeric column names; not referenced by the cells visible in this section.
numeric_features = [
    'datediff_create_trans',
    'total_opens_last_90t180',
    'ratio_open_30',
    'ratio_open_60',
]

# Additional feature names; not referenced by the cells visible in this section.
newfeatures = [
    'opens_90t180',
    'ratio_open_30',
    'ratio_open_60',
]

In [47]:
# Load the raw training extract. low_memory=False has pandas read each column
# in one pass instead of in chunks, avoiding mixed-type inference per column.
data = pd.read_csv(
    'data/train_data.csv',
    low_memory=False,
)

In [48]:
# Basic cleaning, expressed as a single chain:
#   1. replace every missing value with 0 (in all columns),
#   2. keep only rows with at least one smart-garage camera,
#   3. round datediff_create_trans to 0 decimal places,
#   4. renumber the rows 0..n-1 after the filter.
data = (
    data
    .fillna(0)
    .loc[lambda frame: frame.smart_garage_camera_count > 0]
    .assign(datediff_create_trans=lambda frame: np.round(frame.datediff_create_trans, 0))
    .reset_index(drop=True)
)

In [49]:
# Recode the key-zip indicator (1 / 0) into descriptive labels.
#
# Each write goes through one .loc[mask, column] call so it is applied to
# `data` itself. The previous form, data.key_zips_ind[mask] = ..., is chained
# assignment: it writes through a temporary Series, triggers
# SettingWithCopyWarning (hidden here because warnings are silenced), and does
# nothing at all when pandas Copy-on-Write is enabled.
#
# The column is cast to object first so string labels can be stored without
# relying on pandas implicitly upcasting a numeric column.
data['key_zips_ind'] = data['key_zips_ind'].astype(object)
data.loc[data['key_zips_ind'] == 1, 'key_zips_ind'] = 'keyservice'
data.loc[data['key_zips_ind'] == 0, 'key_zips_ind'] = 'nokeyservice'

In [50]:
# Recode gender into prefixed labels. 0 (the fill value used for missing
# entries earlier in the notebook) and 'unknown' both collapse to 'gender_other'.
#
# Writes use .loc[mask, column] rather than data.gender[mask] = ...: the latter
# is chained assignment, which goes through a temporary Series and is a no-op
# under pandas Copy-on-Write.
data.loc[data['gender'] == 0, 'gender'] = 'gender_other'
data.loc[data['gender'] == 'unknown', 'gender'] = 'gender_other'
data.loc[data['gender'] == 'M', 'gender'] = 'gender_m'
data.loc[data['gender'] == 'F', 'gender'] = 'gender_f'

In [51]:
# Recode marital_status into prefixed labels. 0 (the fill value used for
# missing entries earlier in the notebook) and 'unknown' both collapse to
# 'ms_other'; the single-letter codes M / S / B / A get an 'ms_' prefix.
# NOTE(review): the meaning of codes 'B' and 'A' is not visible here.
#
# Writes use .loc[mask, column] rather than data.marital_status[mask] = ...:
# the latter is chained assignment, which goes through a temporary Series and
# is a no-op under pandas Copy-on-Write.
data.loc[data['marital_status'] == 0, 'marital_status'] = 'ms_other'
data.loc[data['marital_status'] == 'unknown', 'marital_status'] = 'ms_other'
data.loc[data['marital_status'] == 'M', 'marital_status'] = 'ms_m'
data.loc[data['marital_status'] == 'S', 'marital_status'] = 'ms_s'
data.loc[data['marital_status'] == 'B', 'marital_status'] = 'ms_b'
data.loc[data['marital_status'] == 'A', 'marital_status'] = 'ms_a'

In [52]:
# Recode rural_code into prefixed labels. 0 (the fill value used for missing
# entries earlier in the notebook) becomes 'rc_unk'; S / U / R get an 'rc_'
# prefix.
#
# Writes use .loc[mask, column] rather than data.rural_code[mask] = ...: the
# latter is chained assignment, which goes through a temporary Series and is a
# no-op under pandas Copy-on-Write.
data.loc[data['rural_code'] == 0, 'rural_code'] = 'rc_unk'
data.loc[data['rural_code'] == 'S', 'rural_code'] = 'rc_s'
data.loc[data['rural_code'] == 'U', 'rural_code'] = 'rc_u'
data.loc[data['rural_code'] == 'R', 'rural_code'] = 'rc_r'

In [53]:
# Derive the 'age' label from age_1stperson_hushld:
#   exactly 0 -> 'age_unk', above 50 -> 'age_senior', (0, 50] -> 'age_adult'.
# Rows matching none of the conditions (negative values) are left unset.
data.loc[data['age_1stperson_hushld'].eq(0), 'age'] = 'age_unk'
data.loc[data['age_1stperson_hushld'].gt(50), 'age'] = 'age_senior'
data.loc[data['age_1stperson_hushld'].between(0, 50, inclusive='right'), 'age'] = 'age_adult'

In [54]:
# Split users on myq_user_creationDate at 9.160061599:
#   at or below the cut-off -> 'established_user', above it -> 'new_user'.
# NOTE(review): the scale/units of myq_user_creationDate and the origin of the
# cut-off value are not visible in this notebook - confirm with the data owner.
data.loc[data['myq_user_creationDate'].le(9.160061599), 'usercreationdate'] = 'established_user'
data.loc[data['myq_user_creationDate'].gt(9.160061599), 'usercreationdate'] = 'new_user'

In [55]:
# Bucket engaged_prob into four 'engaged' labels:
#   [0, 0.91]    -> 'neutral'
#   (0.91, 1.34] -> 'somewhat_engaged'
#   (1.34, 1.75] -> 'engaged'
#   > 1.75       -> 'strongly_engaged'
# Negative values match no bucket and are left unset.
data.loc[data['engaged_prob'].between(0, 0.91), 'engaged'] = 'neutral'
data.loc[data['engaged_prob'].between(0.91, 1.34, inclusive='right'), 'engaged'] = 'somewhat_engaged'
data.loc[data['engaged_prob'].between(1.34, 1.75, inclusive='right'), 'engaged'] = 'engaged'
data.loc[data['engaged_prob'].gt(1.75), 'engaged'] = 'strongly_engaged'

In [56]:
# Label each row by its smart-garage camera count:
#   exactly 1 -> 'single_camera', more than 1 -> 'multi_camera'.
data.loc[data['smart_garage_camera_count'].eq(1), 'camera_count'] = 'single_camera'
data.loc[data['smart_garage_camera_count'].gt(1), 'camera_count'] = 'multi_camera'

In [57]:
# Bucket datediff_create_trans into a 'firsttransaction' label:
#   <= 5     -> 'immediate'
#   (5, 30]  -> 'fast'
#   (30, 60] -> 'delayed'
#   > 60     -> 'slow'
data.loc[data.datediff_create_trans.le(5), 'firsttransaction'] = 'immediate'
data.loc[data.datediff_create_trans.between(5, 30, inclusive='right'), 'firsttransaction'] = 'fast'
data.loc[data.datediff_create_trans.between(30, 60, inclusive='right'), 'firsttransaction'] = 'delayed'
data.loc[data.datediff_create_trans.gt(60), 'firsttransaction'] = 'slow'

In [58]:
# Bucket the numeric ratio_open_30 into the label column 'ratio_open30':
#   <= 0.2     -> low, (0.2, 0.6] -> medium, > 0.6 -> high.
# NOTE(review): the low label uses underscores ('low_frequency_ratio30') while
# the medium/high labels do not; kept as-is because the labels are written to
# the exported CSV.
data.loc[data.ratio_open_30.le(0.2), 'ratio_open30'] = 'low_frequency_ratio30'
data.loc[data.ratio_open_30.between(0.2, 0.6, inclusive='right'), 'ratio_open30'] = 'mediumfrequencyratio30'
data.loc[data.ratio_open_30.gt(0.6), 'ratio_open30'] = 'highfrequencyratio30'

In [59]:
# Independent copy of the identifier columns followed by the categorical
# label columns, in that order.
selected_columns = Keys + object_features
new_data = data[selected_columns].copy()

In [60]:
# Preview the first five rows of the categorical frame.
# NOTE(review): the rendered output includes account/user identifiers and
# postal codes - clear it before sharing the notebook if those are sensitive.
new_data.head(n=5)

Unnamed: 0,accountid,userid,postal_code,city,county,state,rural_code,key_zips_ind,gender,marital_status,age,usercreationdate,engaged,camera_count,firsttransaction,ratio_open30
0,B24CB6E7-C8E1-4B8D-9778-B724F67E05B2,64841C33-4A5A-4E8C-BB61-BDD70F326E90,85226,chandler,maricopa,az,rc_s,keyservice,gender_other,ms_other,age_unk,new_user,strongly_engaged,single_camera,fast,low_frequency_ratio30
1,F51FC1D9-5D37-4D34-AB54-1F41B9BD8C06,F51FC1D9-5D37-4D34-AB54-1F41B9BD8C06,98032,kent,king,wa,rc_u,keyservice,gender_m,ms_m,age_adult,established_user,strongly_engaged,multi_camera,slow,mediumfrequencyratio30
2,2036E661-F89B-46CA-AD81-1AD78CA8C91E,913AEF55-040B-4F53-BA0A-7F136B4368F6,75204,dallas,dallas,tx,rc_u,keyservice,gender_m,ms_m,age_adult,new_user,strongly_engaged,single_camera,immediate,low_frequency_ratio30
3,565A44C3-7F0A-4F50-89AA-7E296917B8FF,7584C122-1E64-47E8-B587-F466A39FD7D4,7922,berkeley_heights,union,nj,rc_s,keyservice,gender_m,ms_s,age_adult,new_user,strongly_engaged,single_camera,immediate,mediumfrequencyratio30
4,677AEC7D-BC45-4D29-B116-3758EA2D15CB,4C70B81B-2A69-4708-A7CE-C1BBCC03E043,40165,shepherdsville,bullitt,ky,rc_s,keyservice,gender_other,ms_other,age_unk,new_user,strongly_engaged,single_camera,immediate,mediumfrequencyratio30


In [61]:
# Display the (row count, column count) of the categorical frame.
new_data.shape

(19661, 16)

In [62]:
# Write the categorical frame to disk without the row index.
new_data.to_csv(
    'data/train_data_cat.csv',
    index=False,
)

In [63]:
# Write the identifiers together with the target column to a separate file,
# again without the row index.
data[['accountid', 'userid', 'target']].to_csv(
    'data/targets.csv',
    index=False,
)