# In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import math


# Load and clean the raw user tables (training first, then testing).
def _load_users(csv_path):
    """Read a users CSV and apply the cleaning shared by train and test data.

    Steps, in order:
      1. read the CSV at ``csv_path``;
      2. drop the ``date_first_booking`` column;
      3. replace every missing value with the sentinel -1;
      4. keep only rows whose ``age`` lies in [18, 100] or equals the
         -1 sentinel (i.e. age was missing).

    Returns the filtered DataFrame; the original row index is retained.
    """
    users = pd.read_csv(csv_path)
    users = users.drop(['date_first_booking'], axis=1)
    users = users.fillna(-1)
    plausible_age = (users.age >= 18) & (users.age <= 100)
    age_missing = users.age == -1
    return users[plausible_age | age_missing]


train_users = _load_users('train_users.csv')
test_users = _load_users('test_users.csv')

# ===Preprocessing training data===
# `df` accumulates the model features column by column; the encoded target
# (`country_destination`) is appended last so that it is the final column.
df = pd.DataFrame()
le = preprocessing.LabelEncoder()
df['age'] = train_users['age']


#====Timestamp First Active======
# The timestamp is rendered as a string and cut at fixed character offsets
# into six fields (4 + 2 + 2 + 2 + 2 + 2 characters).
# A real list is passed to np.vstack: on Python 3 `map` yields an iterator,
# which np.vstack does not accept.
tfa = np.vstack([
    [ts[:4], ts[4:6], ts[6:8], ts[8:10], ts[10:12], ts[12:14]]
    for ts in train_users['timestamp_first_active'].astype(str)
])
df['tfa_year'] = tfa[:, 0]
df['tfa_month'] = tfa[:, 1]
df['tfa_day'] = tfa[:, 2]
df['tfa_hour'] = tfa[:, 3]
df['tfa_minute'] = tfa[:, 4]
df['tfa_seconds'] = tfa[:, 5]


# Split date_account_created on '-' into its three components.
# NOTE(review): the first/second/third components are labelled
# month/day/year here. If the CSV stores dates as YYYY-MM-DD these column
# names are mislabelled (values are unaffected) -- confirm against the data.
date = np.vstack([
    created.split('-')
    for created in train_users.date_account_created.astype(str)
])
df['month'] = date[:, 0]
df['day'] = date[:, 1]
df['year'] = date[:, 2]


# Dummy encoding
# One indicator column per observed category, prefixed with the source
# column name. All frames are concatenated in a single call (same column
# order as appending them one at a time, without the repeated copies).
feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
dummy_frames = [pd.get_dummies(train_users[feature], prefix=feature) for feature in feats]
df = pd.concat([df] + dummy_frames, axis=1)
df['country_destination'] = le.fit_transform(train_users['country_destination'])

# Integer code -> destination label.
# NOTE(review): this hard-coded table presumably mirrors the codes assigned by
# `le` above (which holds the fitted mapping in `le.classes_`); that only
# holds if all twelve labels occur in the training data -- confirm.
country_dic = {0:'AU', 1:'CA', 2:'DE', 3:'ES', 4:'FR', 5:'GB', 6:'IT', 7:'NDF', 8:'NL', 9:'PT', 10:'US', 11:'other'}

# Split training data: the first 80% of rows (in file order, no shuffling)
# are used for fitting and the remaining 20% as a held-out dev set.
split_at = int(0.8 * len(df))
trdf = df.iloc[:split_at]
devdf = df.iloc[split_at:]

# Result plot
def result_plot(model,Z):
    """Show a bar chart of how often each value of ``Z`` occurs, in percent.

    ``Z`` is stored as the ``destination`` column next to the ``id`` column
    of the module-level ``test_users`` frame (presumably one prediction per
    test user -- verify against the caller). The rows are counted per
    destination, converted to percentages of the total, and plotted as bars
    annotated with their value to two decimals. ``model`` is used as the
    plot title. Returns nothing; the figure is displayed via ``plt.show()``.
    """
    result = pd.DataFrame(columns=['id','destination'])
    result['id'] = test_users['id']
    result['destination'] = Z

    # Row count per destination, then each count as a share of the total.
    counts = result.groupby(['destination']).size()
    total = float(counts.sum())
    shares = counts.groupby(level=0).apply(lambda group: 100*group/total)

    axes = shares.plot(kind = 'bar')
    for bar in axes.patches:
        height = bar.get_height()
        label = str("%.2f" % height) + "%"
        # Nudge the label slightly right of / above the bar's corner.
        axes.annotate(label, (bar.get_x() * 1.005, height * 1.005))
    plt.title(model)
    plt.show()

# Accuracy
def accuracy(Z,pred):
    """Print and return the fraction of positions where ``Z`` and ``pred`` agree.

    Elements are compared pairwise by position (iteration order), so the
    inputs may be lists, arrays or any other sized iterables. The number
    of matches is divided by ``len(Z)``.

    Args:
        Z: sized iterable of reference values.
        pred: iterable of values to compare against ``Z``.

    Returns:
        The accuracy as a float. The same value is printed as
        ``accuracy <value>``.

    Raises:
        ZeroDivisionError: if ``Z`` is empty.
    """
    correct = sum(1 for actual, guess in zip(Z, pred) if actual == guess)
    value = float(correct) / len(Z)
    # Single-argument print call: same output on Python 2 and Python 3.
    print('accuracy %s' % value)
    return value

# Write out
def sub(Z):
    """Return, for every row of ``Z``, the positions of its five largest values.

    Each row is an iterable of scores. Positions are ordered from the
    highest score to the lowest; equal scores keep their original
    left-to-right order. Rows with fewer than five entries yield all of
    their positions.

    Returns a list with one list of integer positions per row of ``Z``.
    """
    top_positions = []
    for row in Z:
        # Stable descending sort of (position, score) pairs by score.
        ranked = sorted(enumerate(row), key=lambda pair: pair[1], reverse=True)
        top_positions.append([position for position, _ in ranked[:5]])
    return top_positions


# NDCG score
def score(lcs,pred):
    """Print and return the mean discounted gain of ranked guesses.

    For row ``i``, ``lcs[i]`` is a ranked list of candidates and
    ``pred[i]`` is the single value it is scored against. A candidate at
    0-based rank ``j`` that equals ``pred[i]`` contributes
    ``1 / log2(j + 2)``; a row with no matching candidate scores 0. With one
    target per row the best achievable row score is 1, so no further
    normalisation is applied.

    Args:
        lcs: iterable of ranked candidate lists, one per row.
        pred: indexable collection holding the target value for each row.

    Returns:
        The mean row score as a float (``nan`` when ``lcs`` is empty).
        The same value is printed.
    """
    dcgs = []
    for i, ranked in enumerate(lcs):
        target = pred[i]
        dcg = 0
        for j, candidate in enumerate(ranked):
            if target == candidate:
                dcg = dcg + (1 / (math.log(j + 2, 2)))
        dcgs.append(dcg)
    # Avoid NumPy's "mean of empty slice" warning; the result is nan either way.
    mean_dcg = np.mean(dcgs) if dcgs else float('nan')
    # Single-argument print call: same output on Python 2 and Python 3.
    print(mean_dcg)
    return mean_dcg


# Convert the train/dev frames to plain arrays once. `.values` is used
# instead of `DataFrame.as_matrix()`, which was deprecated and later removed
# from pandas; `.values` is available in every pandas version.
train_matrix = trdf.values
dev_matrix = devdf.values

# The last column is the encoded target; everything before it is a feature.
X = train_matrix[:, :-1].astype(int)
Y = train_matrix[:, -1].astype(int)
predict = dev_matrix[:, :-1].astype(int)
dev_labels = dev_matrix[:, -1]


# Each model below is fitted on (X, Y), asked for class probabilities on the
# dev features, reduced to its five highest-probability classes per row by
# `sub`, and scored against the dev labels by `score` (which prints the result).

#Logistic Regression
lr = LogisticRegression()
lr.fit(X, Y)
Z = lr.predict_proba(predict)
lcs = sub(Z)
score(lcs, dev_labels)

# Random Forest
rf = RandomForestClassifier(n_estimators=300, criterion='entropy')
rf.fit(X, Y)
Z = rf.predict_proba(predict)
lcs = sub(Z)
score(lcs, dev_labels)
#result_plot('Random Forest',Z)


# Decision Tree Model
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X, Y)
Z = dt.predict_proba(predict)
lcs = sub(Z)
score(lcs, dev_labels)

# feature importance
# Pair every feature column of `df` (all columns except the trailing target)
# with the importance the fitted decision tree `dt` reports for it.
l = list(dt.feature_importances_)
dic = dict(zip(df.columns.values.tolist()[:-1], l))


# Rank from most to least important and print one (name, importance) tuple
# per line. `print(item)` with a single argument produces the same output on
# Python 2 and Python 3.
dic = sorted(dic.items(), key=lambda x: x[1], reverse=True)
for item in dic:
    print(item)


