## Walkthrough of Data Science - Traveler

### * Goal: Predict the country in which each user will make their first booking, based on basic user profile data.


#### [1] Pre-processing: Assessing and analyzing data, cleaning, transforming and adding new features
#### [2] Learning model: Constructing and testing learning model
#### [3] Post-processing: Creating final predictions


# LAB 1 CODE

In [None]:
##Exploring Traveler data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline 

print("Reading data...")
train_file = "./traveler/train_users_2.csv"
test_file = "./traveler/test_users.csv"
# Row 0 of each CSV is the header; no column is promoted to the index.
df_train = pd.read_csv(train_file, header=0, index_col=None)
df_test = pd.read_csv(test_file, header=0, index_col=None)

# Stack the training rows on top of the test rows so both go through the
# same cleaning steps; the row index is renumbered from 0.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
print("Reading data...completed")

# Parse the two date-like columns, each with its own explicit format string.
print("Fixing timestamps...")
for date_column, date_format in (('date_account_created', '%Y-%m-%d'),
                                 ('timestamp_first_active', '%Y%m%d%H%M%S')):
    df_all[date_column] = pd.to_datetime(df_all[date_column], format=date_format)
print("Fixing timestamps...completed")

# date_first_booking is not kept as a feature.
df_all = df_all.drop('date_first_booking', axis=1)
print("Droped date_first_booking column...")

## Remove outliers function - [1]
def remove_outliers(df, column, min_val, max_val):
    """Replace out-of-range values of ``df[column]`` with NaN, in place.

    A value is treated as an outlier when it is less than or equal to
    ``min_val`` or greater than or equal to ``max_val`` (both bounds are
    themselves excluded). Values that are already NaN stay NaN.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the numeric column to clean.
        min_val: Lower bound; values at or below it become NaN.
        max_val: Upper bound; values at or above it become NaN.

    Returns:
        The same ``df`` object that was passed in.
    """
    col_values = df[column].values
    out_of_range = np.logical_or(col_values <= min_val, col_values >= max_val)
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0, while np.nan
    # is available in every NumPy release.
    df[column] = np.where(out_of_range, np.nan, col_values)
    return df

## Fixing age column - [2]
print("Fixing age column...")
# Ages at or outside the 15..90 bounds are blanked to NaN first.
df_all = remove_outliers(df = df_all, column = 'age', min_val = 15, max_val = 90)
# Assign the filled Series back to the column. Calling fillna(inplace=True)
# on df_all['age'] is chained assignment: it warns on pandas 2.x and leaves
# df_all unchanged once Copy-on-Write is enabled.
df_all['age'] = df_all['age'].fillna(-1)
print("Fixing age column...completed")

# Other column missing value - Fill first_affiliate_tracked column
print("Filling first_affiliate_tracked column...")
# -1 marks "missing" here as well; same assign-back pattern as for age.
df_all['first_affiliate_tracked'] = df_all['first_affiliate_tracked'].fillna(-1)
print("Filling first_affiliate_tracked column...completed")

df_all.head()

# LAB 2 CODE

In [None]:
# Own implementation of One Hot Encoding - Data Transformation
def convert_to_binary(df, column_to_convert, prefix_len=5, max_cat_len=10):
    """Add one 0/1 indicator column to ``df`` per distinct category.

    Each distinct value of ``df[column_to_convert]`` gets a column named
    ``<prefix>_<category>``, where the prefix is the first ``prefix_len``
    characters of the source column name and the category is cleaned
    (spaces and "/" become "_", parentheses and "-" are removed,
    lower-cased) and cut to ``max_cat_len`` characters.

    The source column is left in place; callers drop it themselves.

    NOTE: with truncated names, two different (column, category) pairs can
    produce the same indicator name, in which case the later one overwrites
    the earlier one. Pass ``prefix_len=None`` and/or ``max_cat_len=None``
    to keep names untruncated.

    Args:
        df: DataFrame to extend; it is modified in place.
        column_to_convert: Name of the categorical column to encode.
        prefix_len: Characters of the column name used as prefix
            (default 5); ``None`` keeps the whole name.
        max_cat_len: Characters of the cleaned category used in the name
            (default 10); ``None`` keeps the whole category.

    Returns:
        The same ``df`` object, with the indicator columns added.
    """
    categories = list(df[column_to_convert].drop_duplicates())
    prefix = column_to_convert[:prefix_len]

    for category in categories:
        cat_name = (str(category)
                    .replace(" ", "_")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("/", "_")
                    .replace("-", "")
                    .lower())
        col_name = prefix + '_' + cat_name[:max_cat_len]
        # One vectorised comparison yields the whole 0/1 column.
        df[col_name] = (df[column_to_convert] == category).astype('int64')

    return df

# One Hot Encoding
print("One Hot Encoding categorical data...")
columns_to_convert = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

# Expand each categorical column into indicator columns, then discard the
# original column now that it has been encoded.
for column in columns_to_convert:
    df_all = convert_to_binary(df=df_all, column_to_convert=column).drop(column, axis=1)
print("One Hot Encoding categorical data...completed")

# Add new date related fields - Creating New Features
print("Adding new fields...")
created = df_all['date_account_created']
active = df_all['timestamp_first_active']

# Calendar parts of the account-creation date.
df_all['day_account_created'] = created.dt.weekday
df_all['month_account_created'] = created.dt.month
df_all['quarter_account_created'] = created.dt.quarter
df_all['year_account_created'] = created.dt.year

# Calendar parts (plus hour) of the first-activity timestamp.
df_all['hour_first_active'] = active.dt.hour
df_all['day_first_active'] = active.dt.weekday
df_all['month_first_active'] = active.dt.month
df_all['quarter_first_active'] = active.dt.quarter
df_all['year_first_active'] = active.dt.year

# Whole days between account creation and first activity.
df_all['created_less_active'] = (created - active).dt.days
print("Adding new fields...completed")


# Drop unnecessary columns
print("Droping fields...")
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
# Only drop what is actually present; some columns may already be gone.
present = [column for column in columns_to_drop if column in df_all.columns]
df_all = df_all.drop(present, axis=1)
print("Droping fields...completed")

## Understanding the sessions.csv data
## Loading sessions.csv data
print("Reading sessions data...")
sessions_file = "./traveler/sessions.csv"
# header=0: the first row holds column names; index_col=False: do not use
# the first column as the row index.
df_sessions = pd.read_csv(sessions_file, header = 0,index_col=False)
print("Reading sessions data...completed")

## Cleaning and Transforming the Data
# Determine primary device: for each user, the device type with the largest
# total secs_elapsed.
print("Determing primary device...")
# Keep only the three columns needed for the device analysis.
sessions_device = df_sessions.loc[:, ['user_id', 'device_type', 'secs_elapsed']]
# One row per (user_id, device_type) pair, with secs_elapsed summed.
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False).aggregate(np.sum)
#aggregated_lvl1.head(10)
# Boolean mask: True where the row's total equals that user's maximum total.
# Because this is an equality test, devices tied for the maximum are all
# marked True for the same user.
idx = aggregated_lvl1.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == aggregated_lvl1['secs_elapsed']
#idx.head(10)
# Rows selected by the mask form the primary-device table.
df_sessions_primary = pd.DataFrame(aggregated_lvl1.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
#df_sessions_primary.head(10)
df_sessions_primary.rename(columns = {'device_type':'primary_device', 'secs_elapsed':'primary_secs'}, inplace=True)
#df_sessions_primary.head(10)
# Call user defined One Hot Encoding function to turn primary_device into
# 0/1 indicator columns.
df_sessions_primary = convert_to_binary(df=df_sessions_primary, column_to_convert='primary_device')
#df_sessions_primary.head()
# The text column is no longer needed once it has been encoded.
df_sessions_primary.drop('primary_device', axis=1, inplace=True)
#df_sessions_primary.head()
print("Determing primary device...completed")

# Determine Secondary device: repeat the same selection on what is left
# after the primary-device rows are removed.
print("Determing secondary device...")
# Drop every row that was flagged as a primary device above.
remaining = aggregated_lvl1.drop(aggregated_lvl1.index[idx])
# Preview call; its result is not assigned to anything.
remaining.head()
# Same max-per-user mask as before, now computed over the remaining rows.
idx = remaining.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == remaining['secs_elapsed']
df_sessions_secondary = pd.DataFrame(remaining.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
df_sessions_secondary.rename(columns = {'device_type':'secondary_device', 'secs_elapsed':'secondary_secs'}, inplace=True)
# Encode and then drop the text column, mirroring the primary-device steps.
df_sessions_secondary = convert_to_binary(df=df_sessions_secondary, column_to_convert='secondary_device')
df_sessions_secondary.drop('secondary_device', axis=1, inplace=True)
print("Determing secondary device...completed")

# Determine Counts of Actions - Looping Through the Actions Columns
# Count occurrences of value in a column
def convert_to_counts(df, id_col, column_to_convert):
    """Count, per id, how often each value of a column occurs.

    Builds a wide table with one row per distinct ``id_col`` value and one
    column per distinct ``column_to_convert`` value; each cell holds the
    number of rows of ``df`` with that (id, value) pair, and 0 where the
    pair never occurs. Columns are named ``<column_to_convert>_<value>``
    with the value cleaned (spaces and "/" become "_", parentheses and "-"
    are removed, lower-cased).

    Args:
        df: Long-format DataFrame; it is not modified.
        id_col: Name of the column identifying the entity (row key).
        column_to_convert: Name of the column whose values are counted.

    Returns:
        A new DataFrame indexed by ``id_col`` with one count column per
        distinct value.
    """
    # Work on an explicit copy so the helper 'count' column never touches
    # (or warns about touching) the caller's frame.
    df_counts = df.loc[:, [id_col, column_to_convert]].copy()
    df_counts['count'] = 1
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()

    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)

    # Build the whole rename mapping first and apply it once. Renaming one
    # category at a time could rename an already-renamed column again when
    # a new name happens to equal another raw category value.
    renames = {}
    for category in df[column_to_convert].drop_duplicates():
        cat_name = (str(category)
                    .replace(" ", "_")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("/", "_")
                    .replace("-", "")
                    .lower())
        renames[category] = column_to_convert + '_' + cat_name

    return new_df.rename(columns=renames)

# Aggregate and combine actions taken columns
print("Aggregating actions taken...")
columns_to_convert = ['action', 'action_type', 'action_detail']
# Missing entries are given the explicit label 'not provided'.
session_actions = df_sessions.loc[:, ['user_id'] + columns_to_convert].fillna('not provided')

# Build one count table per action column and join them side by side,
# keeping only the user ids present in every table.
actions_data = None
for column in columns_to_convert:
    print("Converting " + column + " column...")
    current_data = convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)

    if actions_data is None:
        # Nothing accumulated yet: the first table is the starting point.
        actions_data = current_data
    else:
        actions_data = pd.concat([actions_data, current_data], axis=1, join='inner')

# Finally, Combine Data Sets
# [4.1] Device features: align primary and secondary device tables on
# user_id, keeping users that appear in either one.
print("Combining results...")
df_sessions_primary = df_sessions_primary.set_index('user_id')
df_sessions_secondary = df_sessions_secondary.set_index('user_id')
device_data = pd.concat([df_sessions_primary, df_sessions_secondary], axis=1, join="outer")

# [4.2] Session features: device features plus action counts; cells with no
# value after the outer join are set to 0.
combined_results = pd.concat([device_data, actions_data], axis=1, join='outer')
df_sessions_complete = combined_results.fillna(0)

# [4.3] Final feature table: user features joined with session features,
# restricted to ids present in both.
df_all = pd.concat([df_all.set_index('id'), df_sessions_complete], axis=1, join='inner')
print("Combining results...completed")

df_all.head() # Expected output: 5 rows x 720 columns

# LAB 3 CODE

In [None]:
## Working names for the modelling cells below. Plain assignment binds each
## new name to the same DataFrame object; no data is copied.
df_train1, df_test1, df_all1 = df_train, df_test, df_all

In [None]:
## Creating a learning model
from sklearn.preprocessing import LabelEncoder

# Take the labels indexed by user id WITHOUT set_index(inplace=True):
# df_train1 was bound to df_train by plain assignment, so an in-place
# re-index would also alter df_train and make this cell fail with a
# KeyError on 'id' when the cells are run a second time.
train_labels = df_train1.set_index('id')['country_destination']
# Attach the label to the engineered features; the inner join keeps only
# ids present on both sides.
df_train1 = pd.concat([train_labels, df_all1], axis=1, join='inner')

id_train = df_train1.index.values
labels = df_train1['country_destination']

# Label encoding for the categorical data eg: ...NDF -> 7, US -> 10...
le = LabelEncoder()
y = le.fit_transform(labels)
# Feature matrix: everything except the label column.
X = df_train1.drop('country_destination', axis=1)


### Approach 1 to build and test learning model
##### Cross-Validation - Training data split into training and testing data

In [None]:

# train_test_split lives in sklearn.model_selection from scikit-learn 0.18
# on; the old sklearn.cross_validation module was removed in 0.20, so fall
# back to it only on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

## Splitting of training dataset into 70% training data and 30% testing data randomly
# The call is wrapped in parentheses so the assignment can span two lines;
# a bare line break right after "=" is a syntax error.
features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Different classification techniques
## Decision Tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
### Gaussian Naive Bayes
### from sklearn.naive_bayes import GaussianNB
### clf = GaussianNB()

##SVM
##from sklearn import svm
##clf = svm.SVC(kernel="rbf")
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
## Computing accuracy
from sklearn.metrics import accuracy_score
# print() call form matches the rest of the file and runs on Python 2 and 3;
# arguments are given in (y_true, y_pred) order.
print(accuracy_score(labels_test, prediction))

### Approach 2 to build and test learning model
##### Cross-Validation - Training data split into training and testing data

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import decomposition
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
# GridSearchCV lives in sklearn.model_selection from scikit-learn 0.18 on;
# the old sklearn.grid_search module was removed in 0.20.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Grid Search - Used to find best combination of parameters
XGB_model = xgb.XGBClassifier(objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
param_grid = {'max_depth': [5], 'learning_rate': [0.1], 'n_estimators': [5]}

## Note: running the larger grid below can take a significant amount of time (possibly hours).
#param_grid = {'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.3], 'n_estimators': [25, 50]}

# The iid argument is not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, and on the older releases this file targets the default
# already behaved like iid=True.
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy',
                     verbose=10, n_jobs=1, refit=True, cv=3)

# The actual model for complete training data
#model.fit(X, y)
model.fit(features_train, labels_train)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Make predictions
y_pred = model.predict(features_test)
#y_gb = model.predict_proba(features_test)
#y_pred_prob = model.predict_proba(features_test) ##select the 5 best predictions

# Print model report. print() call form matches the rest of the file and
# runs on both Python 2 and Python 3.
print("\nModel Report")
print("Accuracy : %.4g" % accuracy_score(labels_test, y_pred))
#print("Log loss : %.4g" % log_loss(labels_test, y_gb))
#print("AUC Score (Train): %f" % roc_auc_score(labels_test, y_pred_prob))
                    

### Approach 3 to build and test learning model
##### Cross-Validation - Training data split into training, validation and testing data

In [None]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
# train_test_split lives in sklearn.model_selection from scikit-learn 0.18
# on; the old sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import log_loss

# Fix the random state so the splits are repeatable.
random_state=1
# Split the data into train (80%) and test (20%) sets.
XA, X_testA, yA, y_testA = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Split the train part again into training (75%) and validation (25%) sets.
X_trainA, X_validA, y_trainA, y_validA = train_test_split(XA, yA, test_size=0.25, random_state=random_state)

print('Data shape:')
print('X_trainA: %s, X_validA: %s, X_testA: %s \n' %(X_trainA.shape, X_validA.shape, X_testA.shape))

#First-layer classifiers, keyed by the short name printed in the report
clfs = {
    'LR': LogisticRegression(random_state=random_state),
    'SVM': SVC(probability=True, random_state=random_state),
    'RF': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state),
    'GBM': GradientBoostingClassifier(n_estimators=50, random_state=random_state),
    'ETC': ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=random_state),
    'KNN': KNeighborsClassifier(n_neighbors=30),
}

#Class-probability predictions collected per classifier, in loop order
p_valid, p_test = [], []

print('Performance of individual classifiers on X_testA')
print('------------------------------------------------------------')

for nm, clf in clfs.items():
    #Pass 1: fit on the training split, predict the validation split.
    yv = clf.fit(X_trainA, y_trainA).predict_proba(X_validA)
    p_valid.append(yv)

    #Pass 2: refit on training + validation (XA, yA), predict the test split.
    yt = clf.fit(XA, yA).predict_proba(X_testA)
    p_test.append(yt)

    #Report the test-set log loss of this classifier.
    test_logloss = log_loss(y_testA, yt)
    print('{:10s} {:2s} {:1.7f}'.format('%s: ' %(nm), 'logloss  =>', test_logloss))
print('')

#Second-layer inputs: the first-layer probability outputs placed side by side.
XV, XT = np.hstack(p_valid), np.hstack(p_test)

#With Cs=10 the C parameter is picked by cross-validation over a grid of
#10 values on a logarithmic scale between 1e-4 and 1e4.
#Change parameters to see how they affect the final results.
lr = LogisticRegressionCV(
    Cs=10,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1.0,
    max_iter=25,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=random_state,
    solver='lbfgs',
    tol=0.0001)

#Train on the validation-split predictions, score on the test-split ones.
lr.fit(XV, y_validA)
y_lr = lr.predict_proba(XT)
lr_logloss = log_loss(y_testA, y_lr)
print('{:20s} {:2s} {:1.7f}'.format('Log_Reg:', 'logloss  =>', lr_logloss))

#Gradient boosting as an alternative second-layer model
#Early stopping needs an evaluation set to monitor; xgboost raises an error
#when early_stopping_rounds is given without one. Hold out a quarter of the
#second-layer training data for that purpose only (stratified so the class
#mix is kept in both parts -- assumes every class has at least two rows).
XV_fit, XV_stop, yv_fit, yv_stop = train_test_split(
    XV, y_validA, test_size=0.25, random_state=random_state, stratify=y_validA)

#NOTE(review): the name `xgb` is also used for the xgboost module imported
#earlier in this file; it is kept here so later code that refers to this
#fitted model keeps working -- consider renaming both usages together.
xgb = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=10000, objective='multi:softprob', seed=random_state)
#Monitor multiclass log loss (the metric reported below) on the hold-out.
xgb.fit(XV_fit, yv_fit, eval_set=[(XV_stop, yv_stop)], eval_metric='mlogloss',
        early_stopping_rounds=15, verbose=False)
#best_iteration is the zero-based index of the best boosting round, so the
#number of trees to keep is best_iteration + 1.
xgb.n_estimators = xgb.best_iteration + 1
#Refit on all second-layer training data with the chosen number of trees.
xgb.fit(XV, y_validA)
y_gb = xgb.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('XGB_Reg:', 'logloss  =>', log_loss(y_testA, y_gb)))
