In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing

In [2]:
# options
pd.options.display.max_columns = None # show all columns

In [3]:
def load_data():
    # paths to data
    data_dir = os.getcwd() + "/data/"
    X_train_path = data_dir + "train.csv"
    #X_test_path = data_dir + "X_test.csv"
    y_train_path = data_dir + "y_train.csv"
    regis_data_path = data_dir + "vehicle_registration_data_2018.csv"
    
    # load dataframes
    X_train = pd.read_csv(X_train_path, index_col='idx')
    X_train.drop(X_train.columns[0], axis=1, inplace=True) # drop repeated index column
    y_train = pd.read_csv(y_train_path, usecols=['idx','UNITS'], index_col='idx')
    #regis_data = pd.read_csv(regis_data_path)
    
    return X_train, y_train

In [4]:
def load_test():
    # paths to data
    data_dir = os.getcwd() + "/data/"
    X_test_path = data_dir + "test.csv"
    
    # load dataframes
    X_test = pd.read_csv(X_test_path, index_col='idx')
    X_test.drop(X_test.columns[0], axis=1, inplace=True) # drop repeated index column
    
    return X_test

## Grouping by unique products

In [5]:
def group_by_product():
    # groupby product skew maybe useful???
    temp = X_train.groupby(['CATEGORY','TIER','SPEED_RATING_CODE','RIM_DIAMETER_SIZE_CODE']).groups
    for k,values in temp.items():
        print ("Product Skew: ",k)
        for v in values:
            example = X_train.iloc[v]
            break
        break
    example

### CLEAN Null values

In [6]:
# Replace NaNs with 0
def clean_data(X):
    X.fillna(0, inplace=True)
    X.replace([-np.inf, np.inf, np.NaN],0, inplace=True)
    ## transform nominals to numeric codes
    for col in X:
        if X[col].dtype == 'object':
            X[col] = X[col].astype('category').cat.codes
    X[X.select_dtypes(np.int64).columns.values].astype(np.float64, inplace=True)

### K-means clustering 

In [7]:
def cluster_it():
    # Drop rows containing outliers in their numerical columns if desired
    #print ("Dropping outliers...")
    #df = df[(np.abs(stats.zscore(df[cols_to_convert])) < 3).all(axis=1)]

    # need the number of labels for categorization
    num_labels = 5

    # Fit a kmeans model to the column
    #mat = df[col].values.reshape(-1,1) # convert dataframe col to matrix
    mat = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':].values

    km = KMeans(n_clusters=num_labels)
    km.fit(mat)
    labels = km.labels_ # Get cluster assignment labels

    sales_cols = ['Monthly_Top_{}_Customer_Total_Sales'.format(i) for i in range(1,11)]
    
    # Format results as a DataFrame
    sales_n_labels = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':]
    sales_n_labels['Clusters'] = labels
    #sales_n_labels.head()

###

In [8]:
def run_xgbr(X,y):
    print ("Running Random Search CV...")

    
    rsc = RandomizedSearchCV(
        estimator=XGBRegressor(),
        param_distributions={
            'n_estimators': [50,100,150,200,300,500],
            'learning_rate':[0.0001,0.001,0.01,0.1],
            'max_depth':[5,7,9],
            'min_samples_split':[3,5,7],
        },
        cv=5, scoring='neg_mean_squared_error', n_iter=20, verbose=2, n_jobs=7)

    print ("Fitting train data to xgbr...")

    grid_result = rsc.fit(X, y)
    best_params = grid_result.best_params_

    xgbr = XGBRegressor(loss='error', 
                learning_rate=best_params["learning_rate"],
                max_depth=best_params["max_depth"],
                n_estimators=best_params["n_estimators"],
                min_samples_split=best_params["min_samples_split"],
                random_state=7,
                max_features='log2',
                nthread=7,
                silent=0)

    print ("Cross validating the model...")

    scores = cross_val_score(xgbr, X, y, cv=5, scoring='neg_mean_squared_error')
    
    return xgbr, scores, best_params
    

In [9]:
from xgboost import XGBRegressor
# Get data
X, y = load_data()
clean_data(X)
X.drop('UNITS', axis=1,inplace=True)


# split data into train and test sets
seed = 7
test_size = 0.33

# 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed, shuffle=True)
X_train, y_train = X_train[:200000], y_train[:200000]

model, scores, best_params = run_xgbr(X_train,y_train)
print (best_params)

# fit model no training data
#model = XGBRegressor(eval_metric="error", 
#                learning_rate=0.01,
#                max_depth=10,
#                alpha=.5,
#                n_estimators=500,
#                min_samples_split=3,
#                random_state=7,
#                nthread=7,# was 12
#                silent=0)
model.fit(X_train, y_train, verbose=True)

# make predictions for test data
rmse = sqrt(mean_squared_error(y_test, model.predict(X_test)))
rmse
#predictions = [round(value) for value in y_pred]

  mask |= (ar1 == a)


Running Random Search CV...
Fitting train data to xgbr...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed: 16.8min
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed: 77.6min finished


Cross validating the model...
{'n_estimators': 500, 'min_samples_split': 3, 'max_depth': 9, 'learning_rate': 0.1}


6.910110293019332

### Best Params: {'n_estimators': 500, 'min_samples_split': 3, 'max_depth': 9, 'learning_rate': 0.1}

In [13]:
# save it
import pickle
pickle.dump(model, open("super_xgbr.pickle.dat", "wb"))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed, shuffle=True)
X_train, y_train = X_train[:200000], y_train[:200000]
model.fit(X_train,y_train,verbose=True)
sqrt(mean_squared_error(y_test, model.predict(X_test)))

6.910110293019332

In [26]:
# load pickle
with open(os.getcwd()+"/best_xgbr.pickle.dat", "rb") as input_file:
    best_model = pickle.load(input_file)

In [27]:
X_test = load_test()
clean_data(X_test)
result = model.predict(X_test).round()

In [28]:
upload_leaderboard(result)

Submission successful! Your score was 
7.425840591790698


In [16]:
def submit_score(predictions, team_key):
    """
    Submit your predictions for scoring

    Args:
        predictions (DataFrame): Pandas DataFrame containing the following required
            column:
                1. idx (int) - The unique identifier for each observation
                2. predictions (float) - Your predicted value
        team_key (str): Your team's unique identifier

    Returns:
        Response: Flask Response object. See the Response.text field to get the score
            from your latest submission. Your best score will be reflected on the
            leaderboard
    """

    import requests
    import json
    import numpy
    def default(o):
        if isinstance(o, numpy.int64):
            return int(o)
        raise TypeError

    API_ENDPOINT = "http://coe-hackathon-dot-atd-fn-anacoe-dev.appspot.com/submitscore"
    payload = {
        "team_key": team_key,
        "data": predictions.loc[:, ["idx", "predictions"]].to_dict(orient="records")
    }
    resp = requests.post(
        API_ENDPOINT,
        data=json.dumps(payload, default=default),
        headers={'Content-Type': 'application/json'}
    )
    
    if resp.status_code == 404:
        print(resp.json()['error'])
        return None
    
    elif resp.status_code != 200:
        raise ValueError('There was an error processing your request: '
                         '\n{}'.format(resp.text))
        return None
    else:
        score = resp.json()['score']
        print('Submission successful! Your score was \n{}'.format(score))
        return score

def upload_leaderboard(upload_predictions):
    teamkey = '$pbkdf2-sha512$25000$BOA8p/S.N6aUktK6d6415g$5KWTQmlXfm30T4H/c/Vo8Tiacfqd/qst5n3nU9JLSyV3fqZxerFbbCYYwCc3KVlOyv1pnvGJga7CU/CMzM6yNw'
    upload_dataframe = pd.DataFrame()
    upload_dataframe['idx'] = range(upload_predictions.size)
    upload_dataframe['predictions'] = np.array(upload_predictions)
    upload_dataframe['predictions'] = upload_dataframe['predictions']
    upload_dataframe.shape
    submit_score(upload_dataframe[['idx','predictions']],teamkey)