In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing

In [2]:
# options
pd.options.display.max_columns = None # show all columns

In [3]:
def load_data():
    # paths to data
    data_dir = os.getcwd() + "/data/"
    X_train_path = data_dir + "train_augmented_2.csv"
    #X_test_path = data_dir + "X_test.csv"
    y_train_path = data_dir + "y_train.csv"
    regis_data_path = data_dir + "vehicle_registration_data_2018.csv"
    
    # load dataframes
    X_train = pd.read_csv(X_train_path, index_col='idx')
    X_train.drop(X_train.columns[0], axis=1, inplace=True) # drop repeated index column
    y_train = pd.read_csv(y_train_path, usecols=['idx','UNITS'], index_col='idx')
    #regis_data = pd.read_csv(regis_data_path)
    
    return X_train, y_train

## Grouping by unique products

In [4]:
def group_by_product():
    # groupby product skew maybe useful???
    temp = X_train.groupby(['CATEGORY','TIER','SPEED_RATING_CODE','RIM_DIAMETER_SIZE_CODE']).groups
    for k,values in temp.items():
        print ("Product Skew: ",k)
        for v in values:
            example = X_train.iloc[v]
            break
        break
    example

### CLEAN Null values

In [5]:
# Replace NaNs with 0
def clean_data(X):
    X.fillna(0, inplace=True)
    X.replace([-np.inf, np.inf, np.NaN],0, inplace=True)
    ## transform nominals to numeric codes
    for col in X:
        if X[col].dtype == 'object':
            X[col] = X[col].astype('category').cat.codes
    X[X.select_dtypes(np.int64).columns.values].astype(np.float64, inplace=True)

### K-means clustering 

In [6]:
def cluster_it():
    # Drop rows containing outliers in their numerical columns if desired
    #print ("Dropping outliers...")
    #df = df[(np.abs(stats.zscore(df[cols_to_convert])) < 3).all(axis=1)]

    # need the number of labels for categorization
    num_labels = 5

    # Fit a kmeans model to the column
    #mat = df[col].values.reshape(-1,1) # convert dataframe col to matrix
    mat = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':].values

    km = KMeans(n_clusters=num_labels)
    km.fit(mat)
    labels = km.labels_ # Get cluster assignment labels

    sales_cols = ['Monthly_Top_{}_Customer_Total_Sales'.format(i) for i in range(1,11)]
    
    # Format results as a DataFrame
    sales_n_labels = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':]
    sales_n_labels['Clusters'] = labels
    #sales_n_labels.head()

## Random Forest Regressor

### Train 

In [16]:
def run_gbr(X,y):
    
    print ("Running Grid Search CV...")
    
    gsc = GridSearchCV(
        estimator=GradientBoostingRegressor(),
        param_grid={
            'min_samples_split':[2,3],
            'learning_rate': [0.0001, 0.001],
            'max_depth': range(1,6),
            'n_estimators': (10,50,100,150),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    
    print ("Fitting train data to gsc...")
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    
    gbr = GradientBoostingRegressor(loss='quantile', 
                learning_rate=best_params['learning_rate'],
                max_depth=best_params["max_depth"],
                n_estimators=best_params["n_estimators"],
                random_state=False, verbose=2, max_features='log2')

    print ("Cross validating the model...")
    
    scores = cross_val_score(gbr, X, y, cv=5, scoring='neg_mean_squared_error')

    return gbr, scores, best_params


In [17]:
# Get data
X, y = load_data()
clean_data(X)
y_final = y.copy() # save for submission

  mask |= (ar1 == a)


In [18]:
# NUM EXAMPLES TO USE
num_exs = 50000

In [19]:
X.head()

Unnamed: 0_level_0,DC_ZIPCODE,Invoice_Year,Invoice_Week,CATEGORY,TIER,SPEED_RATING_CODE,RIM_DIAMETER_SIZE_CODE,WIDTH,HEIGHT,AVG_UNIT_WEIGHT,Invoice_Month,SELLING_PRICE,Monthly_Top_1_Customer_Zip,Monthly_Top_2_Customer_Zip,Monthly_Top_3_Customer_Zip,Monthly_Top_4_Customer_Zip,Monthly_Top_5_Customer_Zip,Monthly_Top_6_Customer_Zip,Monthly_Top_7_Customer_Zip,Monthly_Top_8_Customer_Zip,Monthly_Top_9_Customer_Zip,Monthly_Top_10_Customer_Zip,Monthly_Top_1_Customer_Total_Sales,Monthly_Top_2_Customer_Total_Sales,Monthly_Top_3_Customer_Total_Sales,Monthly_Top_4_Customer_Total_Sales,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales,DC_ZIPCODE_POPULATION,population_employed,population_commuting,total_household_income,median_household_income,mean_household_income
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
0,11717,2016,44,4,2,8,13.0,6.89,22.68,14.9,10.0,35.0,11101.0,11756.0,11520.0,11746.0,11358.0,11368.0,11223.0,11203.0,11580.0,11901.0,616508.0,1026564.0,427710.0,574404.0,690830.0,258324.0,235820.0,160756.0,183788.0,114146.0,63207.0,48504.0,31180.0,13587.0,74202.0,93757.0
1,11717,2017,10,4,2,8,13.0,6.89,22.68,14.9,3.0,31.5,11101.0,11756.0,11520.0,11746.0,11358.0,11223.0,11735.0,11229.0,11368.0,11230.0,479672.0,1087224.0,157358.0,342798.0,501052.0,252610.0,107190.0,272140.0,132466.0,127240.0,63207.0,48504.0,31180.0,13587.0,74202.0,93757.0
2,11717,2017,13,4,2,8,13.0,6.89,22.68,14.9,3.0,31.5,11101.0,11756.0,11520.0,11746.0,11358.0,11223.0,11735.0,11229.0,11368.0,11230.0,479672.0,1087224.0,157358.0,342798.0,501052.0,252610.0,107190.0,272140.0,132466.0,127240.0,63207.0,48504.0,31180.0,13587.0,74202.0,93757.0
3,11717,2017,19,4,2,1,13.0,6.89,22.64,13.19,5.0,50.0,11520.0,11101.0,11746.0,11756.0,11358.0,11223.0,11520.0,11229.0,11230.0,10801.0,248734.0,314464.0,295766.0,386680.0,217224.0,187662.0,117520.0,50656.0,55520.0,86772.0,63207.0,48504.0,31180.0,13587.0,74202.0,93757.0
4,11717,2017,25,4,2,8,13.0,6.89,22.68,14.9,6.0,31.5,11101.0,11520.0,11746.0,11368.0,11358.0,11756.0,11223.0,11580.0,11229.0,11230.0,460292.0,260474.0,396232.0,259164.0,378726.0,525926.0,246766.0,155972.0,83740.0,73248.0,63207.0,48504.0,31180.0,13587.0,74202.0,93757.0


In [20]:
y.head()

Unnamed: 0_level_0,UNITS
idx,Unnamed: 1_level_1
0,8.0
1,4.0
2,8.0
3,2.0
4,4.0


In [21]:
# train test split
X_s, y_s = X.iloc[:num_exs], y.iloc[:num_exs]
X_train, X_test, y_train, y_test = train_test_split(X_s,y_s,test_size=0.2,shuffle=True)

In [None]:
# run Model
model, score, best_params = run_gbr(X_train,y_train)

Running Grid Search CV...
Fitting train data to gsc...
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 10.4min


In [None]:
# Fit the GBR to the training data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,shuffle=True)
gbr.fit(X_train,y_train)    

In [None]:
# MSE for GBR
mse = mean_squared_error(y_test[:num_exs], gbr.predict(X_test[:num_exs]))
sqrt(mse)

In [None]:
#result = gbr.predict(X_test[num_cols]).round()
#upload_leaderboard(result)

In [None]:
def submit_score(predictions, team_key):
    """
    Submit your predictions for scoring

    Args:
        predictions (DataFrame): Pandas DataFrame containing the following required
            column:
                1. idx (int) - The unique identifier for each observation
                2. predictions (float) - Your predicted value
        team_key (str): Your team's unique identifier

    Returns:
        Response: Flask Response object. See the Response.text field to get the score
            from your latest submission. Your best score will be reflected on the
            leaderboard
    """

    import requests
    import json
    import numpy
    def default(o):
        if isinstance(o, numpy.int64):
            return int(o)
        raise TypeError

    API_ENDPOINT = "http://coe-hackathon-dot-atd-fn-anacoe-dev.appspot.com/submitscore"
    payload = {
        "team_key": team_key,
        "data": predictions.loc[:, ["idx", "predictions"]].to_dict(orient="records")
    }
    resp = requests.post(
        API_ENDPOINT,
        data=json.dumps(payload, default=default),
        headers={'Content-Type': 'application/json'}
    )
    
    if resp.status_code == 404:
        print(resp.json()['error'])
        return None
    
    elif resp.status_code != 200:
        raise ValueError('There was an error processing your request: '
                         '\n{}'.format(resp.text))
        return None
    else:
        score = resp.json()['score']
        print('Submission successful! Your score was \n{}'.format(score))
        return score

def upload_leaderboard(upload_predictions):
    teamkey = '$pbkdf2-sha512$25000$BOA8p/S.N6aUktK6d6415g$5KWTQmlXfm30T4H/c/Vo8Tiacfqd/qst5n3nU9JLSyV3fqZxerFbbCYYwCc3KVlOyv1pnvGJga7CU/CMzM6yNw'
    upload_dataframe = pd.DataFrame()
    upload_dataframe['idx'] = range(upload_predictions.size)
    upload_dataframe['predictions'] = np.array(upload_predictions)
    upload_dataframe['predictions'] = upload_dataframe['predictions']
    upload_dataframe.shape
    submit_score(upload_dataframe[['idx','predictions']],teamkey)

In [None]:
# (OLD) Gradient Boosting Regressor Model
def gbr_model(X, y):
# Perform Grid-Search
    print ("Starting grid search...")
    gsc = GridSearchCV(
        estimator=GradientBoostingRegressor(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10,50,100),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=6)
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    
    gbr = GradientBoostingRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"], random_state=False, verbose=2)
    # Perform K-Fold CV
    scores = cross_val_score(gbr, X, y, cv=10, scoring='neg_mean_absolute_error')

    return gbr, scores