In [1]:
import os, pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_predict, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Notebook display settings: never truncate the column list when a
# DataFrame is rendered (None removes the column limit).
pd.set_option("display.max_columns", None)

In [3]:
def load_data(data_dir=None):
    """Load the training features and target from CSV files.

    Args:
        data_dir (str or path-like, optional): Directory holding
            ``X_train.csv`` and ``y_train.csv``. Defaults to the ``data``
            folder under the current working directory, which is where the
            notebook originally looked.

    Returns:
        tuple: ``(X_train, y_train)`` DataFrames, both indexed by ``idx``.
            ``y_train`` contains only the ``UNITS`` column.
    """
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), "data")

    X_train_path = os.path.join(data_dir, "X_train.csv")
    y_train_path = os.path.join(data_dir, "y_train.csv")

    X_train = pd.read_csv(X_train_path, index_col='idx')
    # The first remaining column is a repeated index column; drop it.
    X_train = X_train.drop(X_train.columns[0], axis=1)
    y_train = pd.read_csv(y_train_path, usecols=['idx', 'UNITS'], index_col='idx')

    return X_train, y_train

In [4]:
def load_test(data_dir=None):
    """Load the test features from ``X_test.csv``.

    Args:
        data_dir (str or path-like, optional): Directory holding
            ``X_test.csv``. Defaults to the ``data`` folder under the current
            working directory, which is where the notebook originally looked.

    Returns:
        DataFrame: Test features indexed by ``idx``.
    """
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), "data")

    X_test = pd.read_csv(os.path.join(data_dir, "X_test.csv"), index_col='idx')
    # The first remaining column is a repeated index column; drop it.
    X_test = X_test.drop(X_test.columns[0], axis=1)

    return X_test

### Submission code

In [5]:
def submit_score(predictions, team_key, timeout=30):
    """
    Submit your predictions for scoring

    Args:
        predictions (DataFrame): Pandas DataFrame containing the following required
            column:
                1. idx (int) - The unique identifier for each observation
                2. predictions (float) - Your predicted value
        team_key (str): Your team's unique identifier
        timeout (float, optional): Seconds to wait for the scoring service
            before the request is aborted. Defaults to 30.

    Returns:
        The ``score`` field of the service's JSON reply when the submission
        succeeds (HTTP 200), or ``None`` when the service answers 404 (its
        error message is printed instead).

    Raises:
        ValueError: If the service answers with any status other than 200 or 404.
    """

    import requests
    import json
    import numpy

    def default(o):
        # json cannot serialise numpy scalars; convert them to built-in numbers.
        if isinstance(o, numpy.integer):
            return int(o)
        if isinstance(o, numpy.floating):
            return float(o)
        raise TypeError(
            'Object of type {} is not JSON serializable'.format(type(o).__name__))

    # NOTE(review): the endpoint is plain http, so the team key travels unencrypted.
    API_ENDPOINT = "http://coe-hackathon-dot-atd-fn-anacoe-dev.appspot.com/submitscore"
    payload = {
        "team_key": team_key,
        "data": predictions.loc[:, ["idx", "predictions"]].to_dict(orient="records")
    }
    resp = requests.post(
        API_ENDPOINT,
        data=json.dumps(payload, default=default),
        headers={'Content-Type': 'application/json'},
        timeout=timeout  # without a timeout a stalled service hangs the kernel
    )

    if resp.status_code == 404:
        print(resp.json()['error'])
        return None

    if resp.status_code != 200:
        raise ValueError('There was an error processing your request: '
                         '\n{}'.format(resp.text))

    score = resp.json()['score']
    print('Submission successful! Your score was \n{}'.format(score))
    return score

def upload_leaderboard(upload_predictions, team_key=None):
    """Wrap predictions in the submission frame and send them for scoring.

    Args:
        upload_predictions (array-like): Predicted values. They are flattened
            and numbered 0..n-1 as ``idx``.
            NOTE(review): this assumes the test set's ``idx`` values are
            exactly 0..n-1 in file order -- confirm against X_test.csv.
        team_key (str, optional): Team identifier. When omitted, the
            ``HACKATHON_TEAM_KEY`` environment variable is used, falling back
            to the key that was previously embedded in this notebook.

    Returns:
        Whatever ``submit_score`` returns (the score, or ``None`` on a 404).
    """
    if team_key is None:
        # NOTE(review): a credential should not live in source control. The
        # literal is kept only so existing calls keep working; set
        # HACKATHON_TEAM_KEY and remove it.
        team_key = os.environ.get(
            'HACKATHON_TEAM_KEY',
            '$pbkdf2-sha512$25000$BOA8p/S.N6aUktK6d6415g$5KWTQmlXfm30T4H/c/Vo8Tiacfqd/qst5n3nU9JLSyV3fqZxerFbbCYYwCc3KVlOyv1pnvGJga7CU/CMzM6yNw')

    predictions = np.asarray(upload_predictions).ravel()
    upload_dataframe = pd.DataFrame({
        'idx': range(predictions.size),
        'predictions': predictions,
    })
    return submit_score(upload_dataframe[['idx', 'predictions']], team_key)

## Grouping by unique products

In [6]:
def group_by_product(X=None):
    """Return one example row for the first product grouping.

    Rows are grouped by CATEGORY, TIER, SPEED_RATING_CODE and
    RIM_DIAMETER_SIZE_CODE. The key of the first group is printed and the
    first row belonging to it is returned.

    Args:
        X (DataFrame, optional): Frame to group. Defaults to the
            notebook-level ``X_train``.

    Returns:
        The first row of the first group (as selected by ``X.loc``), or
        ``None`` when the frame yields no groups.
    """
    if X is None:
        X = X_train

    product_keys = ['CATEGORY', 'TIER', 'SPEED_RATING_CODE', 'RIM_DIAMETER_SIZE_CODE']
    groups = X.groupby(product_keys).groups

    first_group = next(iter(groups.items()), None)
    if first_group is None:
        return None

    key, labels = first_group
    print ("Product Skew: ",key)
    # .groups maps each key to index *labels*, so the lookup must be
    # label-based (.loc); positional .iloc is only right for a 0..n-1 index.
    return X.loc[labels[0]]

### Clean null values

In [7]:
# Keep only float64 columns and replace NaN / +-inf in them with 0
def clean_data(X):
    """Clean a feature frame in place.

    Every column whose dtype is not float64 is dropped (this includes
    integer and string columns), then NaN and +/-infinity in the remaining
    columns are replaced with 0.

    NOTE(review): the original comment spoke of transforming nominals to
    numeric codes, but the code has always dropped them; that behaviour is
    preserved here.

    Args:
        X (DataFrame): Frame to clean. It is modified in place.

    Returns:
        None
    """
    # Collect the columns first so the frame is not mutated while iterating.
    non_float_cols = [col for col in X.columns if X[col].dtype != np.float64]
    X.drop(non_float_cols, axis=1, inplace=True)

    X.replace([-np.inf, np.inf], 0, inplace=True)
    X.fillna(0, inplace=True)

### K-means clustering 

In [8]:
def cluster_it(X=None, num_labels=5, random_state=7):
    """Cluster rows with K-means on the customer-sales columns.

    The features are every column from ``Monthly_Top_1_Customer_Total_Sales``
    to the last column of the frame.
    NOTE(review): assumes all of those columns are numeric -- confirm.

    Args:
        X (DataFrame, optional): Frame to cluster. Defaults to the
            notebook-level ``X_train``.
        num_labels (int, optional): Number of clusters. Defaults to 5.
        random_state (int or None, optional): Seed for K-means so repeated
            runs give the same labels. Defaults to 7, the seed used for the
            train/test split in this notebook.

    Returns:
        DataFrame: A copy of the sales columns with an added ``Clusters``
        column holding each row's cluster label.
    """
    if X is None:
        X = X_train

    # Copy so that adding the label column never writes into a view of X.
    sales_n_labels = X.loc[:, 'Monthly_Top_1_Customer_Total_Sales':].copy()

    km = KMeans(n_clusters=num_labels, random_state=random_state)
    km.fit(sales_n_labels.values)

    sales_n_labels['Clusters'] = km.labels_  # cluster assignment per row
    return sales_n_labels
    #sales_n_labels.head()

## Random Forest Regressor

In [None]:
def rfr_model(X, y):
    """Search random-forest hyper-parameters and cross-validate the winner.

    A randomized search (10 candidates, 3-fold CV) picks ``n_estimators``,
    ``max_depth`` and ``min_samples_split``. A new forest is then built with
    those values and scored with 5-fold cross-validation.

    NOTE(review): the search runs with the default ``max_features`` while
    the returned forest uses ``'log2'``, and the search itself is unseeded,
    so the chosen parameters can vary between runs.

    Args:
        X: Feature matrix.
        y: Target values. A single-column frame is flattened to 1-D.

    Returns:
        tuple: ``(rfr, scores, best_params)`` where ``rfr`` is the
        configured but *unfitted* forest, ``scores`` are the per-fold
        negative mean squared errors, and ``best_params`` is the dict
        selected by the search.
    """
    # scikit-learn expects a 1-D target; a column vector triggers a warning.
    y = np.ravel(y)

    # Perform randomized hyper-parameter search
    print ("Running Random Search CV...")
    rsc = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions={
            'n_estimators': [50,100,150,200,300,500],
            'max_depth':[5,7,9],
            'min_samples_split':[3,5,7],
        },
        cv=3, scoring='neg_mean_squared_error', n_iter=10, verbose=2, n_jobs=7)

    print ("Fitting train data to rfr...")

    grid_result = rsc.fit(X, y)
    best_params = grid_result.best_params_

    # `nthread` / `silent` are not RandomForestRegressor arguments and made
    # the constructor raise TypeError; the scikit-learn names are
    # `n_jobs` / `verbose`.
    rfr = RandomForestRegressor(
                max_depth=best_params["max_depth"],
                n_estimators=best_params["n_estimators"],
                min_samples_split=best_params["min_samples_split"],
                random_state=7,
                max_features='log2',
                n_jobs=7,
                verbose=0)

    print ("Cross validating the model...")

    scores = cross_val_score(rfr, X, y, cv=5, scoring='neg_mean_squared_error')

    return rfr, scores, best_params

In [None]:
# Read the training features/target, clean the features in place and
# discard the columns that are not used for modelling.
X, y = load_data()
clean_data(X)
X.drop(columns=['Invoice_Month', 'RIM_DIAMETER_SIZE_CODE', 'WIDTH', 'HEIGHT'], inplace=True)

# Hold out a third of the rows for evaluation; the fixed seed keeps the
# split reproducible.
seed = 7
test_size = 0.33

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
)

  mask |= (ar1 == a)


In [None]:
# ALREADY FOUND BEST PARAMS
# random_state was the boolean False; the explicit integer seed 0 states the
# intent directly.
# NOTE(review): assumes the installed scikit-learn accepted False as seed 0.
rfr = RandomForestRegressor(max_depth=5, n_estimators=1000, random_state=0, verbose=2)

In [None]:
# param search for a good model on the numerical cols
#rfr, score, best_params = rfr_model(X_train,y_train)

In [None]:
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not warn about a column-vector target.
rfr.fit(X_train, np.ravel(y_train))

  """Entry point for launching an IPython kernel.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 1000


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.0s remaining:    0.0s


building tree 2 of 1000
building tree 3 of 1000
building tree 4 of 1000
building tree 5 of 1000
building tree 6 of 1000
building tree 7 of 1000


In [None]:
# Display the fitted forest's feature importances (one value per feature
# passed to fit).
rfr.feature_importances_

In [None]:
# save pickle (the context manager makes sure the file is flushed and closed)
with open("og_rfr.pickle.dat", "wb") as output_file:
    pickle.dump(rfr, output_file)
# load pickle
# NOTE: only unpickle files you created yourself; pickle.load can run
# arbitrary code.
#with open(os.getcwd()+"/og_rfr.pickle.dat", "rb") as input_file:
#    rfr = pickle.load(input_file)

In [None]:
# Root-mean-squared error of the fitted forest on the hold-out split.
mse = mean_squared_error(y_true=y_test, y_pred=rfr.predict(X_test))
sqrt(mse)

In [None]:
# Predict on the competition test set. It gets its own name so the hold-out
# X_test from train_test_split is not overwritten; otherwise re-running the
# RMSE cell would compare y_test against rows from a different dataset.
X_submit = load_test()
X_submit.drop(['Invoice_Month', 'RIM_DIAMETER_SIZE_CODE','WIDTH','HEIGHT'], axis=1, inplace=True)
clean_data(X_submit)
result = rfr.predict(X_submit).round()

In [None]:
# Send the rounded test-set predictions to the scoring service.
upload_leaderboard(result)