In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# options
pd.options.display.max_columns = None # show all columns

In [11]:
def load_data(data_dir=None):
    """Load the training features and targets from CSV files.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``X_train.csv`` and ``y_train.csv``.
        Defaults to the ``data`` folder under the current working directory.

    Returns
    -------
    X_train : pandas.DataFrame
        Contents of ``X_train.csv`` indexed by ``idx``, with the first
        remaining column (the repeated index column) removed.
    y_train : pandas.DataFrame
        The ``UNITS`` column of ``y_train.csv`` indexed by ``idx``.
    """
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), "data")

    X_train_path = os.path.join(data_dir, "X_train.csv")
    y_train_path = os.path.join(data_dir, "y_train.csv")

    X_train = pd.read_csv(X_train_path, index_col='idx')
    # After 'idx' becomes the index, the first remaining column is a repeated
    # index column; drop it without mutating in place.
    X_train = X_train.drop(columns=X_train.columns[0])

    y_train = pd.read_csv(y_train_path, usecols=['idx', 'UNITS'], index_col='idx')

    return X_train, y_train

## Grouping by unique products

In [4]:
def group_by_product(df=None):
    """Return one example row for the first product grouping.

    Rows are grouped by CATEGORY, TIER, SPEED_RATING_CODE and
    RIM_DIAMETER_SIZE_CODE. The key of the first group is printed and the
    first row belonging to it is returned.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to group. Defaults to the notebook-level ``X_train``.

    Returns
    -------
    pandas.Series or pandas.DataFrame or None
        The row(s) stored under the first index label of the first group
        (a DataFrame if that label is duplicated), or None when there are
        no groups.
    """
    if df is None:
        df = X_train

    product_keys = ['CATEGORY', 'TIER', 'SPEED_RATING_CODE', 'RIM_DIAMETER_SIZE_CODE']
    groups = df.groupby(product_keys).groups

    for key, labels in groups.items():
        print ("Product Skew: ", key)
        if len(labels):
            # .groups maps each key to index *labels*, so the lookup must be
            # label-based (.loc); a positional .iloc picks the wrong row or
            # raises when the index is not 0..n-1.
            return df.loc[labels[0]]
        break
    return None

### Clean null and infinite values

In [13]:
# Replace NaNs with 0
def clean_X_train(X_train):
    """Replace NaN and +/-infinity with 0, modifying ``X_train`` in place.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to clean; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``X_train`` object, returned for convenience.
    """
    X_train.fillna(0, inplace=True)
    # NaNs are already gone after fillna, so only the infinities remain.
    # (The ``np.NaN`` alias used previously was removed in NumPy 2.0.)
    X_train.replace([-np.inf, np.inf], 0, inplace=True)
    return X_train

### K-means clustering 

In [7]:
def cluster_it(df=None, num_labels=5, random_state=None):
    """Cluster rows with k-means and return the clustered columns plus labels.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to cluster. Defaults to the notebook-level ``X_train``.
    num_labels : int, default 5
        Number of k-means clusters.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``; set it for reproducible labels.

    Returns
    -------
    pandas.DataFrame
        A copy of the clustered columns with an added ``Clusters`` column
        holding each row's cluster assignment.
    """
    if df is None:
        df = X_train

    # NOTE: this label slice takes every column from
    # 'Monthly_Top_1_Customer_Total_Sales' through the LAST column of the
    # frame, so all of them must be numeric for KMeans.
    # .copy() keeps the 'Clusters' assignment below from touching the source frame.
    sales_n_labels = df.loc[:, 'Monthly_Top_1_Customer_Total_Sales':].copy()

    km = KMeans(n_clusters=num_labels, random_state=random_state)
    km.fit(sales_n_labels.values)

    sales_n_labels['Clusters'] = km.labels_  # cluster assignment per row
    return sales_n_labels

## Random Forest Regressor

In [8]:
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [9]:
def rfr_model(X, y, param_grid=None, n_jobs=6, random_state=0):
    """Tune a random forest regressor by grid search, then cross-validate it.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to ``max_depth`` in 3..6
        and ``n_estimators`` in (10, 50, 100, 1000).
    n_jobs : int, default 6
        Parallel jobs used by the grid search.
    random_state : int or None, default 0
        Seed for the forests, used both during the grid search and for the
        final model so that results are reproducible.

    Returns
    -------
    numpy.ndarray
        The 10-fold cross-validation scores (negative mean absolute error)
        of a forest built with the best grid-search parameters.
    """
    if param_grid is None:
        param_grid = {
            'max_depth': range(3, 7),
            'n_estimators': (10, 50, 100, 1000),
        }

    # Grid search selects hyper-parameters by negative MSE over 5 folds.
    # refit=False: only best_params_ is needed, so skip the final refit.
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=n_jobs,
        refit=False)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_

    # Unpack every tuned parameter so a custom param_grid works as well.
    rfr = RandomForestRegressor(**best_params, random_state=random_state, verbose=0)

    # The final model is scored by negative MAE over 10 folds.
    scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')

    return scores

In [14]:
# Load and clean the training data, then evaluate the random forest on the
# float64 feature columns, using only the first 50,000 rows.
X_train, y_train = load_data()
clean_X_train(X_train)

num_cols = X_train.select_dtypes(include=[np.float64]).columns.tolist()
rfr_model(X_train[num_cols].iloc[:50000], y_train['UNITS'].iloc[:50000])

  mask |= (ar1 == a)


array([-6.06386053, -7.02158898, -6.72827336, -5.2768255 , -5.30859079,
       -5.05447867, -4.43408221, -4.73084366, -4.06220471, -5.08592357])

(2019036, 32)