In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# options
pd.options.display.max_columns = None # show all columns

In [18]:
def load_data():
    # paths to data
    data_dir = os.getcwd() + "/data/"
    X_train_path = data_dir + "X_train.csv"
    X_test_path = data_dir + "X_test.csv"
    y_train_path = data_dir + "y_train.csv"
    regis_data_path = data_dir + "vehicle_registration_data_2018.csv"
    
    # load dataframes
    X_train = pd.read_csv(X_train_path, index_col='idx')
    X_train.drop(X_train.columns[0], axis=1, inplace=True) # drop repeated index column
    y_train = pd.read_csv(y_train_path, usecols=['idx','UNITS'], index_col='idx')
    #regis_data = pd.read_csv(regis_data_path)
    
    return X_train, y_train

## Grouping by unique products

In [4]:
def group_by_product():
    # groupby product skew maybe useful???
    temp = X_train.groupby(['CATEGORY','TIER','SPEED_RATING_CODE','RIM_DIAMETER_SIZE_CODE']).groups
    for k,values in temp.items():
        print ("Product Skew: ",k)
        for v in values:
            example = X_train.iloc[v]
            break
        break
    example

### CLEAN Null values

In [5]:
# Replace NaNs with 0
def clean_X_train(X_train):
    X_train.fillna(0, inplace=True)
    X_train.replace([-np.inf, np.inf, np.NaN],0, inplace=True)

### K-means clustering 

In [6]:
def cluster_it():
    # Drop rows containing outliers in their numerical columns if desired
    #print ("Dropping outliers...")
    #df = df[(np.abs(stats.zscore(df[cols_to_convert])) < 3).all(axis=1)]

    # need the number of labels for categorization
    num_labels = 5

    # Fit a kmeans model to the column
    #mat = df[col].values.reshape(-1,1) # convert dataframe col to matrix
    mat = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':].values

    km = KMeans(n_clusters=num_labels)
    km.fit(mat)
    labels = km.labels_ # Get cluster assignment labels

    sales_cols = ['Monthly_Top_{}_Customer_Total_Sales'.format(i) for i in range(1,11)]
    
    # Format results as a DataFrame
    sales_n_labels = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':]
    sales_n_labels['Clusters'] = labels
    #sales_n_labels.head()

## Random Forest Regressor

In [7]:
def rfr_model(X, y):
# Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10,50,100),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=6)
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    
    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"], random_state=False, verbose=False)
    # Perform K-Fold CV
    scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')

    return rfr, scores

In [19]:
X, y = load_data()
clean_X_train(X)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,shuffle=True)

  mask |= (ar1 == a)


(2019036, 32)


In [9]:
# param search for a good model on the numerical cols
num_cols = list(X_train.select_dtypes(include=[np.float64]).columns.values)
rfr, score = rfr_model(X_train[num_cols],y_train['UNITS'])

In [10]:
rfr.fit(X_train[num_cols],y_train)
#rfr.feature_importances_

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=False, verbose=False,
           warm_start=False)

In [11]:
rfr.feature_importances_

array([0.02531786, 0.0468197 , 0.09865013, 0.21840244, 0.00256366,
       0.39532169, 0.00249826, 0.0014098 , 0.00470695, 0.00299674,
       0.00181036, 0.00448983, 0.00346484, 0.00707904, 0.00456061,
       0.00667375, 0.01462937, 0.02736509, 0.00748834, 0.02128315,
       0.01237328, 0.01667222, 0.01313375, 0.0099905 , 0.02237902,
       0.02791963])

In [27]:
X_train.head()

Unnamed: 0_level_0,DC_ZIPCODE,Invoice_Year,Invoice_Week,CATEGORY,TIER,SPEED_RATING_CODE,RIM_DIAMETER_SIZE_CODE,WIDTH,HEIGHT,AVG_UNIT_WEIGHT,Invoice_Month,SELLING_PRICE,Monthly_Top_1_Customer_Zip,Monthly_Top_2_Customer_Zip,Monthly_Top_3_Customer_Zip,Monthly_Top_4_Customer_Zip,Monthly_Top_5_Customer_Zip,Monthly_Top_6_Customer_Zip,Monthly_Top_7_Customer_Zip,Monthly_Top_8_Customer_Zip,Monthly_Top_9_Customer_Zip,Monthly_Top_10_Customer_Zip,Monthly_Top_1_Customer_Total_Sales,Monthly_Top_2_Customer_Total_Sales,Monthly_Top_3_Customer_Total_Sales,Monthly_Top_4_Customer_Total_Sales,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
18726,11717,2016,15,Light Truck,Tier 4,T,17.0,10.71,30.16,38.23,4.0,85.0,11101.0,11756.0,11746.0,11520.0,11435.0,11223.0,11101.0,11358.0,11520.0,11756.0,349532.0,429622.0,284626.0,181112.0,128024.0,142988.0,124788.0,84434.0,144120.0,232804.0
650874,33025,2016,28,Performance Sport,Tier 3,T,15.0,9.47,26.55,26.0,7.0,90.0,33166.0,33142.0,33311.0,33030.0,33186.0,33020.0,33314.0,33023.0,33050.0,33316.0,1288290.0,242014.0,251076.0,185656.0,139324.0,209720.0,172454.0,128658.0,143926.0,64534.0
1149782,65803,2017,18,Passenger Car / Mini-Van,Tier 4,T,15.0,8.15,25.47,20.65,5.0,46.833333,65807.0,65802.0,65536.0,65109.0,65804.0,72653.0,65803.0,65775.0,65616.0,65065.0,107904.0,149696.0,131898.0,86634.0,79874.0,91136.0,61806.0,86092.0,58190.0,65002.0
165776,21220,2017,50,Light Truck,Tier 2,Q,17.0,10.7,31.6,48.5,12.0,173.75,21220.0,21701.0,21030.0,21704.0,21220.0,21014.0,21234.0,21154.0,21093.0,17015.0,1329688.0,703682.0,319888.0,945746.0,187568.0,231272.0,147976.0,168842.0,114804.0,169780.0
1659506,90221,2016,26,SUV CUV,Tier 2,T,17.0,10.71,30.6,38.0,7.0,146.5,90064.0,90045.0,90248.0,90807.0,90232.0,90280.0,90250.0,90230.0,90028.0,90034.0,161080.0,165518.0,113306.0,267540.0,101814.0,92498.0,79358.0,65598.0,95468.0,72518.0


In [13]:
mse = mean_squared_error(y_test[:num_exs], rfr.predict(X_test[num_cols][:num_exs]))
sqrt(mse)

7.608167769814642

In [24]:
score

array([-4.97362375, -5.04762066, -5.08879917, -4.9137653 , -5.00312944,
       -5.01480042, -4.99337728, -5.00065359, -4.82416934, -4.94811344])

In [22]:
gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

(456460, 34)