In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Notebook-wide display settings.
# A max_columns of None removes the column cap, so wide frames are shown with every column.
pd.set_option("display.max_columns", None)

In [3]:
# Paths to the input files, all located in the "data" folder under the current working directory.
# os.path.join is used instead of gluing "/" separators by hand; the trailing "" component
# keeps the trailing separator on data_dir, so code that appends a file name to it still works.
data_dir = os.path.join(os.getcwd(), "data", "")
X_train_path = os.path.join(data_dir, "X_train.csv")
X_test_path = os.path.join(data_dir, "X_test.csv")
y_train_path = os.path.join(data_dir, "y_train.csv")
regis_data_path = os.path.join(data_dir, "vehicle_registration_data_2018.csv")

In [5]:
# Load the data into frames
# X_train: the first CSV column is used as the row index.
X_train = pd.read_csv(X_train_path, index_col=0)

# y_train: read only the 'idx' and 'UNITS' columns and index the frame by 'idx'.
# It has to be loaded here because the "Explore y_train Data" cell further down reads `y_train`;
# with the load commented out, that cell fails with a NameError on a fresh kernel.
# (No extra column drop is needed: usecols already leaves 'UNITS' as the only data column.)
y_train = pd.read_csv(y_train_path, usecols=['idx','UNITS'], index_col='idx')

# Vehicle registration data, read by the `regis_data.head()` cell at the end of the notebook.
regis_data = pd.read_csv(regis_data_path)

# Explore X_train

In [6]:
# How many rows fall into each CATEGORY value (most frequent first).
X_train.loc[:, 'CATEGORY'].value_counts()

Light Truck                 524200
Performance Sport           509346
Passenger Car / Mini-Van    492519
SUV CUV                     438017
Winter                       44910
Luxury Touring                6417
HT SUV CUV                    1361
AT Mud Utility                 963
Performance Sport Car          906
Standard Passenger             397
Name: CATEGORY, dtype: int64

In [7]:
# List every column of X_train together with its dtype.
X_train.dtypes

idx                                      int64
DC_ZIPCODE                               int64
Invoice_Year                             int64
Invoice_Week                             int64
CATEGORY                                object
TIER                                    object
SPEED_RATING_CODE                       object
RIM_DIAMETER_SIZE_CODE                 float64
WIDTH                                  float64
HEIGHT                                 float64
AVG_UNIT_WEIGHT                        float64
Invoice_Month                          float64
SELLING_PRICE                          float64
Monthly_Top_1_Customer_Zip             float64
Monthly_Top_2_Customer_Zip             float64
Monthly_Top_3_Customer_Zip             float64
Monthly_Top_4_Customer_Zip             float64
Monthly_Top_5_Customer_Zip             float64
Monthly_Top_6_Customer_Zip             float64
Monthly_Top_7_Customer_Zip             float64
Monthly_Top_8_Customer_Zip             float64
Monthly_Top_9

## Grouping by unique products

In [8]:
# Group the rows by product attributes (category, tier, speed rating, rim diameter).
# `.groups` is a mapping from each attribute combination to the index labels of its rows.
temp = X_train.groupby(
    by=[
        'CATEGORY',
        'TIER',
        'SPEED_RATING_CODE',
        'RIM_DIAMETER_SIZE_CODE',
    ]
).groups

In [10]:
# Using the groups built above, peek at one example: the key of the first product group
# and the first row that belongs to it.
k, values = next(iter(temp.items()))
print ("Product Skew: ",k)

# `.groups` stores index *labels*, so the row must be fetched with label-based `.loc`.
# Positional `.iloc` only returns the right row when the index happens to be 0..n-1.
v = values[0]
example = X_train.loc[v]
example

Product Skew:  ('AT Mud Utility', 'Tier 1', 'Q', 17.0)


idx                                           1323781
DC_ZIPCODE                                      73179
Invoice_Year                                     2016
Invoice_Week                                       13
CATEGORY                               AT Mud Utility
TIER                                           Tier 1
SPEED_RATING_CODE                                   Q
RIM_DIAMETER_SIZE_CODE                             17
WIDTH                                             9.8
HEIGHT                                           30.8
AVG_UNIT_WEIGHT                                  50.4
Invoice_Month                                       4
SELLING_PRICE                                   125.5
Monthly_Top_1_Customer_Zip                      73127
Monthly_Top_2_Customer_Zip                      73644
Monthly_Top_3_Customer_Zip                      73401
Monthly_Top_4_Customer_Zip                      73135
Monthly_Top_5_Customer_Zip                      73018
Monthly_Top_6_Customer_Zip  

### CLEAN Null values

In [11]:
# Count the missing (null/NaN) entries in every column and print one "<column> <count>" line each.
for col, n_missing in X_train.isna().sum().items():
    print (col, n_missing)

idx 0
DC_ZIPCODE 0
Invoice_Year 0
Invoice_Week 0
CATEGORY 0
TIER 0
SPEED_RATING_CODE 0
RIM_DIAMETER_SIZE_CODE 0
WIDTH 0
HEIGHT 0
AVG_UNIT_WEIGHT 0
Invoice_Month 0
SELLING_PRICE 1642
Monthly_Top_1_Customer_Zip 0
Monthly_Top_2_Customer_Zip 0
Monthly_Top_3_Customer_Zip 2
Monthly_Top_4_Customer_Zip 2
Monthly_Top_5_Customer_Zip 2
Monthly_Top_6_Customer_Zip 2
Monthly_Top_7_Customer_Zip 4
Monthly_Top_8_Customer_Zip 4
Monthly_Top_9_Customer_Zip 4
Monthly_Top_10_Customer_Zip 4
Monthly_Top_1_Customer_Total_Sales 4
Monthly_Top_2_Customer_Total_Sales 4
Monthly_Top_3_Customer_Total_Sales 65
Monthly_Top_4_Customer_Total_Sales 65
Monthly_Top_5_Customer_Total_Sales 96
Monthly_Top_6_Customer_Total_Sales 96
Monthly_Top_7_Customer_Total_Sales 101
Monthly_Top_8_Customer_Total_Sales 101
Monthly_Top_9_Customer_Total_Sales 101
Monthly_Top_10_Customer_Total_Sales 101


In [12]:
# Replace NaNs with 0 (in place, because the following cells keep working on X_train).
# fillna is used instead of replace(np.NaN, ...): the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
X_train.fillna(0, inplace=True)

In [13]:
# Show every row that still has at least one null value (expected to be empty after the fill above).
X_train.loc[X_train.isna().any(axis='columns')]

Unnamed: 0,idx,DC_ZIPCODE,Invoice_Year,Invoice_Week,CATEGORY,TIER,SPEED_RATING_CODE,RIM_DIAMETER_SIZE_CODE,WIDTH,HEIGHT,AVG_UNIT_WEIGHT,Invoice_Month,SELLING_PRICE,Monthly_Top_1_Customer_Zip,Monthly_Top_2_Customer_Zip,Monthly_Top_3_Customer_Zip,Monthly_Top_4_Customer_Zip,Monthly_Top_5_Customer_Zip,Monthly_Top_6_Customer_Zip,Monthly_Top_7_Customer_Zip,Monthly_Top_8_Customer_Zip,Monthly_Top_9_Customer_Zip,Monthly_Top_10_Customer_Zip,Monthly_Top_1_Customer_Total_Sales,Monthly_Top_2_Customer_Total_Sales,Monthly_Top_3_Customer_Total_Sales,Monthly_Top_4_Customer_Total_Sales,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales


### K-means clustering 

In [14]:
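# Optional step (currently disabled): drop rows whose numerical columns contain outliers,
# i.e. keep only rows where every |z-score| is below 3.
# NOTE: enabling this needs `from scipy import stats`; `df` and `cols_to_convert` are not defined
# in the cells shown in this notebook and would have to be set up first.
#print ("Dropping outliers...")
#df = df[(np.abs(stats.zscore(df[cols_to_convert])) < 3).all(axis=1)]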

In [15]:

# Number of clusters to split the rows into.
num_labels = 5

# Feature matrix for clustering: every column from 'Monthly_Top_1_Customer_Total_Sales'
# through the last column of X_train, as a plain NumPy array.
mat = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':].values

# K-means starts from random centroids, so without a seed the cluster ids (and the per-cluster
# means computed later) change on every run. random_state pins the result; n_init is set
# explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=num_labels, n_init=10, random_state=42)
km.fit(mat)
labels = km.labels_ # cluster id assigned to each row of `mat`
labels

array([3, 3, 3, ..., 0, 0, 0], dtype=int32)

In [27]:
# Names of the ten "Monthly_Top_<rank>_Customer_Total_Sales" columns, for rank 1 through 10.
sales_cols = list(map('Monthly_Top_{}_Customer_Total_Sales'.format, range(1, 11)))

In [30]:
# Format results as a DataFrame: the sales columns that were clustered plus each row's cluster id.
# .copy() makes this an independent frame, so adding the 'Clusters' column does not write into
# a slice of X_train (which triggers SettingWithCopyWarning and may or may not touch X_train).
sales_n_labels = X_train.loc[:,'Monthly_Top_1_Customer_Total_Sales':].copy()
sales_n_labels['Clusters'] = labels

# TODO (disabled draft): order the clusters by their means so that readable labels can be
# assigned. The names used below (`results`, `col`, `attrs`) are not defined in the cells
# shown in this notebook.
# get the order of means of the clusters so we can assign the correct labels
#cluster_means = results.groupby('Cluster').mean()
#cluster_means.sort_values(col, inplace=True)

# add the labels to the cluster nums
#for i, label in enumerate(attrs[col]): cluster_means.iloc[i] = label

In [38]:
# Average of every sales column within each cluster (one row per cluster id).
sales_n_labels.groupby(by='Clusters').mean()

Unnamed: 0_level_0,Monthly_Top_1_Customer_Total_Sales,Monthly_Top_2_Customer_Total_Sales,Monthly_Top_3_Customer_Total_Sales,Monthly_Top_4_Customer_Total_Sales,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales
Clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,181915.4,139142.070784,121236.398198,104823.50516,95244.410159,87236.453259,79905.665111,73871.171734,69334.286544,64442.346941
1,1269025.0,469184.952363,345028.00881,390181.370734,219326.969312,167754.536006,163544.996245,153746.470791,129756.063915,135901.98624
2,331922.8,252722.999146,199498.453892,177738.516406,158996.079365,143972.316385,135850.176213,132613.037371,118777.106319,110382.985323
3,494692.7,660849.478395,730296.595534,357182.454635,283688.673369,201854.93305,186705.291561,171074.133668,173191.927231,145359.265307
4,679360.7,328257.171004,271686.488316,228222.558309,198475.440667,192131.206417,171125.903269,168092.053037,150279.011289,138462.60996


# Explore y_train Data

In [None]:
# Negative units sold: show the first rows of y_train that contain a negative value.
# The filter is a 1-D, index-aligned row mask ("any column below zero") rather than the 2-D
# `y_train.values < 0` array, whose handling as a DataFrame indexer is not the documented
# row-filter form.
y_train[(y_train < 0).any(axis=1)].head()

### Explore Vehicle Registration Data

In [None]:
# First five rows of the vehicle registration data.
regis_data.head(n=5)