In [1]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from datetime import datetime
import calendar

def transform_order_date(order_date):
    #order_date = order_date_column.values
    order_date = order_date.apply(datetime.fromisoformat)
    day_of_week = []
    day_of_month = []
    day_of_year = []
    for date in order_date:
        dow = date.weekday()
        date_day = date.day-1
        doy = date.timetuple().tm_yday - 1 # starts from 1
        month_length = calendar.monthrange(date.year, date.month)[1]
        year_length = 365 if not calendar.isleap(date.year) else 366
        day_of_week.append(2*np.pi*dow / 7)
        day_of_month.append(2*np.pi*date_day / month_length)
        day_of_year.append(2*np.pi*doy / year_length)
    dow = np.array(day_of_week)
    dom = np.array(day_of_month)
    doy = np.array(day_of_year)
    return np.cos(dow), np.sin(dow), np.cos(dom), np.sin(dom), np.cos(doy), np.sin(doy)


dataset = pd.read_csv('data/LUMEN_DS.csv', sep='|', quotechar='"', encoding='UTF-16LE')

uninformative_columns = [
    'CustomerID',
     'Item Code',
     'Invoice #',
     'Order #',
     'Invoiced price (TX)'
]

NA_THRESHOLD = 0.1
na_share = dataset.isna().mean()
na_columns = na_share[na_share > NA_THRESHOLD].index.tolist()

#all_columns = make_column_selector('.*')


mapper = DataFrameMapper([], default=None, drop_cols=uninformative_columns+na_columns, df_out=True)
#negative_gm = dataset[dataset['GM%'] < 0]
#dataset.shape

def time_features(date):
#     print(type(date))
    dow_x, dow_y, dom_x, dom_y, doy_x, doy_y = transform_order_date(date['Order Date'])
    date['dow_x'] = dow_x
    date['dow_y'] = dow_y
    date['dom_x'] = dom_x
    date['dom_y'] = dom_y
    date['doy_x'] = doy_x
    date['doy_y'] = doy_y
    return date #date.drop(['Order Date'], axis=1)


#xxx = FunctionTransformer(transform_order_date)
date_mapper = DataFrameMapper([(['Order Date'], FunctionTransformer(time_features))], default=None, input_df=True, df_out=True)
#dataset = date_mapper.fit_transform(dataset)

cluster_drop = DataFrameMapper([], default=None, drop_cols=[
    'Manufacturing Location Code',
    'Born on date',
    'Make vs Buy',
    'Customer First Invoice Date',
    'Born on date',
    'Make vs Buy',
    'Invoice Date',
    'Invoice Line #',
    'Price last modified date in the ERP',
    'Order Line #',
    'Invoiced qty (shipped)',
    'GM%'], input_df=True, df_out=True)

onehot_cols = [
 "Manufacturing Region",
 "Intercompany",
 "Customer industry",
 "Customer Region",
 "Top Customer Group",
 "Product family"
]

#dummies = DataFrameMapper([
#    (onehot_cols, ColumnTransformer([('1h', OneHotEncoder(sparse=False), onehot_cols)], remainder='passthrough'))
#], default=None, input_df=True, df_out=True)

dummies = DataFrameMapper([
    (onehot_cols, OneHotEncoder(sparse=False)),
], default=None, input_df=True, df_out=True)


class Dropper():
    def fit(self, *args):
        return self

    def transform(self, x):
        return x.dropna()
    
log_norm_cols = ['Ordered qty', 'Invoiced price', 'Cost of part', '# of unique products on a quote']
lognorm = DataFrameMapper([
    (log_norm_cols, [FunctionTransformer(lambda x: np.log(x.astype(np.float64) + 1_312_229)),
                     StandardScaler()])
], default=None, input_df=True, df_out=True)


lumen_pipeline = Pipeline([
    ('mapper', mapper),
    ('dropper', Dropper()),
    ('date_mapper', date_mapper),
    ('date_dropper', DataFrameMapper([], default=None, drop_cols=['Order Date_0'], input_df=True, df_out=True)),
    ('cluster_drop', cluster_drop),
    ('dummies', dummies),
    ('last', 'passthrough'),
])
lumen_pipeline.fit_transform(dataset)

cluster_pipeline = Pipeline([
    ('features', lumen_pipeline),
    ('lognorm', lognorm),
    ('dim_red', PCA(n_components=0.85)),
    
])
knee_data = cluster_pipeline.fit_transform(dataset)
knee_data


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

array([[ 0.13659871, -0.10433785,  0.13323821, ..., -0.56245843,
        -1.17467931, -0.21585016],
       [-0.4032161 ,  0.03569038, -0.23965981, ...,  1.36969647,
        -0.05103802, -0.86262225],
       [ 0.60685349, -0.06376138,  0.05125583, ..., -1.02641666,
         0.32685069, -0.97164953],
       ...,
       [ 0.11153498, -0.03958725,  0.05300394, ..., -1.20056495,
        -0.43568232, -0.88420532],
       [-0.26469834,  0.0820903 , -0.1569311 , ..., -1.31055807,
        -0.46848526, -1.01489902],
       [-0.01438592, -0.04728763,  0.14135307, ..., -0.1792898 ,
         0.41979287, -0.33618844]])

In [None]:
import pickle

rendz = range(40, 100+1)
inertia_dict = dict()
for k in rendz:
    model = KMeans(n_clusters=k)
    model.fit(knee_data)
    inertia_dict[k] = model.inertia_

with open(f'cluster-{rendz.start}-{rendz.stop}-{rendz.step}.pkl', 'wb') as file:
    pickle.dump(inertia_dict, file)