# LUMEN DataSci 2021

## Data loading

### Imports

In [4]:
import pickle
import matplotlib.pyplot as plt
import os
import time
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import pickle
from sklearn import metrics
import scipy

### Dataset

In [5]:
# Read the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling,
# so this file must come from a trusted source.
data_path = "LUMEN0.pkl"
with open(data_path, mode='rb') as f:
    dataset = pickle.load(file=f)

## Data cleaning

In [6]:
# Print the name of every column in the dataset, one per line
for feature_name in dataset.columns:
    print(feature_name)

Manufacturing Region
Manufacturing Location Code
Intercompany
CustomerID
Customer industry
Customer Region
Customer First Invoice Date
Top Customer Group
Item Code
Product family
Product group
Price last modified date in the ERP
Born on date
Make vs Buy
Sales Channel - Internal
Sales Channel - External
Sales Channel - Grouping
Invoice Date
Invoice #
Invoice Line #
Order Date
Order #
Order Line #
Invoiced qty (shipped)
Ordered qty
Invoiced price
Invoiced price (TX)
Cost of part
Material cost of part
Labor cost of part
Overhead cost of part
GM%
# of unique products on a quote


In [7]:
# Remove these five columns from the dataset (in place)
dataset.drop(
    columns=[
        'CustomerID',
        'Item Code',
        'Invoice #',
        'Order #',
        'Invoiced price (TX)',
    ],
    inplace=True,
)

In [8]:
# Invoiced price should be > 0: remove every row where it is zero or negative.
# Rows with a missing price do not match the comparison and are kept here.
non_positive_price = dataset['Invoiced price'] <= 0
dataset.drop(index=dataset.loc[non_positive_price].index, inplace=True)

In [9]:
# Fraction of missing values in each column (mean of the boolean NaN mask)
na_share = dataset.isnull().mean(axis=0)
na_share

Manufacturing Region                   0.022154
Manufacturing Location Code            0.021698
Intercompany                           0.000000
Customer industry                      0.000000
Customer Region                        0.000073
Customer First Invoice Date            0.000000
Top Customer Group                     0.000000
Product family                         0.000000
Product group                          0.098992
Price last modified date in the ERP    0.720977
Born on date                           0.040212
Make vs Buy                            0.013767
Sales Channel - Internal               0.120557
Sales Channel - External               0.120574
Sales Channel - Grouping               0.999983
Invoice Date                           0.000000
Invoice Line #                         0.000000
Order Date                             0.000000
Order Line #                           0.037587
Invoiced qty (shipped)                 0.000000
Ordered qty                            0

`'Price last modified date in the ERP'` has a lot of missing values **(72.1%)**, since many prices were never changed. Therefore, we fill all of its `NaN`s with the oldest change date found in the column.

In [10]:
# Replace every missing price-change date with the earliest date present in the column
column = 'Price last modified date in the ERP'
last_modified = dataset[column]
oldest = last_modified.dropna().min()
dataset.loc[last_modified.isna(), column] = oldest
# Sanity check: no gaps may remain in this column
assert dataset[column].notna().all()

Drop all the features (columns) which have more than 9% of `NaN`s

**TODO: impute them using regressor**

In [11]:
# Drop every column whose NaN share exceeds 9%, then recompute the shares.
# NOTE(review): `na_share` was computed before 'Price last modified date in the ERP'
# was filled in, so that column is still above the threshold here and gets dropped
# despite the imputation -- confirm this is intended.
sparse_columns = na_share[na_share > 0.09].index
dataset.drop(columns=sparse_columns, inplace=True)
na_share = dataset.isna().mean()
na_share

Manufacturing Region               0.022154
Manufacturing Location Code        0.021698
Intercompany                       0.000000
Customer industry                  0.000000
Customer Region                    0.000073
Customer First Invoice Date        0.000000
Top Customer Group                 0.000000
Product family                     0.000000
Born on date                       0.040212
Make vs Buy                        0.013767
Invoice Date                       0.000000
Invoice Line #                     0.000000
Order Date                         0.000000
Order Line #                       0.037587
Invoiced qty (shipped)             0.000000
Ordered qty                        0.037587
Invoiced price                     0.007178
Cost of part                       0.017413
GM%                                0.024591
# of unique products on a quote    0.037604
dtype: float64

## Drop all rows with NaN's
**TODO: impute them using a regressor**

In [12]:
# Remove every row that still contains at least one NaN, then show the resulting shape
dataset.dropna(axis=0, how='any', inplace=True)
dataset.shape

(1101992, 20)

In [13]:
#column = 'Manufacturing Location Code'
#vc = dataset[column].value_counts()
#vc

In [14]:
#for region in ['North America', 'Asia', 'Europe']:
#    x = dataset[column].loc[dataset['Manufacturing Region'] == region].unique()
#    s = set(x)
#    print(len(s), s)

In [15]:
#import plotly.express as px
#column = 'Manufacturing Location Code'
#df = dataset[[column, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.box(df, x=column, y='GM%')
#fig.show()
#df

In [16]:
from datetime import datetime
import calendar

def transform_order_date(order_date):
    """Encode dates as cyclical (cosine, sine) features.

    Every date is mapped to three angles in [0, 2*pi): its position within
    the week (Monday = 0), within its month, and within its year. Month and
    year lengths are taken from the actual calendar, so leap years and
    28/29/30/31-day months are handled. Each angle is returned as a
    cosine/sine pair so that the end of a period lies next to its start.

    Parameters
    ----------
    order_date : pandas.Series
        ISO-formatted date strings (parsed with ``datetime.fromisoformat``)
        or values that are already datetime-like; the latter are passed
        through unchanged instead of raising.

    Returns
    -------
    tuple of six numpy.ndarray
        ``(cos(dow), sin(dow), cos(dom), sin(dom), cos(doy), sin(doy))``,
        each with one entry per element of ``order_date``.
    """
    def _parse(value):
        # Strings keep the exact ISO parsing of the original implementation.
        return datetime.fromisoformat(value) if isinstance(value, str) else value

    dates = pd.to_datetime(order_date.map(_parse)).dt

    full_turn = 2 * np.pi
    year_length = np.where(dates.is_leap_year, 366, 365)

    # All positions are zero-based so that the first day maps to angle 0.
    dow = (full_turn * dates.dayofweek / 7).to_numpy()
    dom = (full_turn * (dates.day - 1) / dates.days_in_month).to_numpy()
    doy = (full_turn * (dates.dayofyear - 1) / year_length).to_numpy()

    return np.cos(dow), np.sin(dow), np.cos(dom), np.sin(dom), np.cos(doy), np.sin(doy)

In [17]:
#date = dataset['Order Date']
#print(date)
#print(date)
#dow_x, dow_y, dom_x, dom_y, doy_x, doy_y = transform_order_date(date)
#dataset['dow_x'] = dow_x
#dataset['dow_y'] = dow_y
#dataset['dom_x'] = dom_x
#dataset['dom_y'] = dom_y
#dataset['doy_x'] = doy_x
#dataset['doy_y'] = doy_y
#dataset.drop(['Order Date'], axis=1, inplace=True)
#dataset.shape

In [18]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(df, x=column)
#fig.show()


In [19]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(dataset, x='GM%', log_y=True)
#fig.show()

In [20]:
# List of all the features
# for feature_name in dataset.keys():
#     print(feature_name, dataset[feature_name].dtype)
# df = pd.get_dummies(dataset, 
#                     columns=['Manufacturing Region', 'Intercompany', 'Customer industry', 'Product family'])

In [21]:
# NOTE: `df` is bound to the very same object as `dataset` (no copy is made),
# so any in-place change made through one name is visible through the other.
df=dataset
df.columns

Index(['Manufacturing Region', 'Manufacturing Location Code', 'Intercompany',
       'Customer industry', 'Customer Region', 'Customer First Invoice Date',
       'Top Customer Group', 'Product family', 'Born on date', 'Make vs Buy',
       'Invoice Date', 'Invoice Line #', 'Order Date', 'Order Line #',
       'Invoiced qty (shipped)', 'Ordered qty', 'Invoiced price',
       'Cost of part', 'GM%', '# of unique products on a quote'],
      dtype='object')

In [22]:
# Columns removed in place below. `df` was assigned from `dataset` without
# copying, so these columns disappear from `dataset` as well.
a = [
    'Manufacturing Region', 'Intercompany', 'Customer industry', 'Customer Region',
    'Top Customer Group', 'Product family', 'Make vs Buy',
]
df.drop(columns=a, inplace=True)

In [23]:
# remove extra features for clustering
#dataset.drop(
#    ['Manufacturing Location Code',
#     'Intercompany',
#     'Invoice #',
#     'Order #',
#     'Invoiced price (TX)'],
#    axis=1, inplace=True)

# Build the clustering frame: a new DataFrame derived from `dataset` without
# the columns listed below (`dataset` itself is left unchanged).
# 'Born on date' was listed twice in the original list; the redundant entry is removed.
cluster_df = dataset.drop(
    columns=[
        'Manufacturing Location Code',
        'Born on date',
        'Customer First Invoice Date',
        'Invoice Date',
        'Invoice Line #',
        'Order Date',
        'Order Line #',
        'Invoiced qty (shipped)',
    ],
)

In [24]:
# cluster_df = pd.get_dummies(cluster_df)
# cluster_df

In [25]:
log_features = ['Ordered qty', 'Invoiced price', 'Cost of part']

# Rows whose GM% lies outside [-1, 1] are excluded from clustering.
noto = (cluster_df['GM%'] > 1) | (cluster_df['GM%'] < -1)
# The log transform needs strictly positive input, so rows with a value <= 0
# in any of the log-transformed columns are excluded as well.
non_positive = (cluster_df[log_features] <= 0).any(axis=1)

# Filter everything up front and take an explicit copy:
#  * the writes below then target an independent frame instead of a slice
#    (no SettingWithCopyWarning / ambiguous view semantics);
#  * mean and std are computed on the final row set, so every transformed
#    column ends up exactly zero-mean / unit-std. Previously a column was
#    standardized before rows were removed for the columns after it.
cluster_df = cluster_df[~(noto | non_positive)].copy()

for feature_name in log_features:
    # Log-transform, then standardize
    feature = np.log(cluster_df[feature_name])
    feature = (feature - feature.mean()) / feature.std()
    cluster_df[feature_name] = feature
cluster_df

Unnamed: 0,Ordered qty,Invoiced price,Cost of part,GM%,# of unique products on a quote
0,0.026648,-0.390952,-0.193411,-0.070000,1.0
1,0.248449,-0.733552,-1.137708,0.816429,3.0
11,-0.984192,1.768995,1.718663,0.526436,1.0
12,-0.984192,1.765536,1.718663,0.521512,1.0
16,0.388283,0.248308,0.253083,0.409060,1.0
...,...,...,...,...,...
1294956,-0.849796,1.125236,1.157479,0.380507,5.0
1294958,-1.091550,0.574651,0.005208,0.892857,1.0
1294959,0.983135,-0.964391,-0.947931,0.358974,1.0
1294960,0.598983,0.151852,0.145153,0.426606,1.0


In [26]:
# Fraction of the dataset's rows that did not make it into cluster_df
1 - cluster_df.shape[0] / dataset.shape[0]

0.15541038410442176

In [27]:
#import plotly.express as px
#column = 'Invoiced price'

#fig = px.histogram(df, x=column, log_y=True)
#fig.show()

In [None]:
from sklearn.cluster import OPTICS

# Fit OPTICS on the clustering frame; fit() returns the estimator itself,
# so `cluster` is the fitted model and is displayed as the cell output.
cluster = OPTICS(max_eps=1, n_jobs=-1).fit(cluster_df)
cluster

In [None]:
# Cluster label assigned to each row of cluster_df.
# The fitted estimator is named `cluster`; `clustering` was never defined
# and raised a NameError.
cluster.labels_