<a href="https://colab.research.google.com/github/zsucicdl/lumen/blob/master/lumen_msmetko.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LUMEN DataSci 2021

## Data loading

### Imports

In [1]:
import pandas as pd
import numpy as np
import scipy

### Dataset

In [None]:
# The raw export is pipe-delimited, uses '"' as the quote character and is
# encoded as UTF-16LE.
dataset = pd.read_csv(
    'data/LUMEN_DS.csv',
    sep='|',
    quotechar='"',
    encoding='UTF-16LE',
)
#negative_gm = dataset[dataset['GM%'] < 0]
dataset.shape

## Data cleaning

In [None]:
# Print the name of every column (feature), one per line
for feature_name in dataset.columns:
    print(feature_name)

In [None]:
# Columns removed from the working frame before any further cleaning
dataset.drop(
    columns=[
        'CustomerID',
        'Item Code',
        'Invoice #',
        'Order #',
        'Invoiced price (TX)',
    ],
    inplace=True,
)

In [None]:
# Invoiced price should be > 0: discard every row where it is not
non_positive_price = dataset['Invoiced price'] <= 0
dataset.drop(index=dataset.loc[non_positive_price].index, inplace=True)

In [None]:
# Fraction of missing values in each column (0.0 = none missing, 1.0 = all missing)
na_share = dataset.isna().mean(axis=0)
na_share

`'Price last modified date in the ERP'` has a lot of missing values **(73.8%)**, since a lot of prices never changed. Therefore, we fill every `NaN` in that column with the oldest recorded change date.

In [None]:
column = 'Price last modified date in the ERP'
last_modified = dataset[column]
# Smallest change date that is actually recorded
oldest = last_modified.dropna().min()
# Every row without a recorded change date receives that oldest date
dataset.loc[last_modified.isna(), column] = oldest
# Sanity check: the column must be complete after the fill
assert dataset[column].notna().all()

Drop all the features (columns) that have more than 9% `NaN`s

**TODO: impute them using a regressor**

In [None]:
# Recompute the NaN share before filtering. The `na_share` computed earlier
# predates the imputation of 'Price last modified date in the ERP', so it still
# reported that column as mostly missing and the freshly imputed column was
# dropped together with the genuinely sparse ones.
na_share = dataset.isna().mean()
dataset.drop(columns=na_share[na_share > 0.09].index, inplace=True)
# NOTE(review): the imputed date column now stays in `dataset` as raw date
# values; exclude or encode it explicitly wherever it must not be one-hot encoded.
na_share = dataset.isna().mean()
na_share

## Drop all rows with NaNs
**TODO: impute them using a regressor**

In [None]:
# Remove every row that still contains at least one missing value
dataset.dropna(axis=0, how='any', inplace=True)
dataset.shape

In [None]:
#column = 'Manufacturing Location Code'
#vc = dataset[column].value_counts()
#vc

In [None]:
#for region in ['North America', 'Asia', 'Europe']:
#    x = dataset[column].loc[dataset['Manufacturing Region'] == region].unique()
#    s = set(x)
#    print(len(s), s)

In [None]:
#import plotly.express as px
#column = 'Manufacturing Location Code'
#df = dataset[[column, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.box(df, x=column, y='GM%')
#fig.show()
#df

In [None]:
from datetime import datetime
import calendar

def transform_order_date(order_date):
    """Encode order dates as cyclical (cos, sin) features.

    Each date is mapped to three angles in [0, 2*pi):
    position within the week (Monday == 0, period 7), position within the
    month (period = number of days in that month) and position within the
    year (period = 365 or 366 for leap years).

    Parameters
    ----------
    order_date : pandas.Series
        ISO-8601 date strings, parsed with ``datetime.fromisoformat``.

    Returns
    -------
    tuple of six numpy.ndarray
        ``(cos(dow), sin(dow), cos(dom), sin(dom), cos(doy), sin(doy))``,
        each with one entry per input row, in input order.
    """
    # Parse exactly as before, then use the vectorised `.dt` accessors instead
    # of a Python-level loop over every row.
    stamps = pd.to_datetime(order_date.apply(datetime.fromisoformat))

    weekday = stamps.dt.dayofweek.to_numpy()        # Monday == 0, like date.weekday()
    month_day = stamps.dt.day.to_numpy() - 1        # zero-based day of month
    year_day = stamps.dt.dayofyear.to_numpy() - 1   # dayofyear starts from 1
    month_length = stamps.dt.days_in_month.to_numpy()
    year_length = np.where(stamps.dt.is_leap_year.to_numpy(), 366, 365)

    dow = 2*np.pi*weekday / 7
    dom = 2*np.pi*month_day / month_length
    doy = 2*np.pi*year_day / year_length
    return np.cos(dow), np.sin(dow), np.cos(dom), np.sin(dom), np.cos(doy), np.sin(doy)

In [None]:
# Replace the raw 'Order Date' column by its six cyclical encodings
date = dataset['Order Date']
dow_x, dow_y, dom_x, dom_y, doy_x, doy_y = transform_order_date(date)
for name, values in (
    ('dow_x', dow_x),
    ('dow_y', dow_y),
    ('dom_x', dom_x),
    ('dom_y', dom_y),
    ('doy_x', doy_x),
    ('doy_y', doy_y),
):
    dataset[name] = values
dataset.drop(columns=['Order Date'], inplace=True)
dataset.shape

In [None]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(df, x=column)
#fig.show()


In [None]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(dataset, x='GM%', log_y=True)
#fig.show()

In [None]:
# List of all the features
#for feature_name in dataset.keys():
#    print(feature_name, dataset[feature_name].dtype)
#df = pd.get_dummies(dataset, 
#                    columns=['Manufacturing Region', 'Intercompany', 'Customer industry', 'Product family'])

In [None]:
# remove extra features for clustering
#dataset.drop(
#    ['Manufacturing Location Code',
#     'Intercompany',
#     'Invoice #',
#     'Order #',
#     'Invoiced price (TX)'],
#    axis=1, inplace=True)
# Columns that must not reach the one-hot encoding / clustering step.
# The list previously named 'Born on date' and 'Make vs Buy' twice.
# 'Order Date' is not listed: it was already replaced by the cyclical features.
clustering_excluded = [
    'Manufacturing Location Code',
    'Born on date',
    'Make vs Buy',
    'Customer First Invoice Date',
    'Invoice Date',
    'Invoice Line #',
    'Order Line #',
    'Invoiced qty (shipped)',
]
# Raw date column: leave it out like the other date columns whenever it is
# still present in `dataset` (it may have been removed by the NaN-share filter).
if 'Price last modified date in the ERP' in dataset.columns:
    clustering_excluded.append('Price last modified date in the ERP')
cluster_df = dataset.drop(columns=clustering_excluded)

In [None]:
# One-hot encode the remaining categorical columns (pandas picks them by dtype)
cluster_df = pd.get_dummies(data=cluster_df)
cluster_df

In [None]:
# Rows whose GM% is above 1 or below -1 are excluded from clustering
noto = (cluster_df['GM%'] > 1) | (cluster_df['GM%'] < -1)

# Features that are log-transformed and then standardised; log needs values > 0
log_features = ['Ordered qty', 'Invoiced price', 'Cost of part', '# of unique products on a quote']
non_positive = (cluster_df[log_features] <= 0).any(axis=1)

# Filter all rows *before* computing any statistics. Dropping rows feature by
# feature inside the loop changed the row set after earlier features had
# already been standardised, so those features no longer had zero mean and
# unit variance on the final frame.
# `.copy()` yields an independent frame, so the column assignments below do not
# write into a slice of the previous `cluster_df`.
cluster_df = cluster_df[~(noto | non_positive)].copy()

for feature_name in log_features:
    feature = np.log(cluster_df[feature_name])
    feature = (feature - feature.mean()) / feature.std()
    # Replace the column outright instead of writing floats in place via `.loc`
    cluster_df[feature_name] = feature
cluster_df

In [None]:
# Share of rows of `dataset` that are no longer present in `cluster_df`
1 - len(cluster_df) / len(dataset)

In [None]:
#import plotly.express as px
#column = 'Invoiced price'

#fig = px.histogram(cluster_df['# of unique products on a quote'])
#fig.show()

In [None]:
#from sklearn.cluster import OPTICS

#cluster = OPTICS()
#cluster.fit(cluster_df)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the clustering features with n_components given as the fraction 0.85
pca = PCA(n_components=0.85)
pca.fit(cluster_df.to_numpy())

In [None]:
#fig = px.line(np.cumsum(pca.explained_variance_ratio_), labels={'index': '# of components', 'value': 'Variance explained'}, title='PCA dimensionality reduction')
#fig.show()

In [None]:
# Project the clustering features onto the fitted principal components
dim_red = pca.transform(cluster_df.to_numpy())
dim_red.shape

In [None]:
#from sklearn.cluster import KMeans

#kmeans = KMeans(n_clusters=5)
#kmeans.fit(dim_red)

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
import pickle
import tempfile

# Elbow sweep in windows of 20 consecutive cluster counts
# (20-39, 40-59, ..., 180-199); every window is fitted three times and each
# run's scores are pickled to their own file.
for k in range(20, 180+1, 20):
    rendz = range(k, k+20)
    for _ in range(3):
        kmeans = KMeans()
        visualizer = KElbowVisualizer(kmeans, k=rendz)
        visualizer.fit(dim_red)
        # delete=False keeps the uniquely named file on disk after it is closed.
        # NOTE(review): the output directory is an absolute, user-specific path.
        with tempfile.NamedTemporaryFile(
            prefix=f'cluster-{k}-{k+20}',
            suffix='.pkl',
            delete=False,
            dir='/home/jupyter-msmetko',
        ) as file:
            pickle.dump({'range': rendz, 'values': visualizer.k_scores_}, file)
#visualizer.show()

In [None]:
#from sklearn.cluster import DBSCAN

#cluster = DBSCAN(eps=0.2, metric='l1', leaf_size=200, n_jobs=-1)
#indices = np.random.choice(dim_red.shape[0], size=100_000, replace=False)
#cluster.fit(dim_red)

In [None]:
#with tempfile.NamedTemporaryFile(prefix='cluster', suffix='.pkl', delete=False, dir='.') as file:
#    pickle.dump({'range': rendz, 'values': visualizer.k_scores_}, file)